summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig132
-rw-r--r--arch/x86/Kconfig.cpu24
-rw-r--r--arch/x86/Kconfig.debug54
-rw-r--r--arch/x86/Makefile12
-rw-r--r--arch/x86/boot/Makefile5
-rw-r--r--arch/x86/boot/boot.h1
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/eboot.c28
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/boot/video-mode.c2
-rw-r--r--arch/x86/boot/video.c2
-rw-r--r--arch/x86/crypto/Makefile8
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c3
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c3
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c3
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c3
-rw-r--r--arch/x86/crypto/chacha20-ssse3-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha20_glue.c4
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c26
-rw-r--r--arch/x86/crypto/poly1305_glue.c2
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c3
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c3
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S302
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c316
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S353
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c331
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c251
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c2
-rw-r--r--arch/x86/entry/calling.h15
-rw-r--r--arch/x86/entry/common.c264
-rw-r--r--arch/x86/entry/entry_32.S193
-rw-r--r--arch/x86/entry/entry_64.S36
-rw-r--r--arch/x86/entry/entry_64_compat.S563
-rw-r--r--arch/x86/entry/syscall_32.c9
-rw-r--r--arch/x86/entry/syscall_64.c4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl14
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/Makefile40
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c151
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S3
-rw-r--r--arch/x86/entry/vdso/vdso2c.c5
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c28
-rw-r--r--arch/x86/entry/vdso/vdso32/int80.S56
-rw-r--r--arch/x86/entry/vdso/vdso32/syscall.S75
-rw-r--r--arch/x86/entry/vdso/vdso32/sysenter.S116
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S89
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c2
-rw-r--r--arch/x86/entry/vdso/vma.c27
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c9
-rw-r--r--arch/x86/ia32/ia32_signal.c12
-rw-r--r--arch/x86/include/asm/acpi.h23
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h116
-rw-r--r--arch/x86/include/asm/atomic.h5
-rw-r--r--arch/x86/include/asm/atomic64_32.h1
-rw-r--r--arch/x86/include/asm/atomic64_64.h4
-rw-r--r--arch/x86/include/asm/barrier.h36
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/cpu.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h119
-rw-r--r--arch/x86/include/asm/device.h10
-rw-r--r--arch/x86/include/asm/dma-mapping.h2
-rw-r--r--arch/x86/include/asm/dwarf2.h84
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/elf.h10
-rw-r--r--arch/x86/include/asm/fixmap.h5
-rw-r--r--arch/x86/include/asm/fpu/internal.h174
-rw-r--r--arch/x86/include/asm/fpu/signal.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h148
-rw-r--r--arch/x86/include/asm/fpu/xstate.h16
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/hpet.h6
-rw-r--r--arch/x86/include/asm/hw_irq.h5
-rw-r--r--arch/x86/include/asm/i8259.h1
-rw-r--r--arch/x86/include/asm/ia32.h4
-rw-r--r--arch/x86/include/asm/intel_pt.h10
-rw-r--r--arch/x86/include/asm/intel_punit_ipc.h101
-rw-r--r--arch/x86/include/asm/intel_telemetry.h147
-rw-r--r--arch/x86/include/asm/iosf_mbi.h51
-rw-r--r--arch/x86/include/asm/ipi.h2
-rw-r--r--arch/x86/include/asm/irq.h5
-rw-r--r--arch/x86/include/asm/irq_remapping.h10
-rw-r--r--arch/x86/include/asm/jump_label.h63
-rw-r--r--arch/x86/include/asm/kdebug.h6
-rw-r--r--arch/x86/include/asm/kvm_emulate.h10
-rw-r--r--arch/x86/include/asm/kvm_host.h132
-rw-r--r--arch/x86/include/asm/lguest.h4
-rw-r--r--arch/x86/include/asm/mce.h34
-rw-r--r--arch/x86/include/asm/microcode.h65
-rw-r--r--arch/x86/include/asm/microcode_amd.h3
-rw-r--r--arch/x86/include/asm/microcode_intel.h10
-rw-r--r--arch/x86/include/asm/mmu_context.h34
-rw-r--r--arch/x86/include/asm/msi.h6
-rw-r--r--arch/x86/include/asm/msr-index.h11
-rw-r--r--arch/x86/include/asm/msr-trace.h57
-rw-r--r--arch/x86/include/asm/msr.h43
-rw-r--r--arch/x86/include/asm/numachip/numachip.h1
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h153
-rw-r--r--arch/x86/include/asm/page_64_types.h3
-rw-r--r--arch/x86/include/asm/page_types.h19
-rw-r--r--arch/x86/include/asm/paravirt.h44
-rw-r--r--arch/x86/include/asm/paravirt_types.h40
-rw-r--r--arch/x86/include/asm/pci_x86.h10
-rw-r--r--arch/x86/include/asm/pgtable.h90
-rw-r--r--arch/x86/include/asm/pgtable_types.h49
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h1
-rw-r--r--arch/x86/include/asm/pmem.h18
-rw-r--r--arch/x86/include/asm/preempt.h5
-rw-r--r--arch/x86/include/asm/processor.h9
-rw-r--r--arch/x86/include/asm/pvclock.h14
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h59
-rw-r--r--arch/x86/include/asm/reboot.h1
-rw-r--r--arch/x86/include/asm/sigcontext.h75
-rw-r--r--arch/x86/include/asm/sigframe.h8
-rw-r--r--arch/x86/include/asm/signal.h2
-rw-r--r--arch/x86/include/asm/smp.h12
-rw-r--r--arch/x86/include/asm/suspend_32.h1
-rw-r--r--arch/x86/include/asm/suspend_64.h1
-rw-r--r--arch/x86/include/asm/switch_to.h12
-rw-r--r--arch/x86/include/asm/syscall.h14
-rw-r--r--arch/x86/include/asm/thread_info.h3
-rw-r--r--arch/x86/include/asm/trace/mpx.h7
-rw-r--r--arch/x86/include/asm/uaccess.h101
-rw-r--r--arch/x86/include/asm/uaccess_64.h94
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/vdso.h11
-rw-r--r--arch/x86/include/asm/vmx.h6
-rw-r--r--arch/x86/include/asm/x86_init.h3
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h5
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/include/asm/xor_32.h2
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h110
-rw-r--r--arch/x86/include/uapi/asm/mce.h4
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h475
-rw-r--r--arch/x86/include/uapi/asm/sigcontext32.h73
-rw-r--r--arch/x86/include/uapi/asm/svm.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h4
-rw-r--r--arch/x86/kernel/acpi/boot.c22
-rw-r--r--arch/x86/kernel/apic/apic.c53
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c19
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c217
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c10
-rw-r--r--arch/x86/kernel/apic/io_apic.c8
-rw-r--r--arch/x86/kernel/apic/ipi.c18
-rw-r--r--arch/x86/kernel/apic/msi.c8
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c229
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c6
-rw-r--r--arch/x86/kernel/asm-offsets.c24
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c24
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c67
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c127
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c8
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile3
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c509
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c440
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c233
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c170
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c787
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c808
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c77
-rw-r--r--arch/x86/kernel/cpu/perf_event.h28
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c124
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c13
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cstate.c694
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c79
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c52
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c16
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c31
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c81
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h16
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c38
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c669
-rw-r--r--arch/x86/kernel/cpu/perf_event_msr.c7
-rw-r--r--arch/x86/kernel/cpu/rdrand.c25
-rw-r--r--arch/x86/kernel/cpu/scattered.c20
-rw-r--r--arch/x86/kernel/cpu/transmeta.c4
-rw-r--r--arch/x86/kernel/cpuid.c24
-rw-r--r--arch/x86/kernel/crash.c14
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c3
-rw-r--r--arch/x86/kernel/early_printk.c6
-rw-r--r--arch/x86/kernel/fpu/init.c192
-rw-r--r--arch/x86/kernel/fpu/regset.c4
-rw-r--r--arch/x86/kernel/fpu/signal.c21
-rw-r--r--arch/x86/kernel/fpu/xstate.c407
-rw-r--r--arch/x86/kernel/ftrace.c25
-rw-r--r--arch/x86/kernel/head64.c8
-rw-r--r--arch/x86/kernel/head_32.S5
-rw-r--r--arch/x86/kernel/head_64.S8
-rw-r--r--arch/x86/kernel/hpet.c29
-rw-r--r--arch/x86/kernel/hw_breakpoint.c6
-rw-r--r--arch/x86/kernel/i8259.c29
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/irq_work.c2
-rw-r--r--arch/x86/kernel/kgdb.c17
-rw-r--r--arch/x86/kernel/kvmclock.c57
-rw-r--r--arch/x86/kernel/livepatch.c28
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mcount_64.S6
-rw-r--r--arch/x86/kernel/msr.c24
-rw-r--r--arch/x86/kernel/nmi.c32
-rw-r--r--arch/x86/kernel/paravirt.c36
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c3
-rw-r--r--arch/x86/kernel/pci-calgary_64.c4
-rw-r--r--arch/x86/kernel/pci-dma.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/pmem.c12
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c12
-rw-r--r--arch/x86/kernel/ptrace.c15
-rw-r--r--arch/x86/kernel/pvclock.c24
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c38
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c108
-rw-r--r--arch/x86/kernel/signal.c27
-rw-r--r--arch/x86/kernel/smp.c4
-rw-r--r--arch/x86/kernel/smpboot.c18
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/tsc.c37
-rw-r--r--arch/x86/kernel/verify_cpu.S62
-rw-r--r--arch/x86/kernel/vm86_32.c10
-rw-r--r--arch/x86/kernel/x86_init.c1
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/assigned-dev.c62
-rw-r--r--arch/x86/kvm/cpuid.c6
-rw-r--r--arch/x86/kvm/cpuid.h79
-rw-r--r--arch/x86/kvm/emulate.c35
-rw-r--r--arch/x86/kvm/hyperv.c739
-rw-r--r--arch/x86/kvm/hyperv.h55
-rw-r--r--arch/x86/kvm/i8254.c5
-rw-r--r--arch/x86/kvm/ioapic.c31
-rw-r--r--arch/x86/kvm/ioapic.h18
-rw-r--r--arch/x86/kvm/iommu.c11
-rw-r--r--arch/x86/kvm/irq.c40
-rw-r--r--arch/x86/kvm/irq.h27
-rw-r--r--arch/x86/kvm/irq_comm.c164
-rw-r--r--arch/x86/kvm/lapic.c165
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c541
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/mmu_audit.c17
-rw-r--r--arch/x86/kvm/mtrr.c25
-rw-r--r--arch/x86/kvm/paging_tmpl.h48
-rw-r--r--arch/x86/kvm/svm.c231
-rw-r--r--arch/x86/kvm/trace.h316
-rw-r--r--arch/x86/kvm/vmx.c983
-rw-r--r--arch/x86/kvm/x86.c574
-rw-r--r--arch/x86/kvm/x86.h6
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/cpu.c35
-rw-r--r--arch/x86/lib/msr.c26
-rw-r--r--arch/x86/lib/x86-opcode-map.txt24
-rw-r--r--arch/x86/math-emu/fpu_aux.c70
-rw-r--r--arch/x86/math-emu/fpu_emu.h2
-rw-r--r--arch/x86/math-emu/fpu_entry.c96
-rw-r--r--arch/x86/math-emu/fpu_proto.h12
-rw-r--r--arch/x86/math-emu/load_store.c63
-rw-r--r--arch/x86/math-emu/reg_compare.c128
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/debug_pagetables.c46
-rw-r--r--arch/x86/mm/dump_pagetables.c110
-rw-r--r--arch/x86/mm/gup.c90
-rw-r--r--arch/x86/mm/highmem_32.c14
-rw-r--r--arch/x86/mm/hugetlbpage.c4
-rw-r--r--arch/x86/mm/init.c6
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c42
-rw-r--r--arch/x86/mm/ioremap.c4
-rw-r--r--arch/x86/mm/kasan_init_64.c2
-rw-r--r--arch/x86/mm/mmap.c12
-rw-r--r--arch/x86/mm/mpx.c68
-rw-r--r--arch/x86/mm/numa.c2
-rw-r--r--arch/x86/mm/pageattr.c105
-rw-r--r--arch/x86/mm/pat.c17
-rw-r--r--arch/x86/mm/pat_rbtree.c52
-rw-r--r--arch/x86/mm/pgtable.c20
-rw-r--r--arch/x86/mm/setup_nx.c4
-rw-r--r--arch/x86/mm/srat.c2
-rw-r--r--arch/x86/mm/tlb.c29
-rw-r--r--arch/x86/net/bpf_jit_comp.c42
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/acpi.c296
-rw-r--r--arch/x86/pci/bus_numa.c13
-rw-r--r--arch/x86/pci/common.c46
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/pcbios.c108
-rw-r--r--arch/x86/pci/vmd.c723
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c7
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c9
-rw-r--r--arch/x86/platform/efi/efi.c28
-rw-r--r--arch/x86/platform/efi/quirks.c17
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c10
-rw-r--r--arch/x86/platform/intel-mid/sfi.c3
-rw-r--r--arch/x86/platform/intel-quark/imr.c46
-rw-r--r--arch/x86/platform/uv/uv_nmi.c54
-rw-r--r--arch/x86/power/cpu.c92
-rw-r--r--arch/x86/ras/Kconfig4
-rw-r--r--arch/x86/ras/mce_amd_inj.c103
-rw-r--r--arch/x86/realmode/rm/Makefile1
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/barrier.h9
-rw-r--r--arch/x86/um/asm/syscall.h5
-rw-r--r--arch/x86/um/ptrace_32.c8
-rw-r--r--arch/x86/um/signal.c18
-rw-r--r--arch/x86/um/stub_32.S1
-rw-r--r--arch/x86/um/stub_64.S18
-rw-r--r--arch/x86/um/sys_call_table_32.c7
-rw-r--r--arch/x86/um/sys_call_table_64.c7
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c41
-rw-r--r--arch/x86/xen/grant-table.c2
-rw-r--r--arch/x86/xen/mmu.c11
-rw-r--r--arch/x86/xen/p2m.c19
-rw-r--r--arch/x86/xen/setup.c22
-rw-r--r--arch/x86/xen/suspend.c24
-rw-r--r--arch/x86/xen/time.c115
-rw-r--r--arch/x86/xen/xen-asm_32.S14
-rw-r--r--arch/x86/xen/xen-asm_64.S19
-rw-r--r--arch/x86/xen/xen-ops.h3
345 files changed, 14433 insertions, 6797 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 96d058a87100..ab2ed5328f0a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,12 +24,14 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
+ select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -82,6 +84,8 @@ config X86
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_ARCH_KGDB
select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
@@ -96,7 +100,6 @@ config X86
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_API_DEBUG
- select HAVE_DMA_ATTRS
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
@@ -177,12 +180,23 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config SBUS
bool
@@ -349,6 +363,17 @@ config X86_FEATURE_NAMES
If in doubt, say Y.
+config X86_FAST_FEATURE_TESTS
+ bool "Fast CPU feature tests" if EMBEDDED
+ default y
+ ---help---
+ Some fast-paths in the kernel depend on the capabilities of the CPU.
+ Say Y here for the kernel to patch in the appropriate code at runtime
+ based on the capabilities of the CPU. The infrastructure for patching
+ code at runtime takes up some additional space; space-constrained
+ embedded systems may wish to say N here to produce smaller, slightly
+ slower code.
+
config X86_X2APIC
bool "Support x2apic"
depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
@@ -450,6 +475,7 @@ config X86_UV
depends on X86_64
depends on X86_EXTENDED_PLATFORM
depends on NUMA
+ depends on EFI
depends on X86_X2APIC
depends on PCI
---help---
@@ -484,11 +510,10 @@ config X86_INTEL_CE
config X86_INTEL_MID
bool "Intel MID platform support"
- depends on X86_32
depends on X86_EXTENDED_PLATFORM
depends on X86_PLATFORM_DEVICES
depends on PCI
- depends on PCI_GOANY
+ depends on X86_64 || (PCI_GOANY && X86_32)
depends on X86_IO_APIC
select SFI
select I2C
@@ -523,9 +548,10 @@ config X86_INTEL_QUARK
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
- depends on ACPI
+ depends on X86 && ACPI
select COMMON_CLK
select PINCTRL
+ select IOSF_MBI
---help---
Select to build support for Intel Low Power Subsystem such as
found on Intel Lynxpoint PCH. Selecting this option enables
@@ -687,6 +713,14 @@ config PARAVIRT_SPINLOCKS
If you are unsure how to answer this question, answer Y.
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued spinlock statistics"
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ ---help---
+ Enable the collection of statistical data on the slowpath
+ behavior of paravirtualized queued spinlocks and report
+ them on debugfs.
+
source "arch/x86/xen/Kconfig"
config KVM_GUEST
@@ -1123,8 +1157,10 @@ config X86_REBOOTFIXUPS
Say N otherwise.
config MICROCODE
- tristate "CPU microcode loading support"
+ bool "CPU microcode loading support"
+ default y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
+ depends on BLK_DEV_INITRD
select FW_LOADER
---help---
@@ -1166,24 +1202,6 @@ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
-config MICROCODE_INTEL_EARLY
- bool
-
-config MICROCODE_AMD_EARLY
- bool
-
-config MICROCODE_EARLY
- bool "Early load microcode"
- depends on MICROCODE=y && BLK_DEV_INITRD
- select MICROCODE_INTEL_EARLY if MICROCODE_INTEL
- select MICROCODE_AMD_EARLY if MICROCODE_AMD
- default y
- help
- This option provides functionality to read additional microcode data
- at the beginning of initrd image. The data tells kernel to load
- microcode to CPU's as early as possible. No functional change if no
- microcode data is glued to the initrd, therefore it's safe to say Y.
-
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
---help---
@@ -2043,6 +2061,55 @@ config COMPAT_VDSO
If unsure, say N: if you are compiling your own kernel, you
are unlikely to be using a buggy version of glibc.
+choice
+ prompt "vsyscall table for legacy applications"
+ depends on X86_64
+ default LEGACY_VSYSCALL_EMULATE
+ help
+ Legacy user code that does not know how to find the vDSO expects
+ to be able to issue three syscalls by calling fixed addresses in
+ kernel space. Since this location is not randomized with ASLR,
+ it can be used to assist security vulnerability exploitation.
+
+ This setting can be changed at boot time via the kernel command
+ line parameter vsyscall=[native|emulate|none].
+
+ On a system with recent enough glibc (2.14 or newer) and no
+ static binaries, you can say None without a performance penalty
+ to improve security.
+
+ If unsure, select "Emulate".
+
+ config LEGACY_VSYSCALL_NATIVE
+ bool "Native"
+ help
+ Actual executable code is located in the fixed vsyscall
+ address mapping, implementing time() efficiently. Since
+ this makes the mapping executable, it can be used during
+ security vulnerability exploitation (traditionally as
+ ROP gadgets). This configuration is not recommended.
+
+ config LEGACY_VSYSCALL_EMULATE
+ bool "Emulate"
+ help
+ The kernel traps and emulates calls into the fixed
+ vsyscall address mapping. This makes the mapping
+ non-executable, but it still contains known contents,
+ which could be used in certain rare security vulnerability
+ exploits. This configuration is recommended when userspace
+ still uses the vsyscall area.
+
+ config LEGACY_VSYSCALL_NONE
+ bool "None"
+ help
+ There will be no vsyscall mapping at all. This will
+ eliminate any risk of ASLR bypass due to the vsyscall
+ fixed address mapping. Attempts to use the vsyscalls
+ will be reported to dmesg, so that either old or
+ malicious userspace programs can be identified.
+
+endchoice
+
config CMDLINE_BOOL
bool "Built-in kernel command line"
---help---
@@ -2632,6 +2699,19 @@ config PMC_ATOM
def_bool y
depends on PCI
+config VMD
+ depends on PCI_MSI
+ tristate "Volume Management Device Driver"
+ default N
+ ---help---
+ Adds support for the Intel Volume Management Device (VMD). VMD is a
+ secondary PCI host bridge that allows PCI Express root ports,
+ and devices attached to them, to be removed from the default
+ PCI domain and placed within the VMD domain. This provides
+ more bus resources than are otherwise possible with a
+ single domain. If you know your system provides one of these and
+ has devices attached to it, say Y; if you are not sure, say N.
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6983314c8b37..3ba5ff2f2d08 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -3,10 +3,6 @@ choice
prompt "Processor family"
default M686 if X86_32
default GENERIC_CPU if X86_64
-
-config M486
- bool "486"
- depends on X86_32
---help---
This is the processor type of your CPU. This information is
used for optimizing purposes. In order to compile a kernel
@@ -23,9 +19,9 @@ config M486
Here are the settings recommended for greatest speed:
- "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or
- SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
+ SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
- "586" for generic Pentium CPUs lacking the TSC
- (time stamp counter) register.
+ (time stamp counter) register.
- "Pentium-Classic" for the Intel Pentium.
- "Pentium-MMX" for the Intel Pentium MMX.
- "Pentium-Pro" for the Intel Pentium Pro.
@@ -34,17 +30,31 @@ config M486
- "Pentium-4" for the Intel Pentium 4 or P4-based Celeron.
- "K6" for the AMD K6, K6-II and K6-III (aka K6-3D).
- "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird).
+ - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs.
- "Crusoe" for the Transmeta Crusoe series.
- "Efficeon" for the Transmeta Efficeon series.
- "Winchip-C6" for original IDT Winchip.
- "Winchip-2" for IDT Winchips with 3dNow! capabilities.
+ - "AMD Elan" for the 32-bit AMD Elan embedded CPU.
- "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
- "Geode GX/LX" For AMD Geode GX and LX processors.
- "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
- "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
- "VIA C7" for VIA C7.
+ - "Intel P4" for the Pentium 4/Netburst microarchitecture.
+ - "Core 2/newer Xeon" for all core2 and newer Intel CPUs.
+ - "Intel Atom" for the Atom-microarchitecture CPUs.
+ - "Generic-x86-64" for a kernel which runs on any x86-64 CPU.
+
+ See each option's help text for additional details. If you don't know
+ what to do, choose "486".
- If you don't know what to do, choose "486".
+config M486
+ bool "486"
+ depends on X86_32
+ ---help---
+ Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel
+ 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
config M586
bool "586/K5/5x86/6x86/6x86MX"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8c0d3266173..9b18ed97a8a2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel. Note that with PAT support
- enabled, even in this case there are restrictions on /dev/mem
- use due to the cache aliasing requirements.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to PCI space and the BIOS code and data regions.
- This is sufficient for dosemu and X and all common users of
- /dev/mem.
-
- If in doubt, say Y.
-
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
@@ -65,10 +48,14 @@ config EARLY_PRINTK_EFI
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized.
+config X86_PTDUMP_CORE
+ def_bool n
+
config X86_PTDUMP
- bool "Export kernel pagetable layout to userspace via debugfs"
+ tristate "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
select DEBUG_FS
+ select X86_PTDUMP_CORE
---help---
Say Y here if you want to show the kernel pagetable layout in a
debugfs file. This information is only useful for kernel developers
@@ -79,7 +66,8 @@ config X86_PTDUMP
config EFI_PGT_DUMP
bool "Dump the EFI pagetable"
- depends on EFI && X86_PTDUMP
+ depends on EFI
+ select X86_PTDUMP_CORE
---help---
Enable this if you want to dump the EFI page table before
enabling virtual mode. This can be used to debug miscellaneous
@@ -105,6 +93,34 @@ config DEBUG_RODATA_TEST
feature as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ depends on DEBUG_RODATA
+ select X86_PTDUMP_CORE
+ ---help---
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving
+ W+X mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ x86/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
+
+ Note that even if the check fails, your kernel is possibly
+ still fine, as W+X mappings are not a security hole in
+ themselves, what they do is that they make the exploitation
+ of other unfixed kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
config DEBUG_SET_MODULE_RONX
bool "Set loadable kernel module data as NX and text as RO"
depends on MODULES
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 747860c696e1..4086abca0b32 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -159,15 +159,23 @@ endif
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
+# do binutils support CFI?
+cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
+# is .cfi_signal_frame supported too?
+cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
-KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0d553e54171b..bbe1a62efc02 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,13 +9,13 @@
# Changed by many, many contributors over the years.
#
+KASAN_SANITIZE := n
+
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
-KASAN_SANITIZE := n
-
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
@@ -60,6 +60,7 @@ clean-files += cpustr.h
KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE := n
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 0033e96c3f09..9011a88353de 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -23,7 +23,6 @@
#include <stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
-#include <asm/boot.h>
#include <asm/setup.h>
#include "bitops.h"
#include "ctype.h"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0a291cdfaf77..f9ce75d80101 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -33,6 +33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE :=n
LDFLAGS := -m elf_$(UTS_MACHINE)
LDFLAGS_vmlinux := -T
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index db51c1f27446..583d539a4197 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -624,7 +624,7 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
static efi_status_t
__gop_query32(struct efi_graphics_output_protocol_32 *gop32,
struct efi_graphics_output_mode_info **info,
- unsigned long *size, u32 *fb_base)
+ unsigned long *size, u64 *fb_base)
{
struct efi_graphics_output_protocol_mode_32 *mode;
efi_status_t status;
@@ -650,7 +650,8 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto,
unsigned long nr_gops;
u16 width, height;
u32 pixels_per_scan_line;
- u32 fb_base;
+ u32 ext_lfb_base;
+ u64 fb_base;
struct efi_pixel_bitmask pixel_info;
int pixel_format;
efi_status_t status;
@@ -667,7 +668,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto,
bool conout_found = false;
void *dummy = NULL;
u32 h = handles[i];
- u32 current_fb_base;
+ u64 current_fb_base;
status = efi_call_early(handle_protocol, h,
proto, (void **)&gop32);
@@ -715,6 +716,13 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto,
si->lfb_width = width;
si->lfb_height = height;
si->lfb_base = fb_base;
+
+ ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
+ if (ext_lfb_base) {
+ si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ si->ext_lfb_base = ext_lfb_base;
+ }
+
si->pages = 1;
setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
@@ -729,7 +737,7 @@ out:
static efi_status_t
__gop_query64(struct efi_graphics_output_protocol_64 *gop64,
struct efi_graphics_output_mode_info **info,
- unsigned long *size, u32 *fb_base)
+ unsigned long *size, u64 *fb_base)
{
struct efi_graphics_output_protocol_mode_64 *mode;
efi_status_t status;
@@ -755,7 +763,8 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto,
unsigned long nr_gops;
u16 width, height;
u32 pixels_per_scan_line;
- u32 fb_base;
+ u32 ext_lfb_base;
+ u64 fb_base;
struct efi_pixel_bitmask pixel_info;
int pixel_format;
efi_status_t status;
@@ -772,7 +781,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto,
bool conout_found = false;
void *dummy = NULL;
u64 h = handles[i];
- u32 current_fb_base;
+ u64 current_fb_base;
status = efi_call_early(handle_protocol, h,
proto, (void **)&gop64);
@@ -820,6 +829,13 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto,
si->lfb_width = width;
si->lfb_height = height;
si->lfb_base = fb_base;
+
+ ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
+ if (ext_lfb_base) {
+ si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ si->ext_lfb_base = ext_lfb_base;
+ }
+
si->pages = 1;
setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2d6b309c8e9a..6236b9ec4b76 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -154,7 +154,7 @@ extra_header_fields:
#else
.quad 0 # ImageBase
#endif
- .long CONFIG_PHYSICAL_ALIGN # SectionAlignment
+ .long 0x20 # SectionAlignment
.long 0x20 # FileAlignment
.word 0 # MajorOperatingSystemVersion
.word 0 # MinorOperatingSystemVersion
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index aa8a96b052e3..95c7a818c0ed 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -19,6 +19,8 @@
#include "video.h"
#include "vesa.h"
+#include <uapi/asm/boot.h>
+
/*
* Common variables
*/
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 05111bb8d018..77780e386e9b 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -13,6 +13,8 @@
* Select video mode
*/
+#include <uapi/asm/boot.h>
+
#include "boot.h"
#include "video.h"
#include "vesa.h"
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9a2838cf0591..b9b912a44d61 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,8 @@
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 4c65c70e628b..d84456924563 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -567,7 +567,8 @@ static int __init camellia_aesni_init(void)
return -ENODEV;
}
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index bacaa13acac5..93d8f295784e 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -559,7 +559,8 @@ static int __init camellia_aesni_init(void)
return -ENODEV;
}
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index be00aa48b2b5..8648158f3916 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -469,7 +469,8 @@ static int __init cast5_init(void)
{
const char *feature_name;
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 5dbba7224221..fca459578c35 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -591,7 +591,8 @@ static int __init cast6_init(void)
{
const char *feature_name;
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 712b13047b41..3a33124e9112 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -157,7 +157,9 @@ ENTRY(chacha20_4block_xor_ssse3)
# done with the slightly better performing SSSE3 byte shuffling,
# 7/12-bit word rotation uses traditional shift+OR.
- sub $0x40,%rsp
+ mov %rsp,%r11
+ sub $0x80,%rsp
+ and $~63,%rsp
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -620,6 +622,6 @@ ENTRY(chacha20_4block_xor_ssse3)
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)
- add $0x40,%rsp
+ mov %r11,%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index effe2160b7c5..8baaff5af0b5 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -125,12 +125,12 @@ static struct crypto_alg alg = {
static int __init chacha20_simd_mod_init(void)
{
- if (!cpu_has_ssse3)
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
- cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#endif
return crypto_register_alg(&alg);
}
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 81a595d75cf5..0e9871693f24 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -257,7 +257,7 @@ static int __init crc32c_intel_mod_init(void)
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
- if (cpu_has_pclmulqdq) {
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 225be06edc80..4fe27e074194 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -330,7 +330,7 @@ ENDPROC(crc_pcl)
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.section .rotata, "a", %progbits
+.section .rodata, "a", %progbits
.align 8
K_table:
.long 0x493c7d27, 0x00000001
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 440df0c7a2ee..a69321a77783 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -219,6 +219,29 @@ static int ghash_async_final(struct ahash_request *req)
}
}
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ ghash_async_init(req);
+ memcpy(dctx, in, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memcpy(out, dctx, sizeof(*dctx));
+ return 0;
+
+}
+
static int ghash_async_digest(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -288,8 +311,11 @@ static struct ahash_alg ghash_async_alg = {
.final = ghash_async_final,
.setkey = ghash_async_setkey,
.digest = ghash_async_digest,
+ .export = ghash_async_export,
+ .import = ghash_async_import,
.halg = {
.digestsize = GHASH_DIGEST_SIZE,
+ .statesize = sizeof(struct ghash_desc_ctx),
.base = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f7170d764f32..4264a3d59589 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -184,7 +184,7 @@ static int __init poly1305_simd_mod_init(void)
#ifdef CONFIG_AS_AVX2
poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
- cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
if (poly1305_use_avx2)
alg.descsize += 10 * sizeof(u32);
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 7d838dc4d888..6d198342e2de 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -542,7 +542,8 @@ static int __init init(void)
pr_info("AVX2 instructions are not detected.\n");
return -ENODEV;
}
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index da7dafc9b16d..5dc37026c7ce 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -597,7 +597,8 @@ static int __init serpent_init(void)
{
const char *feature_name;
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 000000000000..874a651b9e7d
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,302 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define RSPSAVE %rax
+
+/* gcc conversion */
+#define FRAME_SIZE 32 /* space for 2x16 bytes */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+ mov %rsp, RSPSAVE
+ sub $FRAME_SIZE, %rsp
+ and $~0xF, %rsp
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /* load initial hash values */
+ pinsrd $3, 1*16(DIGEST_PTR), E0
+ movdqu 0*16(DIGEST_PTR), ABCD
+ pand UPPER_WORD_MASK(%rip), E0
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa E0, (0*16)(%rsp)
+ movdqa ABCD, (1*16)(%rsp)
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG0
+ pshufb SHUF_MASK, MSG0
+ paddd MSG0, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG1
+ pshufb SHUF_MASK, MSG1
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG1, MSG0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG2
+ pshufb SHUF_MASK, MSG2
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG3
+ pshufb SHUF_MASK, MSG3
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 16-19 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 20-23 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 24-27 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 28-31 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 32-35 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 36-39 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 40-43 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 44-47 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 48-51 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 52-55 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 56-59 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 60-63 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $3, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 64-67 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $3, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 68-71 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $3, E1, ABCD
+ pxor MSG1, MSG3
+
+ /* Rounds 72-75 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $3, E0, ABCD
+
+ /* Rounds 76-79 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1rnds4 $3, E1, ABCD
+
+ /* Add current hash values with previously saved */
+ sha1nexte (0*16)(%rsp), E0
+ paddd (1*16)(%rsp), ABCD
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, 0*16(DIGEST_PTR)
+ pextrd $3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+ mov RSPSAVE, %rsp
+
+ ret
+ENDPROC(sha1_ni_transform)
+
+.data
+
+.align 64
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+UPPER_WORD_MASK:
+ .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7c48e8b20848..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -31,24 +31,11 @@
#include <crypto/sha1_base.h>
#include <asm/fpu/api.h>
+typedef void (sha1_transform_fn)(u32 *digest, const char *data,
+ unsigned int rounds);
-asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
- unsigned int rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
- unsigned int rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
-
-asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds);
-#endif
-
-static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
-
-static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha1_transform_fn *sha1_xform)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -61,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_transform_asm);
+ (sha1_block_fn *)sha1_xform);
kernel_fpu_end();
return 0;
}
-static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
{
if (!irq_fpu_usable())
return crypto_sha1_finup(desc, data, len, out);
@@ -76,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_transform_asm);
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
+ (sha1_block_fn *)sha1_xform);
+ sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
kernel_fpu_end();
return sha1_base_finish(desc, out);
}
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- return sha1_ssse3_finup(desc, NULL, 0, out);
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_ssse3);
}
-#ifdef CONFIG_AS_AVX2
-static void sha1_apply_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds)
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- /* Select the optimal transform based on data block size */
- if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(digest, data, rounds);
- else
- sha1_transform_avx(digest, data, rounds);
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ssse3_finup(desc, NULL, 0, out);
}
-#endif
-static struct shash_alg alg = {
+static struct shash_alg sha1_ssse3_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ssse3_update,
@@ -110,7 +102,7 @@ static struct shash_alg alg = {
.descsize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
- .cra_driver_name= "sha1-ssse3",
+ .cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
@@ -118,10 +110,62 @@ static struct shash_alg alg = {
}
};
+static int register_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shash(&sha1_ssse3_alg);
+ return 0;
+}
+
+static void unregister_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shash(&sha1_ssse3_alg);
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx_update,
+ .final = sha1_avx_final,
+ .finup = sha1_avx_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
pr_info("AVX detected but unusable.\n");
return false;
@@ -130,55 +174,197 @@ static bool __init avx_usable(void)
return true;
}
-#ifdef CONFIG_AS_AVX2
-static bool __init avx2_usable(void)
+static int register_sha1_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shash(&sha1_avx_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx(void)
{
- if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) &&
- boot_cpu_has(X86_FEATURE_BMI2))
+ if (avx_usable())
+ crypto_unregister_shash(&sha1_avx_alg);
+}
+
+#else /* CONFIG_AS_AVX */
+static inline int register_sha1_avx(void) { return 0; }
+static inline void unregister_sha1_avx(void) { }
+#endif /* CONFIG_AS_AVX */
+
+
+#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ && boot_cpu_has(X86_FEATURE_BMI1)
+ && boot_cpu_has(X86_FEATURE_BMI2))
return true;
return false;
}
+
+static void sha1_apply_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds)
+{
+ /* Select the optimal transform based on data block size */
+ if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(digest, data, rounds);
+ else
+ sha1_transform_avx(digest, data, rounds);
+}
+
+static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx2_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx2_update,
+ .final = sha1_avx2_final,
+ .finup = sha1_avx2_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shash(&sha1_avx2_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shash(&sha1_avx2_alg);
+}
+
+#else
+static inline int register_sha1_avx2(void) { return 0; }
+static inline void unregister_sha1_avx2(void) { }
#endif
+
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ni_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ni_update,
+ .final = sha1_ni_final,
+ .finup = sha1_ni_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shash(&sha1_ni_alg);
+ return 0;
+}
+
+static void unregister_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shash(&sha1_ni_alg);
+}
+
+#else
+static inline int register_sha1_ni(void) { return 0; }
+static inline void unregister_sha1_ni(void) { }
#endif
static int __init sha1_ssse3_mod_init(void)
{
- char *algo_name;
+ if (register_sha1_ssse3())
+ goto fail;
- /* test for SSSE3 first */
- if (cpu_has_ssse3) {
- sha1_transform_asm = sha1_transform_ssse3;
- algo_name = "SSSE3";
+ if (register_sha1_avx()) {
+ unregister_sha1_ssse3();
+ goto fail;
}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
- sha1_transform_asm = sha1_transform_avx;
- algo_name = "AVX";
-#ifdef CONFIG_AS_AVX2
- /* allow AVX2 to override AVX, it's a little faster */
- if (avx2_usable()) {
- sha1_transform_asm = sha1_apply_transform_avx2;
- algo_name = "AVX2";
- }
-#endif
+ if (register_sha1_avx2()) {
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
}
-#endif
- if (sha1_transform_asm) {
- pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
- return crypto_register_shash(&alg);
+ if (register_sha1_ni()) {
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha1_ssse3_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ unregister_sha1_ni();
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
}
module_init(sha1_ssse3_mod_init);
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 000000000000..748cdf21a938
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,353 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSGTMP0 %xmm3
+#define MSGTMP1 %xmm4
+#define MSGTMP2 %xmm5
+#define MSGTMP3 %xmm6
+#define MSGTMP4 %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(DIGEST_PTR), STATE0
+ movdqu 1*16(DIGEST_PTR), STATE1
+
+ pshufd $0xB1, STATE0, STATE0 /* CDAB */
+ pshufd $0x1B, STATE1, STATE1 /* EFGH */
+ movdqa STATE0, MSGTMP4
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP0
+ paddd 0*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP1
+ paddd 1*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP2
+ paddd 2*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP3
+ paddd 3*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 16-19 */
+ movdqa MSGTMP0, MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 20-23 */
+ movdqa MSGTMP1, MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 24-27 */
+ movdqa MSGTMP2, MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 28-31 */
+ movdqa MSGTMP3, MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 32-35 */
+ movdqa MSGTMP0, MSG
+ paddd 8*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 36-39 */
+ movdqa MSGTMP1, MSG
+ paddd 9*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 40-43 */
+ movdqa MSGTMP2, MSG
+ paddd 10*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 44-47 */
+ movdqa MSGTMP3, MSG
+ paddd 11*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 48-51 */
+ movdqa MSGTMP0, MSG
+ paddd 12*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 52-55 */
+ movdqa MSGTMP1, MSG
+ paddd 13*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 56-59 */
+ movdqa MSGTMP2, MSG
+ paddd 14*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 60-63 */
+ movdqa MSGTMP3, MSG
+ paddd 15*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, STATE0, STATE0 /* FEBA */
+ pshufd $0xB1, STATE1, STATE1 /* DCHG */
+ movdqa STATE0, MSGTMP4
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, MSGTMP4, STATE1 /* HGFE */
+
+ movdqu STATE0, 0*16(DIGEST_PTR)
+ movdqu STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+ ret
+ENDPROC(sha256_ni_transform)
+
+.data
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f8097fc0d1d1..5f4d6086dc59 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -42,19 +42,10 @@
asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
u64 rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
- u64 rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
- u64 rounds);
-#endif
-
-static void (*sha256_transform_asm)(u32 *, const char *, u64);
+typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
-static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha256_transform_fn *sha256_xform)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -67,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_transform_asm);
+ (sha256_block_fn *)sha256_xform);
kernel_fpu_end();
return 0;
}
-static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
{
if (!irq_fpu_usable())
return crypto_sha256_finup(desc, data, len, out);
@@ -82,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_transform_asm);
- sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
+ (sha256_block_fn *)sha256_xform);
+ sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
kernel_fpu_end();
return sha256_base_finish(desc, out);
}
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_ssse3);
+}
+
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
+}
+
/* Add padding and return the message digest. */
static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha256_ssse3_finup(desc, NULL, 0, out);
}
-static struct shash_alg algs[] = { {
+static struct shash_alg sha256_ssse3_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = sha256_ssse3_update,
@@ -127,10 +130,77 @@ static struct shash_alg algs[] = { {
}
} };
+static int register_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_avx);
+}
+
+static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_avx);
+}
+
+static int sha256_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx_usable(void)
{
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
pr_info("AVX detected but unusable.\n");
return false;
@@ -138,47 +208,216 @@ static bool __init avx_usable(void)
return true;
}
-#endif
-static int __init sha256_ssse3_mod_init(void)
+static int register_sha256_avx(void)
{
- /* test for SSSE3 first */
- if (cpu_has_ssse3)
- sha256_transform_asm = sha256_transform_ssse3;
+ if (avx_usable())
+ return crypto_register_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+ return 0;
+}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
-#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2))
- sha256_transform_asm = sha256_transform_rorx;
- else
+static void unregister_sha256_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+}
+
+#else
+static inline int register_sha256_avx(void) { return 0; }
+static inline void unregister_sha256_avx(void) { }
#endif
- sha256_transform_asm = sha256_transform_avx;
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_rorx);
+}
+
+static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_rorx);
+}
+
+static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx2_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
}
-#endif
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
- if (sha256_transform_asm) {
-#ifdef CONFIG_AS_AVX
- if (sha256_transform_asm == sha256_transform_avx)
- pr_info("Using AVX optimized SHA-256 implementation\n");
-#ifdef CONFIG_AS_AVX2
- else if (sha256_transform_asm == sha256_transform_rorx)
- pr_info("Using AVX2 optimized SHA-256 implementation\n");
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha256_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+}
+
+#else
+static inline int register_sha256_avx2(void) { return 0; }
+static inline void unregister_sha256_avx2(void) { }
#endif
- else
+
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+ u64 rounds); /*unsigned int rounds);*/
+
+static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_ni_transform);
+}
+
+static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_ni_transform);
+}
+
+static int sha256_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ni_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ni",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+ return 0;
+}
+
+static void unregister_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+}
+
+#else
+static inline int register_sha256_ni(void) { return 0; }
+static inline void unregister_sha256_ni(void) { }
#endif
- pr_info("Using SSSE3 optimized SHA-256 implementation\n");
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+static int __init sha256_ssse3_mod_init(void)
+{
+ if (register_sha256_ssse3())
+ goto fail;
+
+ if (register_sha256_avx()) {
+ unregister_sha256_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ if (register_sha256_avx2()) {
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_ni()) {
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha256_ssse3_mod_fini(void)
{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ unregister_sha256_ni();
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
}
module_init(sha256_ssse3_mod_init);
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 2edad7b81870..34e5083d6f36 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -41,19 +41,11 @@
asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
u64 rounds);
-#ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
- u64 rounds);
-#endif
-#ifdef CONFIG_AS_AVX2
-asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
- u64 rounds);
-#endif
-static void (*sha512_transform_asm)(u64 *, const char *, u64);
+typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha512_transform_fn *sha512_xform)
{
struct sha512_state *sctx = shash_desc_ctx(desc);
@@ -66,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_transform_asm);
+ (sha512_block_fn *)sha512_xform);
kernel_fpu_end();
return 0;
}
-static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
{
if (!irq_fpu_usable())
return crypto_sha512_finup(desc, data, len, out);
@@ -81,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
kernel_fpu_begin();
if (len)
sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_transform_asm);
- sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
+ (sha512_block_fn *)sha512_xform);
+ sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
kernel_fpu_end();
return sha512_base_finish(desc, out);
}
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_ssse3);
+}
+
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
+}
+
/* Add padding and return the message digest. */
static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
{
return sha512_ssse3_finup(desc, NULL, 0, out);
}
-static struct shash_alg algs[] = { {
+static struct shash_alg sha512_ssse3_algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_base_init,
.update = sha512_ssse3_update,
@@ -126,10 +130,27 @@ static struct shash_alg algs[] = { {
}
} };
+static int register_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+}
+
#ifdef CONFIG_AS_AVX
-static bool __init avx_usable(void)
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
+ u64 rounds);
+static bool avx_usable(void)
{
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
if (cpu_has_avx)
pr_info("AVX detected but unusable.\n");
return false;
@@ -137,47 +158,185 @@ static bool __init avx_usable(void)
return true;
}
-#endif
-static int __init sha512_ssse3_mod_init(void)
+static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
- /* test for SSSE3 first */
- if (cpu_has_ssse3)
- sha512_transform_asm = sha512_transform_ssse3;
+ return sha512_update(desc, data, len, sha512_transform_avx);
+}
-#ifdef CONFIG_AS_AVX
- /* allow AVX to override SSSE3, it's a little faster */
- if (avx_usable()) {
-#ifdef CONFIG_AS_AVX2
- if (boot_cpu_has(X86_FEATURE_AVX2))
- sha512_transform_asm = sha512_transform_rorx;
- else
-#endif
- sha512_transform_asm = sha512_transform_avx;
+static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_avx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
}
-#endif
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx",
+ .cra_priority = 160,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
- if (sha512_transform_asm) {
-#ifdef CONFIG_AS_AVX
- if (sha512_transform_asm == sha512_transform_avx)
- pr_info("Using AVX optimized SHA-512 implementation\n");
-#ifdef CONFIG_AS_AVX2
- else if (sha512_transform_asm == sha512_transform_rorx)
- pr_info("Using AVX2 optimized SHA-512 implementation\n");
+static int register_sha512_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+}
+#else
+static inline int register_sha512_avx(void) { return 0; }
+static inline void unregister_sha512_avx(void) { }
#endif
- else
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+ u64 rounds);
+
+static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_rorx);
+}
+
+static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_rorx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx2_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx2",
+ .cra_priority = 170,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha512_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+}
+#else
+static inline int register_sha512_avx2(void) { return 0; }
+static inline void unregister_sha512_avx2(void) { }
#endif
- pr_info("Using SSSE3 optimized SHA-512 implementation\n");
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+static int __init sha512_ssse3_mod_init(void)
+{
+
+ if (register_sha512_ssse3())
+ goto fail;
+
+ if (register_sha512_avx()) {
+ unregister_sha512_ssse3();
+ goto fail;
}
- pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+ if (register_sha512_avx2()) {
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
return -ENODEV;
}
static void __exit sha512_ssse3_mod_fini(void)
{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ unregister_sha512_avx2();
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
}
module_init(sha512_ssse3_mod_init);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index c2bd0ce718ee..b7a3904b953c 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -558,7 +558,7 @@ static int __init twofish_init(void)
{
const char *feature_name;
- if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
pr_info("CPU feature '%s' is not supported.\n", feature_name);
return -ENODEV;
}
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3c71dd947c7b..e32206e09868 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,3 +1,5 @@
+#include <linux/jump_label.h>
+
/*
x86 function call convention, 64-bit:
@@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* CONFIG_X86_64 */
+/*
+ * This does 'call enter_from_user_mode' unless we can avoid it based on
+ * kernel config or using the static jump infrastructure.
+ */
+.macro CALL_enter_from_user_mode
+#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef HAVE_JUMP_LABEL
+ STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+#endif
+ call enter_from_user_mode
+.Lafter_call_\@:
+#endif
+.endm
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..03663740c866 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -24,10 +24,19 @@
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/vdso.h>
+#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+ unsigned long top_of_stack =
+ (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+ return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
@@ -66,13 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
*/
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long ret = 0;
u32 work;
- BUG_ON(regs != task_pt_regs(current));
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ BUG_ON(regs != task_pt_regs(current));
- work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
+ work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
#ifdef CONFIG_CONTEXT_TRACKING
/*
@@ -154,11 +164,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
long ret = 0;
- u32 work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
+ u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
- BUG_ON(regs != task_pt_regs(current));
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ BUG_ON(regs != task_pt_regs(current));
/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -207,19 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs)
return syscall_trace_enter_phase2(regs, arch, phase1_result);
}
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
- unsigned long top_of_stack =
- (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
- return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
+#define EXIT_TO_USERMODE_LOOP_FLAGS \
+ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
-/* Called with IRQs disabled. */
-__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
- if (WARN_ON(!irqs_disabled()))
- local_irq_disable();
-
/*
* In order to return to user mode, we need to have IRQs off with
* none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
@@ -229,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
* work to clear some of the flags can sleep.
*/
while (true) {
- u32 cached_flags =
- READ_ONCE(pt_regs_to_thread_info(regs)->flags);
-
- if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
- _TIF_UPROBE | _TIF_NEED_RESCHED |
- _TIF_USER_RETURN_NOTIFY)))
- break;
-
/* We have work to do. */
local_irq_enable();
@@ -260,50 +256,81 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
/* Disable IRQs and retry */
local_irq_disable();
+
+ cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+ if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ break;
+
}
+}
+
+/* Called with IRQs disabled. */
+__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ u32 cached_flags;
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
+ local_irq_disable();
+
+ lockdep_sys_exit();
+
+ cached_flags =
+ READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+ if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ exit_to_usermode_loop(regs, cached_flags);
user_enter();
}
+#define SYSCALL_EXIT_WORK_FLAGS \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+ _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+
+static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
+{
+ bool step;
+
+ audit_syscall_exit(regs);
+
+ if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, regs->ax);
+
+ /*
+ * If TIF_SYSCALL_EMU is set, we only get here because of
+ * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+ * We already reported this syscall instruction in
+ * syscall_trace_enter().
+ */
+ step = unlikely(
+ (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+ == _TIF_SINGLESTEP);
+ if (step || cached_flags & _TIF_SYSCALL_TRACE)
+ tracehook_report_syscall_exit(regs, step);
+}
+
/*
* Called with IRQs on and fully valid regs. Returns with IRQs off in a
* state such that we can immediately switch to user mode.
*/
-__visible void syscall_return_slowpath(struct pt_regs *regs)
+__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
u32 cached_flags = READ_ONCE(ti->flags);
- bool step;
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
- if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
- regs->orig_ax))
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
local_irq_enable();
/*
* First do one-time work. If these work items are enabled, we
* want to run them exactly once per syscall exit with IRQs on.
*/
- if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
- _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
- audit_syscall_exit(regs);
-
- if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, regs->ax);
-
- /*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter().
- */
- step = unlikely(
- (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
- == _TIF_SINGLESTEP);
- if (step || cached_flags & _TIF_SYSCALL_TRACE)
- tracehook_report_syscall_exit(regs, step);
- }
+ if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
+ syscall_slow_exit_work(regs, cached_flags);
#ifdef CONFIG_COMPAT
/*
@@ -316,3 +343,144 @@ __visible void syscall_return_slowpath(struct pt_regs *regs)
local_irq_disable();
prepare_exit_to_usermode(regs);
}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+/*
+ * Does a 32-bit syscall. Called with IRQs on and does all entry and
+ * exit work and returns with IRQs off. This function is extremely hot
+ * in workloads that use it, and it's usually called from
+ * do_fast_syscall_32, so forcibly inline it to improve performance.
+ */
+#ifdef CONFIG_X86_32
+/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
+__visible
+#else
+/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
+static
+#endif
+__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
+ unsigned int nr = (unsigned int)regs->orig_ax;
+
+#ifdef CONFIG_IA32_EMULATION
+ ti->status |= TS_COMPAT;
+#endif
+
+ if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
+ /*
+ * Subtlety here: if ptrace pokes something larger than
+ * 2^32-1 into orig_ax, this truncates it. This may or
+ * may not be necessary, but it matches the old asm
+ * behavior.
+ */
+ nr = syscall_trace_enter(regs);
+ }
+
+ if (likely(nr < IA32_NR_syscalls)) {
+ /*
+ * It's possible that a 32-bit syscall implementation
+ * takes a 64-bit parameter but nonetheless assumes that
+ * the high bits are zero. Make sure we zero-extend all
+ * of the args.
+ */
+ regs->ax = ia32_sys_call_table[nr](
+ (unsigned int)regs->bx, (unsigned int)regs->cx,
+ (unsigned int)regs->dx, (unsigned int)regs->si,
+ (unsigned int)regs->di, (unsigned int)regs->bp);
+ }
+
+ syscall_return_slowpath(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* Handles INT80 on 64-bit kernels */
+__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+{
+ local_irq_enable();
+ do_syscall_32_irqs_on(regs);
+}
+#endif
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /*
+ * Fetch EBP from where the vDSO stashed it.
+ *
+ * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
+ */
+ local_irq_enable();
+ if (
+#ifdef CONFIG_X86_64
+ /*
+ * Micro-optimization: the pointer we're following is explicitly
+ * 32 bits, so it can't be out of range.
+ */
+ __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#else
+ get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#endif
+ ) {
+
+ /* User code screwed up. */
+ local_irq_disable();
+ regs->ax = -EFAULT;
+#ifdef CONFIG_CONTEXT_TRACKING
+ enter_from_user_mode();
+#endif
+ prepare_exit_to_usermode(regs);
+ return 0; /* Keep it simple: use IRET. */
+ }
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+ * SYSRETL is available on all 64-bit CPUs, so we don't need to
+ * bother with SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ */
+ return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+ /*
+ * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ *
+ * We don't allow syscalls at all from VM86 mode, but we still
+ * need to check VM, because we might be returning from sys_vm86.
+ */
+ return static_cpu_has(X86_FEATURE_SEP) &&
+ regs->cs == __USER_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
+#endif
+}
+#endif
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf8cf70..77d8c5112900 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -3,7 +3,7 @@
*
* entry_32.S contains the system-call and low-level fault and trap handling routines.
*
- * Stack layout in 'syscall_exit':
+ * Stack layout while running C code:
* ptrace needs to have all registers on the stack.
* If the order here is changed, it needs to be
* updated in fork.c:copy_process(), signal.c:do_signal(),
@@ -153,13 +153,13 @@
#endif /* CONFIG_X86_32_LAZY_GS */
-.macro SAVE_ALL
+.macro SAVE_ALL pt_regs_ax=%eax
cld
PUSH_GS
pushl %fs
pushl %es
pushl %ds
- pushl %eax
+ pushl \pt_regs_ax
pushl %ebp
pushl %edi
pushl %esi
@@ -211,7 +211,11 @@ ENTRY(ret_from_fork)
popl %eax
pushl $0x0202 # Reset kernel eflags
popfl
- jmp syscall_exit
+
+ /* When we fork, we trace the syscall return in the child, too. */
+ movl %esp, %eax
+ call syscall_return_slowpath
+ jmp restore_all
END(ret_from_fork)
ENTRY(ret_from_kernel_thread)
@@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread)
movl PT_EBP(%esp), %eax
call *PT_EBX(%esp)
movl $0, PT_EAX(%esp)
- jmp syscall_exit
+
+ /*
+ * Kernel threads return to userspace as if returning from a syscall.
+ * We should check whether anything actually uses this path and, if so,
+ * consider switching it over to ret_from_fork.
+ */
+ movl %esp, %eax
+ call syscall_return_slowpath
+ jmp restore_all
ENDPROC(ret_from_kernel_thread)
/*
@@ -255,7 +267,6 @@ ret_from_intr:
jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl %esp, %eax
@@ -276,77 +287,50 @@ need_resched:
END(resume_kernel)
#endif
-/*
- * SYSENTER_RETURN points to after the SYSENTER instruction
- * in the vsyscall page. See vsyscall-sysentry.S, which defines
- * the symbol.
- */
-
# SYSENTER call handler stub
ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
+ pushl $__USER_DS /* pt_regs->ss */
+ pushl %ebp /* pt_regs->sp (stashed in bp) */
+ pushfl /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
+ pushl $__USER_CS /* pt_regs->cs */
+ pushl $0 /* pt_regs->ip = 0 (placeholder) */
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+
/*
- * Interrupts are disabled here, but we can't trace it until
- * enough kernel state to call TRACE_IRQS_OFF can be called - but
- * we immediately enable interrupts at that point anyway.
- */
- pushl $__USER_DS
- pushl %ebp
- pushfl
- orl $X86_EFLAGS_IF, (%esp)
- pushl $__USER_CS
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary: TI_sysenter_return
- * is relative to thread_info, which is at the bottom of the
- * kernel stack page. 4*4 means the 4 words pushed above;
- * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
- * and THREAD_SIZE takes us to the bottom.
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-
- pushl %eax
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3, %ebp
- jae syscall_fault
- ASM_STAC
-1: movl (%ebp), %ebp
- ASM_CLAC
- movl %ebp, PT_EBP(%esp)
- _ASM_EXTABLE(1b, syscall_fault)
-
- GET_THREAD_INFO(%ebp)
-
- testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
- jnz syscall_trace_entry
-sysenter_do_call:
- cmpl $(NR_syscalls), %eax
- jae sysenter_badsys
- call *sys_call_table(, %eax, 4)
-sysenter_after_call:
- movl %eax, PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jnz syscall_exit_work_irqs_off
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp, %ebp
- TRACE_IRQS_ON
+
+ movl %esp, %eax
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+
+/* Opportunistic SYSEXIT */
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movl PT_EIP(%esp), %edx /* pt_regs->ip */
+ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
+ popl %ebx /* pt_regs->bx */
+ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+
+ /*
+ * Return back to the vDSO, which will pop ecx and edx.
+ * Don't bother with DS and ES (they already contain __USER_DS).
+ */
+ sti
+ sysexit
.pushsection .fixup, "ax"
2: movl $0, PT_FS(%esp)
@@ -359,21 +343,18 @@ ENDPROC(entry_SYSENTER_32)
# system call handler stub
ENTRY(entry_INT80_32)
ASM_CLAC
- pushl %eax # save orig_eax
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(NR_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(, %eax, 4)
-syscall_after_call:
- movl %eax, PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- jmp syscall_exit_work
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+
+ /*
+ * User mode is traced as though IRQs are on. Unlike the 64-bit
+ * case, INT80 is a trap gate on 32-bit kernels, so interrupts
+ * are already on (unless user code is messing around with iopl).
+ */
+
+ movl %esp, %eax
+ call do_syscall_32_irqs_on
+.Lsyscall_32_done:
restore_all:
TRACE_IRQS_IRET
@@ -450,47 +431,6 @@ ldt_ss:
#endif
ENDPROC(entry_INT80_32)
- # perform syscall exit tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS, PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(NR_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work_irqs_off:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
-
-syscall_exit_work:
- movl %esp, %eax
- call syscall_return_slowpath
- jmp restore_all
-END(syscall_exit_work)
-
-syscall_fault:
- ASM_CLAC
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT, PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS, %eax
- jmp syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
- movl $-ENOSYS, %eax
- jmp sysenter_after_call
-END(sysenter_badsys)
-
.macro FIXUP_ESPFIX_STACK
/*
* Switch back for ESPFIX stack to the normal zerobased stack
@@ -613,11 +553,6 @@ ENTRY(native_iret)
iret
_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 055a01de7c8d..9d34d3cfceb6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -391,20 +391,16 @@ GLOBAL(stub_execveat)
jmp return_from_execve
END(stub_execveat)
-#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_X32_ABI)
.align 8
GLOBAL(stub_x32_execve)
-GLOBAL(stub32_execve)
call compat_sys_execve
jmp return_from_execve
-END(stub32_execve)
END(stub_x32_execve)
.align 8
GLOBAL(stub_x32_execveat)
-GLOBAL(stub32_execveat)
call compat_sys_execveat
jmp return_from_execve
-END(stub32_execveat)
END(stub_x32_execveat)
#endif
@@ -513,9 +509,18 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks). Since TRACE_IRQS_OFF idempotent,
+ * the simplest way to handle it is to just call it twice if
+ * we enter from user mode. There's no reason to optimize this since
+ * TRACE_IRQS_OFF is a no-op if lockdep is off.
+ */
+ TRACE_IRQS_OFF
+
+ CALL_enter_from_user_mode
1:
/*
@@ -557,7 +562,6 @@ ret_from_intr:
jz retint_kernel
/* Interrupt came from user space */
- LOCKDEP_SYS_EXIT_IRQ
GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
@@ -587,7 +591,7 @@ retint_kernel:
* At this label, code paths which return to kernel and to user,
* which come from interrupts/exception and from syscalls, merge.
*/
-restore_regs_and_iret:
+GLOBAL(restore_regs_and_iret)
RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
RESTORE_C_REGS
@@ -1054,12 +1058,16 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks).
+ */
+ TRACE_IRQS_OFF
+ CALL_enter_from_user_mode
+ ret
.Lerror_entry_done:
-
TRACE_IRQS_OFF
ret
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index a9360d40fb7f..ff1c6d61f332 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -16,25 +16,8 @@
#include <linux/linkage.h>
#include <linux/err.h>
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit ia32_ret_from_sys_call_irqs_off
-# define sysretl_audit ia32_ret_from_sys_call_irqs_off
-#endif
-
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-#endif
-
/*
* 32-bit SYSENTER instruction entry.
*
@@ -58,219 +41,88 @@ ENDPROC(native_usergs_sysret32)
* with the int 0x80 path.
*/
ENTRY(entry_SYSENTER_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
- /* Zero-extending 32-bit regs, do not remove */
- movl %ebp, %ebp
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
movl %eax, %eax
- movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rbp /* pt_regs->sp */
- pushfq /* pt_regs->flags */
+ pushq %rbp /* pt_regs->sp (stashed in bp) */
+
+ /*
+ * Push flags. This is nasty. First, interrupts are currently
+ * off, but we need pt_regs->flags to have IF set. Second, even
+ * if TF was set when SYSENTER started, it's clear by now. We fix
+ * that later using TIF_SINGLESTEP.
+ */
+ pushfq /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
+ ASM_CLAC /* Clear AC after saving FLAGS */
+
pushq $__USER32_CS /* pt_regs->cs */
- pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->ip = 0 (placeholder) */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ pushq %r8 /* pt_regs->r12 = 0 */
+ pushq %r8 /* pt_regs->r13 = 0 */
+ pushq %r8 /* pt_regs->r14 = 0 */
+ pushq %r8 /* pt_regs->r15 = 0 */
cld
- sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
-
- /*
- * no need to do an access_ok check here because rbp has been
- * 32-bit zero extended
- */
- ASM_STAC
-1: movl (%rbp), %ebp
- _ASM_EXTABLE(1b, ia32_badarg)
- ASM_CLAC
/*
* Sysenter doesn't filter flags, so we need to clear NT
* ourselves. To save a few cycles, we can check whether
* NT was set instead of doing an unconditional popfq.
- */
- testl $X86_EFLAGS_NT, EFLAGS(%rsp)
- jnz sysenter_fix_flags
-sysenter_flags_fixed:
-
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysenter_tracesys
-
-sysenter_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
-sysenter_dispatch:
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysexit_audit
-sysexit_from_sys_call:
- /*
- * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
- * NMI between STI and SYSEXIT has poorly specified behavior,
- * and and NMI followed by an IRQ with usergs is fatal. So
- * we just pretend we're using SYSEXIT but we really use
- * SYSRETL instead.
- *
- * This code path is still called 'sysexit' because it pairs
- * with 'sysenter' and it uses the SYSENTER calling convention.
- */
- andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RIP(%rsp), %ecx /* User %eip */
- movq RAX(%rsp), %rax
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- xorl %edx, %edx /* Do not leak kernel information */
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r10
- movl EFLAGS(%rsp), %r11d /* User eflags */
- TRACE_IRQS_ON
-
- /*
- * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
- * since it avoids a dicey window with interrupts enabled.
- */
- movl RSP(%rsp), %esp
-
- /*
- * USERGS_SYSRET32 does:
- * gsbase = user's gs base
- * eip = ecx
- * rflags = r11
- * cs = __USER32_CS
- * ss = __USER_DS
- *
- * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
- *
- * pop %ebp
- * pop %edx
- * pop %ecx
- *
- * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
- * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
- * address (already known to user code), and R12-R15 are
- * callee-saved and therefore don't contain any interesting
- * kernel data.
- */
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
- .macro auditsys_entry_common
- /*
- * At this point, registers hold syscall args in the 32-bit syscall ABI:
- * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
*
- * We want to pass them to __audit_syscall_entry(), which is a 64-bit
- * C function with 5 parameters, so shuffle them to match what
- * the function expects: RDI,RSI,RDX,RCX,R8.
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
*/
- movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */
- xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */
- /* arg3 (RDX) <= 2nd syscall arg (ECX) */
- movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */
- movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */
- call __audit_syscall_entry
+ testl $X86_EFLAGS_NT, EFLAGS(%rsp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
/*
- * We are going to jump back to the syscall dispatch code.
- * Prepare syscall args as required by the 64-bit C ABI.
- * Registers clobbered by __audit_syscall_entry() are
- * loaded from pt_regs on stack:
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- movl ORIG_RAX(%rsp), %eax /* syscall number */
- movl %ebx, %edi /* arg1 */
- movl RCX(%rsp), %esi /* arg2 */
- movl RDX(%rsp), %edx /* arg3 */
- movl RSI(%rsp), %ecx /* arg4 */
- movl RDI(%rsp), %r8d /* arg5 */
- .endm
-
- .macro auditsys_exit exit
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_ret_from_sys_call
- movl %eax, %esi /* second arg, syscall return value */
- cmpl $-MAX_ERRNO, %eax /* is it an error ? */
- jbe 1f
- movslq %eax, %rsi /* if error sign extend to 64 bits */
-1: setbe %al /* 1 if error, 0 if not */
- movzbl %al, %edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
- DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz \exit
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- jmp int_ret_from_sys_call_irqs_off
- .endm
-
-sysenter_auditsys:
- auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
- jmp sysenter_dispatch
-sysexit_audit:
- auditsys_exit sysexit_from_sys_call
-#endif
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ jmp sysret32_from_system_call
-sysenter_fix_flags:
- pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+.Lsysenter_fix_flags:
+ pushq $X86_EFLAGS_FIXED
popfq
- jmp sysenter_flags_fixed
-
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz sysenter_auditsys
-#endif
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- jmp sysenter_do_call
+ jmp .Lsysenter_flags_fixed
ENDPROC(entry_SYSENTER_compat)
/*
@@ -298,21 +150,14 @@ ENDPROC(entry_SYSENTER_compat)
* edi arg5
* esp user stack
* 0(%esp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
*/
ENTRY(entry_SYSCALL_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+
+ /* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
@@ -327,162 +172,69 @@ ENTRY(entry_SYSCALL_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rbp /* pt_regs->cx */
- movl %ebp, %ecx
+ pushq %rbp /* pt_regs->cx (stashed in bp) */
pushq $-ENOSYS /* pt_regs->ax */
- sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ pushq %r8 /* pt_regs->r12 = 0 */
+ pushq %r8 /* pt_regs->r13 = 0 */
+ pushq %r8 /* pt_regs->r14 = 0 */
+ pushq %r8 /* pt_regs->r15 = 0 */
/*
- * No need to do an access_ok check here because r8 has been
- * 32-bit zero extended:
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- ASM_STAC
-1: movl (%r8), %r9d
- _ASM_EXTABLE(1b, ia32_badarg)
- ASM_CLAC
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz cstar_tracesys
-
-cstar_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- /* r9 already loaded */ /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
-
-cstar_dispatch:
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
-
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysretl_audit
-sysretl_from_sys_call:
- andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl RIP(%rsp), %ecx
- movl EFLAGS(%rsp), %r11d
- movq RAX(%rsp), %rax
- xorq %r10, %r10
- xorq %r9, %r9
- xorq %r8, %r8
- TRACE_IRQS_ON
- movl RSP(%rsp), %esp
- /*
- * 64-bit->32-bit SYSRET restores eip from ecx,
- * eflags from r11 (but RF and VM bits are forced to 0),
- * cs and ss are loaded from MSRs.
- * (Note: 32-bit->32-bit SYSRET is different: since r11
- * does not exist, it merely sets eflags.IF=1).
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+
+ /* Opportunistic SYSRET */
+sysret32_from_system_call:
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
*
- * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
- * descriptor is not reinitialized. This means that we must
- * avoid SYSRET with SS == NULL, which could happen if we schedule,
- * exit the kernel, and re-enter using an interrupt vector. (All
- * interrupt entries on x86_64 set SS to NULL.) We prevent that
- * from happening by reloading SS in __switch_to.
- */
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
-cstar_auditsys:
- movl %r9d, R9(%rsp) /* register to be clobbered by call */
- auditsys_entry_common
- movl R9(%rsp), %r9d /* reload 6th syscall arg */
- jmp cstar_dispatch
-
-sysretl_audit:
- auditsys_exit sysretl_from_sys_call
-#endif
-
-cstar_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz cstar_auditsys
-#endif
- xchgl %r9d, %ebp
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %r9, R9(%rsp)
- movq %rax, R8(%rsp)
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- movl R9(%rsp), %r9d
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- xchgl %ebp, %r9d
- jmp cstar_do_call
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+ movq RSP-ORIG_RAX(%rsp), %rsp
+ swapgs
+ sysretl
END(entry_SYSCALL_compat)
-ia32_badarg:
- /*
- * So far, we've entered kernel mode, set AC, turned on IRQs, and
- * saved C regs except r8-r11. We haven't done any of the other
- * standard entry work, though. We want to bail, but we shouldn't
- * treat this as a syscall entry since we don't even know what the
- * args are. Instead, treat this as a non-syscall entry, finish
- * the entry work, and immediately exit after setting AX = -EFAULT.
- *
- * We're really just being polite here. Killing the task outright
- * would be a reasonable action, too. Given that the only valid
- * way to have gotten here is through the vDSO, and we already know
- * that the stack pointer is bad, the task isn't going to survive
- * for long no matter what we do.
- */
-
- ASM_CLAC /* undo STAC */
- movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */
-
- /* Fill in the rest of pt_regs */
- xorl %eax, %eax
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- SAVE_EXTRA_REGS
-
- /* Turn IRQs back off. */
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
-
- /* Now finish entering normal kernel mode. */
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
-
- /* And exit again. */
- jmp retint_user
-
-ia32_ret_from_sys_call_irqs_off:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-ia32_ret_from_sys_call:
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- jmp int_ret_from_sys_call
-
/*
* Emulated IA32 system calls via int 0x80.
*
@@ -507,14 +259,17 @@ ia32_ret_from_sys_call:
ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
*/
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
- ENABLE_INTERRUPTS(CLBR_NONE)
- /* Zero-extending 32-bit regs, do not remove */
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
@@ -524,67 +279,37 @@ ENTRY(entry_INT80_compat)
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 */
- pushq $0 /* pt_regs->r9 */
- pushq $0 /* pt_regs->r10 */
- pushq $0 /* pt_regs->r11 */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
cld
- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
-
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_tracesys
-
-ia32_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- jmp int_ret_from_sys_call
-
-ia32_tracesys:
- SAVE_EXTRA_REGS
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
/*
- * Reload arg registers from stack in case ptrace changed them.
- * Don't reload %eax because syscall_trace_enter() returned
- * the %rax value we should see. But do truncate it to 32 bits.
- * If it's -1 to make us punt the syscall, then (u32)-1 is still
- * an appropriately invalid value.
+ * User mode is traced as though IRQs are on, and the interrupt
+ * gate turned them off.
*/
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
- RESTORE_EXTRA_REGS
- jmp ia32_do_call
-END(entry_INT80_compat)
+ TRACE_IRQS_OFF
- .macro PTREGSCALL label, func
- ALIGN
-GLOBAL(\label)
- leaq \func(%rip), %rax
- jmp ia32_ptregs_common
- .endm
+ movq %rsp, %rdi
+ call do_syscall_32_irqs_off
+.Lsyscall_32_done:
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
- PTREGSCALL stub32_sigreturn, sys32_sigreturn
- PTREGSCALL stub32_fork, sys_fork
- PTREGSCALL stub32_vfork, sys_vfork
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
+ SWAPGS
+ jmp restore_regs_and_iret
+END(entry_INT80_compat)
ALIGN
GLOBAL(stub32_clone)
- leaq sys_clone(%rip), %rax
/*
* The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
* The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -593,12 +318,4 @@ GLOBAL(stub32_clone)
* so we need to swap arguments here before calling it:
*/
xchg %r8, %rcx
- jmp ia32_ptregs_common
-
- ALIGN
-ia32_ptregs_common:
- SAVE_EXTRA_REGS 8
- call *%rax
- RESTORE_EXTRA_REGS 8
- ret
-END(ia32_ptregs_common)
+ jmp sys_clone
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8ea34f94e973..9a6649857106 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -4,24 +4,21 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
+#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
#define SYM(sym, compat) compat
#else
#define SYM(sym, compat) sym
-#define ia32_sys_call_table sys_call_table
-#define __NR_syscall_compat_max __NR_syscall_max
#endif
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
/*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 4ac730b37f0b..41283d22be7a 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -14,13 +14,13 @@
# define __SYSCALL_X32(nr, sym, compat) /* nothing */
#endif
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
-extern void sys_ni_syscall(void);
+extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7663c455b9f6..cb713df81180 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -8,7 +8,7 @@
#
0 i386 restart_syscall sys_restart_syscall
1 i386 exit sys_exit
-2 i386 fork sys_fork stub32_fork
+2 i386 fork sys_fork sys_fork
3 i386 read sys_read
4 i386 write sys_write
5 i386 open sys_open compat_sys_open
@@ -17,7 +17,7 @@
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
-11 i386 execve sys_execve stub32_execve
+11 i386 execve sys_execve compat_sys_execve
12 i386 chdir sys_chdir
13 i386 time sys_time compat_sys_time
14 i386 mknod sys_mknod
@@ -125,7 +125,7 @@
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
117 i386 ipc sys_ipc compat_sys_ipc
118 i386 fsync sys_fsync
-119 i386 sigreturn sys_sigreturn stub32_sigreturn
+119 i386 sigreturn sys_sigreturn sys32_sigreturn
120 i386 clone sys_clone stub32_clone
121 i386 setdomainname sys_setdomainname
122 i386 uname sys_newuname
@@ -179,7 +179,7 @@
170 i386 setresgid sys_setresgid16
171 i386 getresgid sys_getresgid16
172 i386 prctl sys_prctl
-173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn
+173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn
174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
175 i386 rt_sigprocmask sys_rt_sigprocmask
176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
@@ -196,7 +196,7 @@
187 i386 sendfile sys_sendfile compat_sys_sendfile
188 i386 getpmsg
189 i386 putpmsg
-190 i386 vfork sys_vfork stub32_vfork
+190 i386 vfork sys_vfork sys_vfork
191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
192 i386 mmap2 sys_mmap_pgoff
193 i386 truncate64 sys_truncate64 sys32_truncate64
@@ -364,7 +364,7 @@
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
-358 i386 execveat sys_execveat stub32_execveat
+358 i386 execveat sys_execveat compat_sys_execveat
359 i386 socket sys_socket
360 i386 socketpair sys_socketpair
361 i386 bind sys_bind
@@ -382,3 +382,5 @@
373 i386 shutdown sys_shutdown
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
+376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842fdf1f6..dc1040a50bdc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,8 @@
322 64 execveat stub_execveat
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index a3d0767a6b29..c854541d93ff 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -4,6 +4,7 @@
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
@@ -19,9 +20,7 @@ obj-y += vma.o
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
-vdso_img-$(VDSO32-y) += 32-int80
-vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
-vdso_img-$(VDSO32-y) += 32-sysenter
+vdso_img-$(VDSO32-y) += 32
obj-$(VDSO32-y) += vdso32-setup.o
@@ -69,7 +68,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
- -DDISABLE_BRANCH_PROFILING
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -122,15 +121,6 @@ $(obj)/%.so: $(obj)/%.so.dbg
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
-#
-# Build multiple 32-bit vDSO images to choose from at boot time.
-#
-vdso32.so-$(VDSO32-y) += int80
-vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
-vdso32.so-$(VDSO32-y) += sysenter
-
-vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
-
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
@@ -139,14 +129,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
+targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o
targets += vdso32/vclock_gettime.o
-$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
-
-KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
-$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
@@ -157,13 +145,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
-$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
- $(obj)/vdso32/vdso32.lds \
- $(obj)/vdso32/vclock_gettime.o \
- $(obj)/vdso32/note.o \
- $(obj)/vdso32/%.o
+$(obj)/vdso32.so.dbg: FORCE \
+ $(obj)/vdso32/vdso32.lds \
+ $(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/note.o \
+ $(obj)/vdso32/system_call.o
$(call if_changed,vdso)
#
@@ -206,4 +194,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa649251..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,8 +17,10 @@
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
+#include <asm/pvclock.h>
#include <linux/math64.h>
#include <linux/time.h>
+#include <linux/kernel.h>
#define gtod (&VVAR(vsyscall_gtod_data))
@@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void)
}
#endif
-#ifndef BUILD_VDSO32
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
-#include <linux/kernel.h>
-#include <asm/vsyscall.h>
-#include <asm/fixmap.h>
-#include <asm/pvclock.h>
+#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
-#ifdef CONFIG_PARAVIRT_CLOCK
-
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
-{
- const struct pvclock_vsyscall_time_info *pvti_base;
- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-
- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-
- pvti_base = (struct pvclock_vsyscall_time_info *)
- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-
- return &pvti_base[offset];
-}
-
-static notrace cycle_t vread_pvclock(int *mode)
-{
- const struct pvclock_vsyscall_time_info *pvti;
- cycle_t ret;
- u64 last;
- u32 version;
- u8 flags;
- unsigned cpu, cpu1;
-
-
- /*
- * Note: hypervisor must guarantee that:
- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
- * 2. that per-CPU pvclock time info is updated if the
- * underlying CPU changes.
- * 3. that version is increased whenever underlying CPU
- * changes.
- *
- */
- do {
- cpu = __getcpu() & VGETCPU_CPU_MASK;
- /* TODO: We can put vcpu id into higher bits of pvti.version.
- * This will save a couple of cycles by getting rid of
- * __getcpu() calls (Gleb).
- */
-
- pvti = get_pvti(cpu);
-
- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
- /*
- * Test we're still on the cpu as well as the version.
- * We could have been migrated just after the first
- * vgetcpu but before fetching the version, so we
- * wouldn't notice a version change.
- */
- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1 ||
- (pvti->pvti.version & 1) ||
- pvti->pvti.version != version));
-
- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
- *mode = VCLOCK_NONE;
-
- /* refer to tsc.c read_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
-}
-#endif
#else
@@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
+#endif
+
#ifdef CONFIG_PARAVIRT_CLOCK
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+{
+ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+}
static notrace cycle_t vread_pvclock(int *mode)
{
- *mode = VCLOCK_NONE;
- return 0;
-}
-#endif
+ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+ cycle_t ret;
+ u64 tsc, pvti_tsc;
+ u64 last, delta, pvti_system_time;
+ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvti->version;
+
+ smp_rmb();
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
+ tsc = rdtsc_ordered();
+ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+ pvti_tsc_shift = pvti->tsc_shift;
+ pvti_system_time = pvti->system_time;
+ pvti_tsc = pvti->tsc_timestamp;
+
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
+ } while (unlikely((version & 1) || version != pvti->version));
+
+ delta = tsc - pvti_tsc;
+ ret = pvti_system_time +
+ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+ pvti_tsc_shift);
+
+ /* refer to vread_tsc() comment for rationale */
+ last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
#endif
notrace static cycle_t vread_tsc(void)
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..4158acc17df0 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
+ pvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 8627db24a7f6..491020b2826d 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,6 +73,7 @@ enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -80,6 +81,7 @@ enum {
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
};
struct vdso_sym {
@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
@@ -98,10 +101,10 @@ struct vdso_sym required_syms[] = {
"VDSO_FAKE_SECTION_TABLE_END", false
},
{"VDSO32_NOTE_MASK", true},
- {"VDSO32_SYSENTER_RETURN", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index e904c270573b..08a317a9ae4b 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup);
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-#ifdef CONFIG_X86_64
-
-#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
-#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
-
-#else /* CONFIG_X86_32 */
-
-#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
-#define vdso32_syscall() (0)
-
-#endif /* CONFIG_X86_64 */
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
-const struct vdso_image *selected_vdso32;
-#endif
-
int __init sysenter_setup(void)
{
-#ifdef CONFIG_COMPAT
- if (vdso32_syscall())
- selected_vdso32 = &vdso_image_32_syscall;
- else
-#endif
- if (vdso32_sysenter())
- selected_vdso32 = &vdso_image_32_sysenter;
- else
- selected_vdso32 = &vdso_image_32_int80;
-
- init_vdso_image(selected_vdso32);
+ init_vdso_image(&vdso_image_32);
return 0;
}
diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S
deleted file mode 100644
index b15b7c01aedb..000000000000
--- a/arch/x86/entry/vdso/vdso32/int80.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Code for the vDSO. This version uses the old int $0x80 method.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#include "sigreturn.S"
-
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- int $0x80
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
- .previous
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
- .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIEDLSI:
- .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
- .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0
- .align 4
-.LENDFDEDLSI:
- .previous
-
- /*
- * Pad out the segment to match the size of the sysenter.S version.
- */
-VDSO32_vsyscall_eh_frame_size = 0x40
- .section .data,"aw",@progbits
- .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
- .previous
diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S
deleted file mode 100644
index 6b286bb5251c..000000000000
--- a/arch/x86/entry/vdso/vdso32/syscall.S
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Code for the vDSO. This version uses the syscall instruction.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#define SYSCALL_ENTER_KERNEL syscall
-#include "sigreturn.S"
-
-#include <asm/segment.h>
-
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- push %ebp
-.Lpush_ebp:
- movl %ecx, %ebp
- syscall
- movl %ebp, %ecx
- popl %ebp
-.Lpop_ebp:
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAME:
- .long .LENDCIE-.LSTARTCIE
-.LSTARTCIE:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIE:
-
- .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
-.LSTARTFDE1:
- .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0 /* Augmentation length */
- /* What follows are the instructions for the table generation.
- We have to record all changes of the stack pointer. */
- .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .uleb128 8
- .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
- .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
- .byte 0xc5 /* DW_CFA_restore %ebp */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .uleb128 4
- .align 4
-.LENDFDE1:
- .previous
-
- /*
- * Pad out the segment to match the size of the sysenter.S version.
- */
-VDSO32_vsyscall_eh_frame_size = 0x40
- .section .data,"aw",@progbits
- .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
- .previous
diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S
deleted file mode 100644
index e354bceee0e0..000000000000
--- a/arch/x86/entry/vdso/vdso32/sysenter.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Code for the vDSO. This version uses the sysenter instruction.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#include "sigreturn.S"
-
-/*
- * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
- * %ecx itself for arg2. The pushing is because the sysexit instruction
- * (found in entry.S) requires that we clobber %ecx with the desired %esp.
- * User code might expect that %ecx is unclobbered though, as it would be
- * for returning via the iret instruction, so we must push and pop.
- *
- * The caller puts arg3 in %edx, which the sysexit instruction requires
- * for %eip. Thus, exactly as for arg2, we must push and pop.
- *
- * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
- * instruction clobbers %esp, the user's %esp won't even survive entry
- * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
- * arg6 from the stack.
- *
- * You can not use this vsyscall for the clone() syscall because the
- * three words on the parent stack do not get copied to the child.
- */
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- push %ecx
-.Lpush_ecx:
- push %edx
-.Lpush_edx:
- push %ebp
-.Lenter_kernel:
- movl %esp,%ebp
- sysenter
-
- /* 7: align return point with nop's to make disassembly easier */
- .space 7,0x90
-
- /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
- int $0x80
- /* 16: System call normal return point is here! */
-VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
- pop %ebp
-.Lpop_ebp:
- pop %edx
-.Lpop_edx:
- pop %ecx
-.Lpop_ecx:
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
- .previous
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
- .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIEDLSI:
- .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
- .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0
- /* What follows are the instructions for the table generation.
- We have to record all changes of the stack pointer. */
- .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x08 /* RA at offset 8 now */
- .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x0c /* RA at offset 12 now */
- .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x10 /* RA at offset 16 now */
- .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
- /* Finally the epilogue. */
- .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x0c /* RA at offset 12 now */
- .byte 0xc5 /* DW_CFA_restore %ebp */
- .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x08 /* RA at offset 8 now */
- .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x04 /* RA at offset 4 now */
- .align 4
-.LENDFDEDLSI:
- .previous
-
- /*
- * Emit a symbol with the size of this .eh_frame data,
- * to verify it matches the other versions.
- */
-VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 000000000000..3a1d9297074b
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,89 @@
+/*
+ * AT_SYSINFO entry point
+*/
+
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+/*
+ * First get the common code for the sigreturn entry points.
+ * This must come first.
+ */
+#include "sigreturn.S"
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that all of any of the entry instructions
+ * will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may eludicate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
+ */
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ int $0x80
+GLOBAL(int80_landing_pad)
+
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 175cc72c0f68..87a86e017f0e 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -14,11 +14,13 @@
*/
#undef CONFIG_64BIT
#undef CONFIG_X86_64
+#undef CONFIG_PGTABLE_LEVELS
#undef CONFIG_ILLEGAL_POINTER_VALUE
#undef CONFIG_SPARSEMEM_VMEMMAP
#undef CONFIG_NR_CPUS
#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
#define CONFIG_PAGE_OFFSET 0
#define CONFIG_ILLEGAL_POINTER_VALUE 0
#define CONFIG_NR_CPUS 1
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 434543145d78..b8f69e264ac4 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
@@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
.name = "[vvar]",
.pages = no_pages,
};
+ struct pvclock_vsyscall_time_info *pvti;
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
@@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
#endif
+ pvti = pvclock_pvti_cpu0_va();
+ if (pvti && image->sym_pvclock_page) {
+ ret = remap_pfn_range(vma,
+ text_start + image->sym_pvclock_page,
+ __pa(pvti) >> PAGE_SHIFT,
+ PAGE_SIZE,
+ PAGE_READONLY);
+
+ if (ret)
+ goto up_fail;
+ }
+
up_fail:
if (ret)
current->mm->context.vdso = NULL;
@@ -180,21 +194,10 @@ up_fail:
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
- int ret;
-
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
- ret = map_vdso(selected_vdso32, false);
- if (ret)
- return ret;
-
- if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
- current_thread_info()->sysenter_return =
- current->mm->context.vdso +
- selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
-
- return 0;
+ return map_vdso(&vdso_image_32, false);
}
#endif
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index b160c0c6baed..174c2549939d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -38,7 +38,14 @@
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
+#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE)
+ NATIVE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_NONE)
+ NONE;
+#else
+ EMULATE;
+#endif
static int __init vsyscall_setup(char *str)
{
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index a0a19b7ba22d..0552884da18d 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -26,7 +26,7 @@
#include <asm/ptrace.h>
#include <asm/ia32_unistd.h>
#include <asm/user32.h>
-#include <asm/sigcontext32.h>
+#include <uapi/asm/sigcontext.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/sigframe.h>
@@ -68,7 +68,7 @@
}
static int ia32_restore_sigcontext(struct pt_regs *regs,
- struct sigcontext_ia32 __user *sc)
+ struct sigcontext_32 __user *sc)
{
unsigned int tmpflags, err = 0;
void __user *buf;
@@ -170,7 +170,7 @@ badframe:
* Set up a signal frame.
*/
-static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
+static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc,
void __user *fpstate,
struct pt_regs *regs, unsigned int mask)
{
@@ -234,7 +234,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
unsigned long fx_aligned, math_size;
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
- *fpstate = (struct _fpstate_ia32 __user *) sp;
+ *fpstate = (struct _fpstate_32 __user *) sp;
if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
math_size) < 0)
return (void __user *) -1L;
@@ -289,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
/* Return stub is in 32bit vsyscall page */
if (current->mm->context.vdso)
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ vdso_image_32.sym___kernel_sigreturn;
else
restorer = &frame->retcode;
}
@@ -368,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
restorer = ksig->ka.sa.sa_restorer;
else
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_rt_sigreturn;
+ vdso_image_32.sym___kernel_rt_sigreturn;
put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
/*
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 3a45668f6dc3..94c18ebfd68c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -32,6 +32,10 @@
#include <asm/mpspec.h>
#include <asm/realmode.h>
+#ifdef CONFIG_ACPI_APEI
+# include <asm/pgtable_types.h>
+#endif
+
#ifdef CONFIG_ACPI
extern int acpi_lapic;
extern int acpi_ioapic;
@@ -147,4 +151,23 @@ extern int x86_acpi_numa_init(void);
#define acpi_unlazy_tlb(x) leave_mm(x)
+#ifdef CONFIG_ACPI_APEI
+static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
+{
+ /*
+ * We currently have no way to look up the EFI memory map
+ * attributes for a region in a consistent way, because the
+ * memmap is discarded after efi_free_boot_services(). So if
+ * you call efi_mem_attributes() during boot and at runtime,
+ * you could theoretically see different attributes.
+ *
+ * Since we are yet to see any x86 platforms that require
+ * anything other than PAGE_KERNEL (some arm64 platforms
+ * require the equivalent of PAGE_KERNEL_NOCACHE), return that
+ * until we know differently.
+ */
+ return PAGE_KERNEL;
+}
+#endif
+
#endif /* _ASM_X86_ACPI_H */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 1a5da2e63aee..3c56ef1ae068 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -81,7 +81,7 @@ static inline struct amd_northbridge *node_to_amd_nb(int node)
return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
-static inline u16 amd_get_node_id(struct pci_dev *pdev)
+static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
{
struct pci_dev *misc;
int i;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index ebf6d5e5668c..c80f6b6f3da2 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -23,6 +23,11 @@
#define APIC_VERBOSE 1
#define APIC_DEBUG 2
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
/*
* Define the default level of output to be very little
* This can be turned up by using apic=verbose for more
@@ -115,6 +120,59 @@ static inline bool apic_is_x2apic_enabled(void)
return msr & X2APIC_ENABLE;
}
+extern void enable_IR_x2apic(void);
+
+extern int get_physical_broadcast(void);
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void setup_local_APIC(void);
+extern void init_apic_mappings(void);
+void register_lapic_address(unsigned long address);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern int APIC_init_uniprocessor(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+ return -1;
+}
+#else
+extern int apic_force_enable(unsigned long addr);
+#endif
+
+extern int apic_bsp_setup(bool upmode);
+extern void apic_ap_setup(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+ return 0;
+}
+#endif
+
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok 1
+static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
+# define setup_boot_APIC_clock x86_init_noop
+# define setup_secondary_APIC_clock x86_init_noop
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
#ifdef CONFIG_X86_X2APIC
/*
* Make previous memory operations globally visible before
@@ -186,67 +244,14 @@ static inline int x2apic_enabled(void)
}
#define x2apic_supported() (cpu_has_x2apic)
-#else
+#else /* !CONFIG_X86_X2APIC */
static inline void check_x2apic(void) { }
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }
#define x2apic_mode (0)
#define x2apic_supported() (0)
-#endif
-
-extern void enable_IR_x2apic(void);
-
-extern int get_physical_broadcast(void);
-
-extern int lapic_get_maxlvt(void);
-extern void clear_local_APIC(void);
-extern void disconnect_bsp_APIC(int virt_wire_setup);
-extern void disable_local_APIC(void);
-extern void lapic_shutdown(void);
-extern void sync_Arb_IDs(void);
-extern void init_bsp_APIC(void);
-extern void setup_local_APIC(void);
-extern void init_apic_mappings(void);
-void register_lapic_address(unsigned long address);
-extern void setup_boot_APIC_clock(void);
-extern void setup_secondary_APIC_clock(void);
-extern int APIC_init_uniprocessor(void);
-
-#ifdef CONFIG_X86_64
-static inline int apic_force_enable(unsigned long addr)
-{
- return -1;
-}
-#else
-extern int apic_force_enable(unsigned long addr);
-#endif
-
-extern int apic_bsp_setup(bool upmode);
-extern void apic_ap_setup(void);
-
-/*
- * On 32bit this is mach-xxx local
- */
-#ifdef CONFIG_X86_64
-extern int apic_is_clustered_box(void);
-#else
-static inline int apic_is_clustered_box(void)
-{
- return 0;
-}
-#endif
-
-extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
-
-#else /* !CONFIG_X86_LOCAL_APIC */
-static inline void lapic_shutdown(void) { }
-#define local_apic_timer_c2_ok 1
-static inline void init_apic_mappings(void) { }
-static inline void disable_local_APIC(void) { }
-# define setup_boot_APIC_clock x86_init_noop
-# define setup_secondary_APIC_clock x86_init_noop
-#endif /* !CONFIG_X86_LOCAL_APIC */
+#endif /* !CONFIG_X86_X2APIC */
#ifdef CONFIG_X86_64
#define SET_APIC_ID(x) (apic->set_apic_id(x))
@@ -303,6 +308,7 @@ struct apic {
unsigned int *apicid);
/* ipi */
+ void (*send_IPI)(int cpu, int vector);
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
int vector);
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index fb52aa644aab..3e8674288198 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
@@ -24,7 +23,7 @@
*/
static __always_inline int atomic_read(const atomic_t *v)
{
- return ACCESS_ONCE((v)->counter);
+ return READ_ONCE((v)->counter);
}
/**
@@ -36,7 +35,7 @@ static __always_inline int atomic_read(const atomic_t *v)
*/
static __always_inline void atomic_set(atomic_t *v, int i)
{
- v->counter = i;
+ WRITE_ONCE(v->counter, i);
}
/**
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index a11c30b77fb5..a984111135b1 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -3,7 +3,6 @@
#include <linux/compiler.h>
#include <linux/types.h>
-#include <asm/processor.h>
//#include <asm/cmpxchg.h>
/* An 64bit atomic type */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 50e33eff58de..037351022f54 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
*/
static inline long atomic64_read(const atomic64_t *v)
{
- return ACCESS_ONCE((v)->counter);
+ return READ_ONCE((v)->counter);
}
/**
@@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v)
*/
static inline void atomic64_set(atomic64_t *v, long i)
{
- v->counter = i;
+ WRITE_ONCE(v->counter, i);
}
/**
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 0681d2532527..a584e1c50918 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -31,20 +31,10 @@
#endif
#define dma_wmb() barrier()
-#ifdef CONFIG_SMP
-#define smp_mb() mb()
-#define smp_rmb() dma_rmb()
-#define smp_wmb() barrier()
-#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else /* !SMP */
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
-#endif /* SMP */
-
-#define read_barrier_depends() do { } while (0)
-#define smp_read_barrier_depends() do { } while (0)
+#define __smp_mb() mb()
+#define __smp_rmb() dma_rmb()
+#define __smp_wmb() barrier()
+#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
#if defined(CONFIG_X86_PPRO_FENCE)
@@ -53,31 +43,31 @@
* model and we should fall back to full barriers.
*/
-#define smp_store_release(p, v) \
+#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
- smp_mb(); \
+ __smp_mb(); \
WRITE_ONCE(*p, v); \
} while (0)
-#define smp_load_acquire(p) \
+#define __smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
- smp_mb(); \
+ __smp_mb(); \
___p1; \
})
#else /* regular x86 TSO memory ordering */
-#define smp_store_release(p, v) \
+#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
barrier(); \
WRITE_ONCE(*p, v); \
} while (0)
-#define smp_load_acquire(p) \
+#define __smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
@@ -88,7 +78,9 @@ do { \
#endif
/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic() barrier()
-#define smp_mb__after_atomic() barrier()
+#define __smp_mb__before_atomic() barrier()
+#define __smp_mb__after_atomic() barrier()
+
+#include <asm-generic/barrier.h>
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
-#define BOOT_HEAP_SIZE 0x8000
+#define BOOT_HEAP_SIZE 0x10000
#endif /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0d467b338835..a8303ebe089f 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -31,7 +31,7 @@
#include <asm/types.h>
struct iommu_table {
- struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
+ const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
unsigned long it_base; /* mapped address of tce table */
unsigned long it_hint; /* Hint for next alloc */
unsigned long *it_map; /* A simple allocation bitmap for now */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index f7e142926481..e4959d023af8 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -109,6 +109,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
#endif
-#define system_has_cmpxchg_double() cpu_has_cx8
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 1af94697aae5..caa23a34c963 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -18,6 +18,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
cmpxchg_local((ptr), (o), (n)); \
})
-#define system_has_cmpxchg_double() cpu_has_cx16
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bf2caa1dedc5..678637ad7476 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -36,4 +36,7 @@ extern int _debug_hotplug_cpu(int cpu, int action);
int mwait_usable(const struct cpuinfo_x86 *);
+unsigned int x86_family(unsigned int sig);
+unsigned int x86_model(unsigned int sig);
+unsigned int x86_stepping(unsigned int sig);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9727b3b48bd1..7ad8c9464297 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 13 /* N 32-bit words worth of info */
+#define NCAPINTS 16 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -181,22 +181,17 @@
/*
* Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc, word 7
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
+
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
+
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */
-#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
-#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
-#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
-#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Virtualization flags: Linux defined, word 8 */
@@ -205,17 +200,9 @@
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
+
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
@@ -255,6 +242,33 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
+#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
+
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+
/*
* BUG word(s)
*/
@@ -275,6 +289,26 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+enum cpuid_leafs
+{
+ CPUID_1_EDX = 0,
+ CPUID_8000_0001_EDX,
+ CPUID_8086_0001_EDX,
+ CPUID_LNX_1,
+ CPUID_1_ECX,
+ CPUID_C000_0001_EDX,
+ CPUID_8000_0001_ECX,
+ CPUID_LNX_2,
+ CPUID_LNX_3,
+ CPUID_7_0_EBX,
+ CPUID_D_1_EAX,
+ CPUID_F_0_EDX,
+ CPUID_F_1_EDX,
+ CPUID_8000_0008_EBX,
+ CPUID_6_EAX,
+ CPUID_8000_000A_EDX,
+};
+
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
@@ -352,60 +386,31 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
} while (0)
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
-#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
-#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
-#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
-#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
-#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
-#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
-#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
-#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
-#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
-#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
-#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
-#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
-#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
-#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
-#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
-#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
-#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
-#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH)
-#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
-#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
-#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
-#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT)
#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
-#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
-#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
-#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB)
-#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2)
-#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
-#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
-#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
-#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT)
-#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT)
-
-#if __GNUC__ >= 4
+/*
+ * Do not add any more of those clumsy macros - use static_cpu_has_safe() for
+ * fast paths and boot_cpu_has() otherwise!
+ */
+
+#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS)
extern void warn_pre_alternatives(void);
extern bool __static_cpu_has_safe(u16 bit);
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 03dd72957d2f..684ed6c3aa67 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -10,6 +10,16 @@ struct dev_archdata {
#endif
};
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+struct dma_domain {
+ struct list_head node;
+ struct dma_map_ops *dma_ops;
+ int domain_nr;
+};
+void add_dma_domain(struct dma_domain *domain);
+void del_dma_domain(struct dma_domain *domain);
+#endif
+
struct pdev_archdata {
};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 953b7263f844..3a27b93e6261 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
#define HAVE_ARCH_DMA_SUPPORTED 1
extern int dma_supported(struct device *hwdev, u64 mask);
-#include <asm-generic/dma-mapping-common.h>
-
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
struct dma_attrs *attrs);
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 000000000000..b7a1ab865d68
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,84 @@
+#ifndef _ASM_X86_DWARF2_H
+#define _ASM_X86_DWARF2_H
+
+#ifndef __ASSEMBLY__
+#warning "asm/dwarf2.h should be only included in pure assembly files"
+#endif
+
+/*
+ * Macros for dwarf2 CFI unwind table entries.
+ * See "as.info" for details on these pseudo ops. Unfortunately
+ * they are only supported in very new binutils, so define them
+ * away for older version.
+ */
+
+#ifdef CONFIG_AS_CFI
+
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+#define CFI_UNDEFINED .cfi_undefined
+#define CFI_ESCAPE .cfi_escape
+
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
+
+#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+#ifndef BUILD_VDSO
+ /*
+ * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+ * The latter we currently just discard since we don't do DWARF
+ * unwinding at runtime. So only the offline DWARF information is
+ * useful to anyone. Note we should not use this directive if
+ * vmlinux.lds.S gets changed so it doesn't discard .eh_frame.
+ */
+ .cfi_sections .debug_frame
+#else
+ /*
+ * For the vDSO, emit both runtime unwind information and debug
+ * symbols for the .dbg file.
+ */
+ .cfi_sections .eh_frame, .debug_frame
+#endif
+#endif
+
+#else
+
+/*
+ * Due to the structure of pre-exisiting code, don't use assembler line
+ * comment character # to ignore the arguments. Instead, use a dummy macro.
+ */
+.macro cfi_ignore a=0, b=0, c=0, d=0
+.endm
+
+#define CFI_STARTPROC cfi_ignore
+#define CFI_ENDPROC cfi_ignore
+#define CFI_DEF_CFA cfi_ignore
+#define CFI_DEF_CFA_REGISTER cfi_ignore
+#define CFI_DEF_CFA_OFFSET cfi_ignore
+#define CFI_ADJUST_CFA_OFFSET cfi_ignore
+#define CFI_OFFSET cfi_ignore
+#define CFI_REL_OFFSET cfi_ignore
+#define CFI_REGISTER cfi_ignore
+#define CFI_RESTORE cfi_ignore
+#define CFI_REMEMBER_STATE cfi_ignore
+#define CFI_RESTORE_STATE cfi_ignore
+#define CFI_UNDEFINED cfi_ignore
+#define CFI_ESCAPE cfi_ignore
+#define CFI_SIGNAL_FRAME cfi_ignore
+
+#endif
+
+#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ae68be92f755..0010c78c4998 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -105,6 +105,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int __init efi_memblock_x86_reserve_range(void);
extern pgd_t * __init efi_call_phys_prolog(void);
extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init efi_print_memmap(void);
extern void __init efi_unmap_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 141c561f4664..1514753fd435 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,11 +171,11 @@ do { \
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
- /* Commented-out registers are cleared in stub_execve */
- /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
- regs->si = regs->di /*= regs->bp*/ = 0;
+ /* ax gets execve's return value. */
+ /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0;
+ regs->si = regs->di = regs->bp = 0;
regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
- /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
+ regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
t->fs = t->gs = 0;
t->fsindex = t->gsindex = 0;
t->ds = t->es = ds;
@@ -328,7 +328,7 @@ else \
#define VDSO_ENTRY \
((unsigned long)current->mm->context.vdso + \
- selected_vdso32->sym___kernel_vsyscall)
+ vdso_image_32.sym___kernel_vsyscall)
struct linux_binprm;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index f80d70009ff8..6d7d0e52ed5a 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
-#include <asm/pvclock.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -72,10 +71,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_VSYSCALL_EMULATION
VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
#endif
-#ifdef CONFIG_PARAVIRT_CLOCK
- PVCLOCK_FIXMAP_BEGIN,
- PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
-#endif
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 3c3550c3a4a3..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
@@ -224,18 +225,67 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-/* xstate instruction fault handler: */
-#define xstate_fault(__err) \
- \
- ".section .fixup,\"ax\"\n" \
- \
- "3: movl $-2,%[_err]\n" \
- " jmp 2b\n" \
- \
- ".previous\n" \
- \
- _ASM_EXTABLE(1b, 3b) \
- : [_err] "=r" (__err)
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ ".pushsection .fixup,\"ax\"\n\t" \
+ "3: movl $-2,%[err]\n\t" \
+ "jmp 2b\n\t" \
+ ".popsection\n\t" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
+ * format and supervisor states in addition to modified optimization in
+ * XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
+ * address of the instruction where we might get an exception at.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_2(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
/*
* This function is called only during boot time when x86 caps are not set
@@ -246,22 +296,14 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(system_state != SYSTEM_BOOTING);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- asm volatile("1:"XSAVES"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+ XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
else
- asm volatile("1:"XSAVE"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
@@ -276,22 +318,14 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(system_state != SYSTEM_BOOTING);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- asm volatile("1:"XRSTORS"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ if (static_cpu_has_safe(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
- asm volatile("1:"XRSTOR"\n\t"
- "2:\n\t"
- xstate_fault(err)
- : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory");
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
/* We should never fault when copying from a kernel buffer: */
WARN_ON_FPU(err);
@@ -305,33 +339,11 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
u64 mask = -1;
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
WARN_ON(!alternatives_patched);
- /*
- * If xsaves is enabled, xsaves replaces xsaveopt because
- * it supports compact format and supervisor states in addition to
- * modified optimization in xsaveopt.
- *
- * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
- * because xsaveopt supports modified optimization which is not
- * supported by xsave.
- *
- * If none of xsaves and xsaveopt is enabled, use xsave.
- */
- alternative_input_2(
- "1:"XSAVE,
- XSAVEOPT,
- X86_FEATURE_XSAVEOPT,
- XSAVES,
- X86_FEATURE_XSAVES,
- [xstate] "D" (xstate), "a" (lmask), "d" (hmask) :
- "memory");
- asm volatile("2:\n\t"
- xstate_fault(err)
- : "0" (err)
- : "memory");
+ XSTATE_XSAVE(xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
@@ -344,23 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
+ int err;
- /*
- * Use xrstors to restore context if it is enabled. xrstors supports
- * compacted format of xsave area which is not supported by xrstor.
- */
- alternative_input(
- "1: " XRSTOR,
- XRSTORS,
- X86_FEATURE_XSAVES,
- "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask)
- : "memory");
-
- asm volatile("2:\n"
- xstate_fault(err)
- : "0" (err)
- : "memory");
+ XSTATE_XRESTORE(xstate, lmask, hmask, err);
/* We should never fault when copying from a kernel buffer: */
WARN_ON_FPU(err);
@@ -388,12 +386,10 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf)
if (unlikely(err))
return -EFAULT;
- __asm__ __volatile__(ASM_STAC "\n"
- "1:"XSAVE"\n"
- "2: " ASM_CLAC "\n"
- xstate_fault(err)
- : "D" (buf), "a" (-1), "d" (-1), "0" (err)
- : "memory");
+ stac();
+ XSTATE_OP(XSAVE, buf, -1, -1, err);
+ clac();
+
return err;
}
@@ -405,14 +401,12 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
struct xregs_state *xstate = ((__force struct xregs_state *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err = 0;
-
- __asm__ __volatile__(ASM_STAC "\n"
- "1:"XRSTOR"\n"
- "2: " ASM_CLAC "\n"
- xstate_fault(err)
- : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err)
- : "memory"); /* memory required? */
+ int err;
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
return err;
}
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 7358e9d61f1e..0e970d00dfcd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -5,7 +5,7 @@
#define _ASM_X86_FPU_SIGNAL_H
#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
+# include <uapi/asm/sigcontext.h>
# include <asm/user32.h>
struct ksignal;
int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index c49c5173158e..1c6f6ac52ad0 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -95,63 +95,122 @@ struct swregs_state {
/*
* List of XSAVE features Linux knows about:
*/
-enum xfeature_bit {
- XSTATE_BIT_FP,
- XSTATE_BIT_SSE,
- XSTATE_BIT_YMM,
- XSTATE_BIT_BNDREGS,
- XSTATE_BIT_BNDCSR,
- XSTATE_BIT_OPMASK,
- XSTATE_BIT_ZMM_Hi256,
- XSTATE_BIT_Hi16_ZMM,
-
- XFEATURES_NR_MAX,
+enum xfeature {
+ XFEATURE_FP,
+ XFEATURE_SSE,
+ /*
+ * Values above here are "legacy states".
+ * Those below are "extended states".
+ */
+ XFEATURE_YMM,
+ XFEATURE_BNDREGS,
+ XFEATURE_BNDCSR,
+ XFEATURE_OPMASK,
+ XFEATURE_ZMM_Hi256,
+ XFEATURE_Hi16_ZMM,
+
+ XFEATURE_MAX,
};
-#define XSTATE_FP (1 << XSTATE_BIT_FP)
-#define XSTATE_SSE (1 << XSTATE_BIT_SSE)
-#define XSTATE_YMM (1 << XSTATE_BIT_YMM)
-#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS)
-#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR)
-#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK)
-#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256)
-#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM)
+#define XFEATURE_MASK_FP (1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
+
+#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
+ | XFEATURE_MASK_ZMM_Hi256 \
+ | XFEATURE_MASK_Hi16_ZMM)
+
+#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM
-#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+struct reg_128_bit {
+ u8 regbytes[128/8];
+};
+struct reg_256_bit {
+ u8 regbytes[256/8];
+};
+struct reg_512_bit {
+ u8 regbytes[512/8];
+};
/*
+ * State component 2:
+ *
* There are 16x 256-bit AVX registers named YMM0-YMM15.
* The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
- * and are stored in 'struct fxregs_state::xmm_space[]'.
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
*
- * The high 128 bits are stored here:
- * 16x 128 bits == 256 bytes.
+ * The high 128 bits are stored here.
*/
struct ymmh_struct {
- u8 ymmh_space[256];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
- u8 reserved[128];
-};
+ struct reg_128_bit hi_ymm[16];
+} __packed;
/* Intel MPX support: */
-struct bndreg {
+
+struct mpx_bndreg {
u64 lower_bound;
u64 upper_bound;
} __packed;
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+ struct mpx_bndreg bndreg[4];
+} __packed;
-struct bndcsr {
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS. We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
u64 bndcfgu;
u64 bndstatus;
} __packed;
-struct mpx_struct {
- struct bndreg bndreg[4];
- struct bndcsr bndcsr;
-};
+/*
+ * The BNDCSR state is padded out to be 64-bytes in size.
+ */
+struct mpx_bndcsr_state {
+ union {
+ struct mpx_bndcsr bndcsr;
+ u8 pad_to_64_bytes[64];
+ };
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+ u64 opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+ struct reg_256_bit zmm_upper[16];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+ struct reg_512_bit hi16_zmm[16];
+} __packed;
struct xstate_header {
u64 xfeatures;
@@ -159,22 +218,19 @@ struct xstate_header {
u64 reserved[6];
} __attribute__((packed));
-/* New processor state extensions should be added here: */
-#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \
- sizeof(struct lwp_struct) + \
- sizeof(struct mpx_struct) )
/*
* This is our most modern FPU state format, as saved by the XSAVE
* and restored by the XRSTOR instructions.
*
* It consists of a legacy fxregs portion, an xstate header and
- * subsequent fixed size areas as defined by the xstate header.
- * Not all CPUs support all the extensions.
+ * subsequent areas as defined by the xstate header. Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
*/
struct xregs_state {
struct fxregs_state i387;
struct xstate_header header;
- u8 __reserved[XSTATE_RESERVE];
+ u8 extended_state_area[0];
} __attribute__ ((packed, aligned (64)));
/*
@@ -182,7 +238,9 @@ struct xregs_state {
* put together, so that we can pick the right one runtime.
*
* The size of the structure is determined by the largest
- * member - which is the xsave area:
+ * member - which is the xsave area. The padding is there
+ * to ensure that statically-allocated task_structs (just
+ * the init_task today) have enough space.
*/
union fpregs_state {
struct fregs_state fsave;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 4656b25bb9a7..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -6,7 +6,7 @@
#include <linux/uaccess.h>
/* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
+#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
#define XSTATE_CPUID 0x0000000d
@@ -19,14 +19,19 @@
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
/* Supported features which support lazy state saving */
-#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
- | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE)
/* Supported features which require eager state saving */
-#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR)
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM)
/* All currently supported features */
-#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER)
+#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
@@ -40,6 +45,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+void fpu__xstate_clear_all_cpu_caps(void);
void *get_xsave_addr(struct xregs_state *xsave, int xstate);
const void *get_xsave_field_ptr(int xstate_field);
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 04e9d023168f..1c0b43724ce3 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page);
void __kunmap_atomic(void *kvaddr);
void *kmap_atomic_pfn(unsigned long pfn);
void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
-struct page *kmap_atomic_to_page(void *ptr);
#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 5fa9fb0f8809..cc285ec4b2c1 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -63,10 +63,10 @@
/* hpet memory map physical address */
extern unsigned long hpet_address;
extern unsigned long force_hpet_address;
-extern int boot_hpet_disable;
+extern bool boot_hpet_disable;
extern u8 hpet_blockid;
-extern int hpet_force_user;
-extern u8 hpet_msi_disable;
+extern bool hpet_force_user;
+extern bool hpet_msi_disable;
extern int is_hpet_enabled(void);
extern int hpet_enable(void);
extern void hpet_disable(void);
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1e3408e88604..1815b736269d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -130,6 +130,11 @@ struct irq_alloc_info {
char *uv_name;
};
#endif
+#if IS_ENABLED(CONFIG_VMD)
+ struct {
+ struct msi_desc *desc;
+ };
+#endif
};
};
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index ccffa53750a8..39bcefc20de7 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,6 +60,7 @@ struct legacy_pic {
void (*mask_all)(void);
void (*restore_mask)(void);
void (*init)(int auto_eoi);
+ int (*probe)(void);
int (*irq_pending)(unsigned int irq);
void (*make_irq)(unsigned int irq);
};
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 28019765442e..a9bdf5569ab3 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -10,7 +10,7 @@
* 32 bit structures for IA32 support.
*/
-#include <asm/sigcontext32.h>
+#include <uapi/asm/sigcontext.h>
/* signal.h */
@@ -18,7 +18,7 @@ struct ucontext_ia32 {
unsigned int uc_flags;
unsigned int uc_link;
compat_stack_t uc_stack;
- struct sigcontext_ia32 uc_mcontext;
+ struct sigcontext_32 uc_mcontext;
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
new file mode 100644
index 000000000000..e1a411786bf5
--- /dev/null
+++ b/arch/x86/include/asm/intel_pt.h
@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+#endif
+
+#endif /* _ASM_X86_INTEL_PT_H */
diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h
new file mode 100644
index 000000000000..201eb9dce595
--- /dev/null
+++ b/arch/x86/include/asm/intel_punit_ipc.h
@@ -0,0 +1,101 @@
+#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_
+#define _ASM_X86_INTEL_PUNIT_IPC_H_
+
+/*
+ * Three types of 8bit P-Unit IPC commands are supported,
+ * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD.
+ */
+typedef enum {
+ BIOS_IPC = 0,
+ GTDRIVER_IPC,
+ ISPDRIVER_IPC,
+ RESERVED_IPC,
+} IPC_TYPE;
+
+#define IPC_TYPE_OFFSET 6
+#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_GTD_CMD_BASE (GTDDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET)
+
+/* BIOS => Pcode commands */
+#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00)
+#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01)
+#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02)
+#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03)
+#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04)
+#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05)
+#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06)
+#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07)
+#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08)
+#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11)
+#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12)
+#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13)
+#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14)
+#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15)
+#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16)
+#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17)
+#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18)
+#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19)
+#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a)
+#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b)
+
+/* GT Driver => Pcode commands */
+#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00)
+#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01)
+#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02)
+#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03)
+#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06)
+#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07)
+#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16)
+#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c)
+
+/* ISP Driver => Pcode commands */
+#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00)
+#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01)
+#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02)
+#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03)
+#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04)
+#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05)
+
+/* Error codes */
+#define IPC_PUNIT_ERR_SUCCESS 0
+#define IPC_PUNIT_ERR_INVALID_CMD 1
+#define IPC_PUNIT_ERR_INVALID_PARAMETER 2
+#define IPC_PUNIT_ERR_CMD_TIMEOUT 3
+#define IPC_PUNIT_ERR_CMD_LOCKED 4
+#define IPC_PUNIT_ERR_INVALID_VR_ID 5
+#define IPC_PUNIT_ERR_VR_ERR 6
+
+#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC)
+
+int intel_punit_ipc_simple_command(int cmd, int para1, int para2);
+int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out);
+
+#else
+
+static inline int intel_punit_ipc_simple_command(int cmd,
+ int para1, int para2)
+{
+ return -ENODEV;
+}
+
+static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2,
+ u32 *in, u32 *out)
+{
+ return -ENODEV;
+}
+
+#endif /* CONFIG_INTEL_PUNIT_IPC */
+
+#endif
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
new file mode 100644
index 000000000000..ed65fe701de5
--- /dev/null
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -0,0 +1,147 @@
+/*
+ * Intel SOC Telemetry Driver Header File
+ * Copyright (C) 2015, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#ifndef INTEL_TELEMETRY_H
+#define INTEL_TELEMETRY_H
+
+#define TELEM_MAX_EVENTS_SRAM 28
+#define TELEM_MAX_OS_ALLOCATED_EVENTS 20
+
+enum telemetry_unit {
+ TELEM_PSS = 0,
+ TELEM_IOSS,
+ TELEM_UNIT_NONE
+};
+
+struct telemetry_evtlog {
+ u32 telem_evtid;
+ u64 telem_evtlog;
+};
+
+struct telemetry_evtconfig {
+ /* Array of Event-IDs to Enable */
+ u32 *evtmap;
+
+ /* Number of Events (<29) in evtmap */
+ u8 num_evts;
+
+ /* Sampling period */
+ u8 period;
+};
+
+struct telemetry_evtmap {
+ const char *name;
+ u32 evt_id;
+};
+
+struct telemetry_unit_config {
+ struct telemetry_evtmap *telem_evts;
+ void __iomem *regmap;
+ u32 ssram_base_addr;
+ u8 ssram_evts_used;
+ u8 curr_period;
+ u8 max_period;
+ u8 min_period;
+ u32 ssram_size;
+
+};
+
+struct telemetry_plt_config {
+ struct telemetry_unit_config pss_config;
+ struct telemetry_unit_config ioss_config;
+ struct mutex telem_trace_lock;
+ struct mutex telem_lock;
+ bool telem_in_use;
+};
+
+struct telemetry_core_ops {
+ int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+ int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig,
+ struct telemetry_evtconfig *ioss_evtconfig,
+ int pss_len, int ioss_len);
+
+ int (*update_events)(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+ int (*set_sampling_period)(u8 pss_period, u8 ioss_period);
+
+ int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+ int (*set_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+ int (*raw_read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+ int (*reset_events)(void);
+};
+
+int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+ struct telemetry_plt_config *pltconfig);
+
+int telemetry_clear_pltdata(void);
+
+int telemetry_pltconfig_valid(void);
+
+int telemetry_get_evtname(enum telemetry_unit telem_unit,
+ const char **name, int len);
+
+int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+int telemetry_reset_events(void);
+
+int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config,
+ struct telemetry_evtconfig *ioss_config,
+ int pss_len, int ioss_len);
+
+int telemetry_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period);
+
+int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+#endif /* INTEL_TELEMETRY_H */
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index b72ad0faa6c5..b41ee164930a 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -1,5 +1,5 @@
/*
- * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
+ * Intel OnChip System Fabric MailBox access support
*/
#ifndef IOSF_MBI_SYMS_H
@@ -16,6 +16,18 @@
#define MBI_MASK_LO 0x000000FF
#define MBI_ENABLE 0xF0
+/* IOSF SB read/write opcodes */
+#define MBI_MMIO_READ 0x00
+#define MBI_MMIO_WRITE 0x01
+#define MBI_CFG_READ 0x04
+#define MBI_CFG_WRITE 0x05
+#define MBI_CR_READ 0x06
+#define MBI_CR_WRITE 0x07
+#define MBI_REG_READ 0x10
+#define MBI_REG_WRITE 0x11
+#define MBI_ESRAM_READ 0x12
+#define MBI_ESRAM_WRITE 0x13
+
/* Baytrail available units */
#define BT_MBI_UNIT_AUNIT 0x00
#define BT_MBI_UNIT_SMC 0x01
@@ -28,50 +40,13 @@
#define BT_MBI_UNIT_SATA 0xA3
#define BT_MBI_UNIT_PCIE 0xA6
-/* Baytrail read/write opcodes */
-#define BT_MBI_AUNIT_READ 0x10
-#define BT_MBI_AUNIT_WRITE 0x11
-#define BT_MBI_SMC_READ 0x10
-#define BT_MBI_SMC_WRITE 0x11
-#define BT_MBI_CPU_READ 0x10
-#define BT_MBI_CPU_WRITE 0x11
-#define BT_MBI_BUNIT_READ 0x10
-#define BT_MBI_BUNIT_WRITE 0x11
-#define BT_MBI_PMC_READ 0x06
-#define BT_MBI_PMC_WRITE 0x07
-#define BT_MBI_GFX_READ 0x00
-#define BT_MBI_GFX_WRITE 0x01
-#define BT_MBI_SMIO_READ 0x06
-#define BT_MBI_SMIO_WRITE 0x07
-#define BT_MBI_USB_READ 0x06
-#define BT_MBI_USB_WRITE 0x07
-#define BT_MBI_SATA_READ 0x00
-#define BT_MBI_SATA_WRITE 0x01
-#define BT_MBI_PCIE_READ 0x00
-#define BT_MBI_PCIE_WRITE 0x01
-
/* Quark available units */
#define QRK_MBI_UNIT_HBA 0x00
#define QRK_MBI_UNIT_HB 0x03
#define QRK_MBI_UNIT_RMU 0x04
#define QRK_MBI_UNIT_MM 0x05
-#define QRK_MBI_UNIT_MMESRAM 0x05
#define QRK_MBI_UNIT_SOC 0x31
-/* Quark read/write opcodes */
-#define QRK_MBI_HBA_READ 0x10
-#define QRK_MBI_HBA_WRITE 0x11
-#define QRK_MBI_HB_READ 0x10
-#define QRK_MBI_HB_WRITE 0x11
-#define QRK_MBI_RMU_READ 0x10
-#define QRK_MBI_RMU_WRITE 0x11
-#define QRK_MBI_MM_READ 0x10
-#define QRK_MBI_MM_WRITE 0x11
-#define QRK_MBI_MMESRAM_READ 0x12
-#define QRK_MBI_MMESRAM_WRITE 0x13
-#define QRK_MBI_SOC_READ 0x06
-#define QRK_MBI_SOC_WRITE 0x07
-
#if IS_ENABLED(CONFIG_IOSF_MBI)
bool iosf_mbi_available(void);
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 615fa9061b57..cfc9a0d2d07c 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -119,6 +119,8 @@ static inline void
native_apic_mem_write(APIC_ICR, cfg);
}
+extern void default_send_IPI_single(int cpu, int vector);
+extern void default_send_IPI_single_phys(int cpu, int vector);
extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
int vector);
extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 881b4768644a..e7de5c9a4fbd 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -23,11 +23,13 @@ extern void irq_ctx_init(int cpu);
#define __ARCH_HAS_DO_SOFTIRQ
+struct irq_desc;
+
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
extern int check_irq_vectors_for_cpu_disable(void);
extern void fixup_irqs(void);
-extern void irq_force_complete_move(int);
+extern void irq_force_complete_move(struct irq_desc *desc);
#endif
#ifdef CONFIG_HAVE_KVM
@@ -37,7 +39,6 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
-struct irq_desc;
extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
extern __visible unsigned int do_IRQ(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 046c7fb1ca43..a210eba2727c 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
IRQ_POSTING_CAP = 0,
};
+struct vcpu_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
#ifdef CONFIG_IRQ_REMAP
extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
-struct vcpu_data {
- u64 pi_desc_addr; /* Physical address of PI Descriptor */
- u32 vector; /* Guest vector of the interrupt */
-};
-
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 5daeca3d0f9e..adc54c12cbd1 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,12 +1,18 @@
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H
-#ifndef __ASSEMBLY__
-
-#include <linux/stringify.h>
-#include <linux/types.h>
-#include <asm/nops.h>
-#include <asm/asm.h>
+#ifndef HAVE_JUMP_LABEL
+/*
+ * For better or for worse, if jump labels (the gcc extension) are missing,
+ * then the entire static branch patching infrastructure is compiled out.
+ * If that happens, the code in here will malfunction. Raise a compiler
+ * error instead.
+ *
+ * In theory, jump labels and the static branch patching infrastructure
+ * could be decoupled to fix this.
+ */
+#error asm/jump_label.h included on a non-jump-label kernel
+#endif
#define JUMP_LABEL_NOP_SIZE 5
@@ -16,6 +22,14 @@
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
@@ -59,5 +73,40 @@ struct jump_entry {
jump_label_t key;
};
-#endif /* __ASSEMBLY__ */
+#else /* __ASSEMBLY__ */
+
+.macro STATIC_JUMP_IF_TRUE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .else
+ .byte STATIC_KEY_INIT_NOP
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key
+ .popsection
+.endm
+
+.macro STATIC_JUMP_IF_FALSE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ .byte STATIC_KEY_INIT_NOP
+ .else
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key + 1
+ .popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
#endif
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index b130d59406fb..e5f5dc9787d5 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,11 +29,5 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC_CORE
-extern int in_crash_kexec;
-#else
-/* no crash dump is ever in progress if no crash kernel can be kexec'd */
-#define in_crash_kexec 0
-#endif
#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e16466ec473c..e9cd7befcb76 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -112,6 +112,16 @@ struct x86_emulate_ops {
struct x86_exception *fault);
/*
+ * read_phys: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Physical address from which to read.
+ * @val: [OUT] Value read from memory.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ void *val, unsigned int bytes);
+
+ /*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.
* @addr: [IN ] Linear address to which to write.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3a36ee704c30..44adbb819041 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -24,6 +24,8 @@
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
+#include <linux/irqbypass.h>
+#include <linux/hyperv.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -44,6 +46,31 @@
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
+/* x86-specific vcpu->requests bit members */
+#define KVM_REQ_MIGRATE_TIMER 8
+#define KVM_REQ_REPORT_TPR_ACCESS 9
+#define KVM_REQ_TRIPLE_FAULT 10
+#define KVM_REQ_MMU_SYNC 11
+#define KVM_REQ_CLOCK_UPDATE 12
+#define KVM_REQ_DEACTIVATE_FPU 13
+#define KVM_REQ_EVENT 14
+#define KVM_REQ_APF_HALT 15
+#define KVM_REQ_STEAL_UPDATE 16
+#define KVM_REQ_NMI 17
+#define KVM_REQ_PMU 18
+#define KVM_REQ_PMI 19
+#define KVM_REQ_SMI 20
+#define KVM_REQ_MASTERCLOCK_UPDATE 21
+#define KVM_REQ_MCLOCK_INPROGRESS 22
+#define KVM_REQ_SCAN_IOAPIC 23
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
+#define KVM_REQ_APIC_PAGE_RELOAD 25
+#define KVM_REQ_HV_CRASH 26
+#define KVM_REQ_IOAPIC_EOI_EXIT 27
+#define KVM_REQ_HV_RESET 28
+#define KVM_REQ_HV_EXIT 29
+#define KVM_REQ_HV_STIMER 30
+
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -176,6 +203,8 @@ enum {
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irq_routing_entry;
+
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
@@ -210,6 +239,10 @@ union kvm_mmu_page_role {
};
};
+struct kvm_rmap_head {
+ unsigned long val;
+};
+
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
@@ -227,7 +260,7 @@ struct kvm_mmu_page {
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
- unsigned long parent_ptes; /* Reverse mapping for parent_pte */
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
/* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
unsigned long mmu_valid_gen;
@@ -371,9 +404,38 @@ struct kvm_mtrr {
struct list_head head;
};
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+ struct hrtimer timer;
+ int index;
+ u64 config;
+ u64 count;
+ u64 exp_time;
+ struct hv_message msg;
+ bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC)*/
+struct kvm_vcpu_hv_synic {
+ u64 version;
+ u64 control;
+ u64 msg_page;
+ u64 evt_page;
+ atomic64_t sint[HV_SYNIC_SINT_COUNT];
+ atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+ DECLARE_BITMAP(auto_eoi_bitmap, 256);
+ DECLARE_BITMAP(vec_bitmap, 256);
+ bool active;
+};
+
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u64 hv_vapic;
+ s64 runtime_offset;
+ struct kvm_vcpu_hv_synic synic;
+ struct kvm_hyperv_exit exit;
+ struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+ DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
};
struct kvm_vcpu_arch {
@@ -396,6 +458,8 @@ struct kvm_vcpu_arch {
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
+ bool apicv_active;
+ DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
@@ -500,6 +564,7 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
+ u64 tsc_scaling_ratio;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -573,6 +638,9 @@ struct kvm_vcpu_arch {
struct {
bool pv_unhalted;
} pv;
+
+ int pending_ioapic_eoi;
+ int pending_external_vector;
};
struct kvm_lpage_info {
@@ -580,7 +648,7 @@ struct kvm_lpage_info {
};
struct kvm_arch_memory_slot {
- unsigned long *rmap[KVM_NR_PAGE_SIZES];
+ struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
@@ -683,6 +751,9 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
+
+ bool irqchip_split;
+ u8 nr_reserved_ioapic_pins;
};
struct kvm_vm_stat {
@@ -766,7 +837,7 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
+ void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -819,7 +890,8 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*vm_has_apicv)(struct kvm *kvm);
+ bool (*get_enable_apicv)(void);
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
@@ -833,7 +905,7 @@ struct kvm_x86_ops {
int (*get_lpage_level)(void);
bool (*rdtscp_supported)(void);
bool (*invpcid_supported)(void);
- void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
+ void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment);
void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -841,11 +913,9 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
@@ -887,6 +957,20 @@ struct kvm_x86_ops {
gfn_t offset, unsigned long mask);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
+
+ /*
+ * Architecture specific hooks for vCPU blocking due to
+ * HLT instruction.
+ * Returns for .pre_block():
+ * - 0 means continue to block the vCPU.
+ * - 1 means we cannot block the vCPU since some event
+ * happens during this period, such as, 'ON' bit in
+ * posted-interrupts descriptor is set.
+ */
+ int (*pre_block)(struct kvm_vcpu *vcpu);
+ void (*post_block)(struct kvm_vcpu *vcpu);
+ int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
};
struct kvm_arch_async_pf {
@@ -898,17 +982,6 @@ struct kvm_arch_async_pf {
extern struct kvm_x86_ops *kvm_x86_ops;
-static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
- s64 adjustment)
-{
- kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false);
-}
-
-static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
-{
- kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true);
-}
-
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -961,10 +1034,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
/* control of guest tsc rate supported? */
extern bool kvm_has_tsc_control;
-/* minimum supported tsc_khz for guests */
-extern u32 kvm_min_guest_tsc_khz;
/* maximum supported tsc_khz for guests */
extern u32 kvm_max_guest_tsc_khz;
+/* number of bits of the fractional part of the TSC scaling ratio */
+extern u8 kvm_tsc_scaling_ratio_frac_bits;
+/* maximum allowed value of TSC scaling ratio */
+extern u64 kvm_max_tsc_scaling_ratio;
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -1071,6 +1146,8 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
@@ -1210,9 +1287,15 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
@@ -1231,4 +1314,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 2dbc0bf2b9f3..2ea4527e462f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -123,19 +123,27 @@ struct mca_config {
};
struct mce_vendor_flags {
- /*
- * overflow recovery cpuid bit indicates that overflow
- * conditions are not fatal
- */
- __u64 overflow_recov : 1,
-
- /*
- * SUCCOR stands for S/W UnCorrectable error COntainment
- * and Recovery. It indicates support for data poisoning
- * in HW and deferred error interrupts.
- */
- succor : 1,
- __reserved_0 : 62;
+ /*
+ * Indicates that overflow conditions are not fatal, when set.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ __reserved_0 : 61;
};
extern struct mce_vendor_flags mce_flags;
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 9e6278c7140e..1e1b07a5a738 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_MICROCODE_H
#define _ASM_X86_MICROCODE_H
+#include <asm/cpu.h>
#include <linux/earlycpio.h>
#define native_rdmsr(msr, val1, val2) \
@@ -27,7 +28,6 @@ struct cpu_signature {
struct device;
enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
-extern bool dis_ucode_ldr;
struct microcode_ops {
enum ucode_state (*request_microcode_user) (int cpu,
@@ -55,6 +55,12 @@ struct ucode_cpu_info {
};
extern struct ucode_cpu_info ucode_cpu_info[];
+#ifdef CONFIG_MICROCODE
+int __init microcode_init(void);
+#else
+static inline int __init microcode_init(void) { return 0; };
+#endif
+
#ifdef CONFIG_MICROCODE_INTEL
extern struct microcode_ops * __init init_intel_microcode(void);
#else
@@ -75,7 +81,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
static inline void __exit exit_amd_microcode(void) {}
#endif
-#ifdef CONFIG_MICROCODE_EARLY
#define MAX_UCODE_COUNT 128
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
@@ -91,14 +96,14 @@ static inline void __exit exit_amd_microcode(void) {}
/*
* In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
+ * x86_cpuid_vendor() gets vendor id for BSP.
*
* In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
*
- * x86_vendor() gets vendor information directly from CPUID.
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
*/
-static inline int x86_vendor(void)
+static inline int x86_cpuid_vendor(void)
{
u32 eax = 0x00000000;
u32 ebx, ecx = 0, edx;
@@ -114,58 +119,28 @@ static inline int x86_vendor(void)
return X86_VENDOR_UNKNOWN;
}
-static inline unsigned int __x86_family(unsigned int sig)
-{
- unsigned int x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static inline unsigned int x86_family(void)
+static inline unsigned int x86_cpuid_family(void)
{
u32 eax = 0x00000001;
u32 ebx, ecx = 0, edx;
native_cpuid(&eax, &ebx, &ecx, &edx);
- return __x86_family(eax);
-}
-
-static inline unsigned int x86_model(unsigned int sig)
-{
- unsigned int x86, model;
-
- x86 = __x86_family(sig);
-
- model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- model += ((sig >> 16) & 0xf) << 4;
-
- return model;
+ return x86_family(eax);
}
+#ifdef CONFIG_MICROCODE
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
extern int __init save_microcode_in_initrd(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
#else
-static inline void __init load_ucode_bsp(void) {}
-static inline void load_ucode_ap(void) {}
-static inline int __init save_microcode_in_initrd(void)
-{
- return 0;
-}
-static inline void reload_early_microcode(void) {}
-static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
- return false;
-}
+static inline void __init load_ucode_bsp(void) { }
+static inline void load_ucode_ap(void) { }
+static inline int __init save_microcode_in_initrd(void) { return 0; }
+static inline void reload_early_microcode(void) { }
+static inline bool
+get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index ac6d328977a6..adfc847a395e 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -64,7 +64,7 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s
#define PATCH_MAX_SIZE PAGE_SIZE
extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
-#ifdef CONFIG_MICROCODE_AMD_EARLY
+#ifdef CONFIG_MICROCODE_AMD
extern void __init load_ucode_amd_bsp(unsigned int family);
extern void load_ucode_amd_ap(void);
extern int __init save_microcode_in_initrd_amd(void);
@@ -76,4 +76,5 @@ static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; }
void reload_ucode_amd(void) {}
#endif
+extern bool check_current_patch_level(u32 *rev, bool early);
#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 7991c606125d..8559b0102ea1 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -57,7 +57,7 @@ extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
extern int microcode_sanity_check(void *mc, int print_err);
extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
-#ifdef CONFIG_MICROCODE_INTEL_EARLY
+#ifdef CONFIG_MICROCODE_INTEL
extern void __init load_ucode_intel_bsp(void);
extern void load_ucode_intel_ap(void);
extern void show_ucode_info_early(void);
@@ -71,13 +71,9 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL;
static inline void reload_ucode_intel(void) {}
#endif
-#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+#ifdef CONFIG_HOTPLUG_CPU
extern int save_mc_for_early(u8 *mc);
#else
-static inline int save_mc_for_early(u8 *mc)
-{
- return 0;
-}
+static inline int save_mc_for_early(u8 *mc) { return 0; }
#endif
-
#endif /* _ASM_X86_MICROCODE_INTEL_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
- /* Re-load page tables */
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
load_cr3(next->pgd);
+
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
+
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index 93724cc62177..eb4b09b41df5 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -1,7 +1,13 @@
#ifndef _ASM_X86_MSI_H
#define _ASM_X86_MSI_H
#include <asm/hw_irq.h>
+#include <asm/irqdomain.h>
typedef struct irq_alloc_info msi_alloc_info_t;
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg);
+
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
+
#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b8c14bb7fc8f..b05402ef3b84 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -35,7 +35,7 @@
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
-#define MSR_NHM_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
@@ -44,7 +44,6 @@
#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
-#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
@@ -206,6 +205,13 @@
#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
/* Hardware P state interface */
#define MSR_PPERF 0x0000064e
#define MSR_PERF_LIMIT_REASONS 0x0000064f
@@ -315,6 +321,7 @@
#define MSR_F15H_PERF_CTR 0xc0010201
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_IC_CFG 0xc0011021
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h
new file mode 100644
index 000000000000..7567225747d8
--- /dev/null
+++ b/arch/x86/include/asm/msr-trace.h
@@ -0,0 +1,57 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed),
+ TP_STRUCT__entry(
+ __field( unsigned, msr )
+ __field( u64, val )
+ __field( int, failed )
+ ),
+ TP_fast_assign(
+ __entry->msr = msr;
+ __entry->val = val;
+ __entry->failed = failed;
+ ),
+ TP_printk("%x, value %llx%s",
+ __entry->msr,
+ __entry->val,
+ __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 77d8b284e4a7..93fb7c1cffda 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -32,6 +32,16 @@ struct msr_regs_info {
int err;
};
+struct saved_msr {
+ bool valid;
+ struct msr_info info;
+};
+
+struct saved_msrs {
+ unsigned int num;
+ struct saved_msr *array;
+};
+
static inline unsigned long long native_read_tscp(unsigned int *aux)
{
unsigned long low, high;
@@ -57,11 +67,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Be very careful with includes. This header is prone to include loops.
+ */
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_read_msr;
+extern struct tracepoint __tracepoint_write_msr;
+extern struct tracepoint __tracepoint_rdpmc;
+#define msr_tracepoint_active(t) static_key_false(&(t).key)
+extern void do_trace_write_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned msr, u64 val, int failed);
+#else
+#define msr_tracepoint_active(t) false
+static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {}
+#endif
+
static inline unsigned long long native_read_msr(unsigned int msr)
{
DECLARE_ARGS(val, low, high);
asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
@@ -78,6 +111,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
_ASM_EXTABLE(2b, 3b)
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
: "c" (msr), [fault] "i" (-EIO));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
}
@@ -85,6 +120,8 @@ static inline void native_write_msr(unsigned int msr,
unsigned low, unsigned high)
{
asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
/* Can be uninlined because referenced by paravirt */
@@ -102,6 +139,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
: "c" (msr), "0" (low), "d" (high),
[fault] "i" (-EIO)
: "memory");
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), err);
return err;
}
@@ -160,6 +199,8 @@ static inline unsigned long long native_read_pmc(int counter)
DECLARE_ARGS(val, low, high);
asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ if (msr_tracepoint_active(__tracepoint_rdpmc))
+ do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high);
}
@@ -190,7 +231,7 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
static inline void wrmsrl(unsigned msr, u64 val)
{
- native_write_msr(msr, (u32)val, (u32)(val >> 32));
+ native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
}
/* wrmsr with exception handling */
diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
index 1c6f7f6212c1..c64373a2d731 100644
--- a/arch/x86/include/asm/numachip/numachip.h
+++ b/arch/x86/include/asm/numachip/numachip.h
@@ -14,6 +14,7 @@
#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
#define _ASM_X86_NUMACHIP_NUMACHIP_H
+extern u8 numachip_system;
extern int __init pci_numachip_init(void);
#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
index 660f843df928..29719eecdc2e 100644
--- a/arch/x86/include/asm/numachip/numachip_csr.h
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -14,12 +14,8 @@
#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
-#include <linux/numa.h>
-#include <linux/percpu.h>
+#include <linux/smp.h>
#include <linux/io.h>
-#include <linux/swab.h>
-#include <asm/types.h>
-#include <asm/processor.h>
#define CSR_NODE_SHIFT 16
#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
@@ -27,11 +23,8 @@
/* 32K CSR space, b15 indicates geo/non-geo */
#define CSR_OFFSET_MASK 0x7fffUL
-
-/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */
-#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL
-#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL
-#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1)
+#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
+#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
/*
* Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
@@ -41,12 +34,7 @@
#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
-
-static inline void *gcsr_address(int node, unsigned long offset)
-{
- return __va(NUMACHIP_GCSR_BASE | (1UL << 15) |
- CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK));
-}
+#define NUMACHIP_LAPIC_BITS 8
static inline void *lcsr_address(unsigned long offset)
{
@@ -54,114 +42,57 @@ static inline void *lcsr_address(unsigned long offset)
CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
}
-static inline unsigned int read_gcsr(int node, unsigned long offset)
+static inline unsigned int read_lcsr(unsigned long offset)
{
- return swab32(readl(gcsr_address(node, offset)));
+ return swab32(readl(lcsr_address(offset)));
}
-static inline void write_gcsr(int node, unsigned long offset, unsigned int val)
+static inline void write_lcsr(unsigned long offset, unsigned int val)
{
- writel(swab32(val), gcsr_address(node, offset));
+ writel(swab32(val), lcsr_address(offset));
}
-static inline unsigned int read_lcsr(unsigned long offset)
+/*
+ * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G
+ */
+
+#define NUMACHIP2_LCSR_BASE 0xf0000000UL
+#define NUMACHIP2_LCSR_SIZE 0x1000000UL
+#define NUMACHIP2_APIC_ICR 0x100000
+#define NUMACHIP2_TIMER_DEADLINE 0x200000
+#define NUMACHIP2_TIMER_INT 0x200008
+#define NUMACHIP2_TIMER_NOW 0x200018
+#define NUMACHIP2_TIMER_RESET 0x200020
+
+static inline void __iomem *numachip2_lcsr_address(unsigned long offset)
{
- return swab32(readl(lcsr_address(offset)));
+ return (void __iomem *)__va(NUMACHIP2_LCSR_BASE |
+ (offset & (NUMACHIP2_LCSR_SIZE - 1)));
}
-static inline void write_lcsr(unsigned long offset, unsigned int val)
+static inline u32 numachip2_read32_lcsr(unsigned long offset)
{
- writel(swab32(val), lcsr_address(offset));
+ return readl(numachip2_lcsr_address(offset));
}
-/* ========================================================================= */
-/* CSR_G0_STATE_CLEAR */
-/* ========================================================================= */
-
-#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12))
-union numachip_csr_g0_state_clear {
- unsigned int v;
- struct numachip_csr_g0_state_clear_s {
- unsigned int _state:2;
- unsigned int _rsvd_2_6:5;
- unsigned int _lost:1;
- unsigned int _rsvd_8_31:24;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G0_NODE_IDS */
-/* ========================================================================= */
+static inline u64 numachip2_read64_lcsr(unsigned long offset)
+{
+ return readq(numachip2_lcsr_address(offset));
+}
-#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
-union numachip_csr_g0_node_ids {
- unsigned int v;
- struct numachip_csr_g0_node_ids_s {
- unsigned int _initialid:16;
- unsigned int _nodeid:12;
- unsigned int _rsvd_28_31:4;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G3_EXT_IRQ_GEN */
-/* ========================================================================= */
+static inline void numachip2_write32_lcsr(unsigned long offset, u32 val)
+{
+ writel(val, numachip2_lcsr_address(offset));
+}
-#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
-union numachip_csr_g3_ext_irq_gen {
- unsigned int v;
- struct numachip_csr_g3_ext_irq_gen_s {
- unsigned int _vector:8;
- unsigned int _msgtype:3;
- unsigned int _index:5;
- unsigned int _destination_apic_id:16;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G3_EXT_IRQ_STATUS */
-/* ========================================================================= */
-
-#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12))
-union numachip_csr_g3_ext_irq_status {
- unsigned int v;
- struct numachip_csr_g3_ext_irq_status_s {
- unsigned int _result:32;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G3_EXT_IRQ_DEST */
-/* ========================================================================= */
-
-#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12))
-union numachip_csr_g3_ext_irq_dest {
- unsigned int v;
- struct numachip_csr_g3_ext_irq_dest_s {
- unsigned int _irq:8;
- unsigned int _rsvd_8_31:24;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G3_NC_ATT_MAP_SELECT */
-/* ========================================================================= */
-
-#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12))
-union numachip_csr_g3_nc_att_map_select {
- unsigned int v;
- struct numachip_csr_g3_nc_att_map_select_s {
- unsigned int _upper_address_bits:4;
- unsigned int _select_ram:4;
- unsigned int _rsvd_8_31:24;
- } s;
-};
-
-/* ========================================================================= */
-/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */
-/* ========================================================================= */
-
-#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12))
+static inline void numachip2_write64_lcsr(unsigned long offset, u64 val)
+{
+ writeq(val, numachip2_lcsr_address(offset));
+}
-#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
+static inline unsigned int numachip2_timer(void)
+{
+ return (smp_processor_id() % 48) << 6;
+}
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 4edd53b79a81..4928cf0d5af0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -26,9 +26,6 @@
#define MCE_STACK 4
#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */
-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
-
/*
* Set __PAGE_OFFSET to the most negative possible address +
* PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index c7c712f2648b..7bd0099384ca 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -5,20 +5,25 @@
#include <linux/types.h>
/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-/* Cast PAGE_MASK to a signed type so that it is sign-extended if
+/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
-
-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 10d0596433f8..f6192502149e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,6 +19,12 @@ static inline int paravirt_enabled(void)
return pv_info.paravirt_enabled;
}
+static inline int paravirt_has_feature(unsigned int feature)
+{
+ WARN_ON_ONCE(!pv_info.paravirt_enabled);
+ return (pv_info.features & feature);
+}
+
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -285,15 +291,6 @@ static inline void slow_down_io(void)
#endif
}
-#ifdef CONFIG_SMP
-static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
- phys_apicid, start_eip, start_esp);
-}
-#endif
-
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
@@ -375,23 +372,6 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
-static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
-}
-
-static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
-{
- PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
-}
-
-static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
-}
static inline pte_t __pte(pteval_t val)
{
@@ -922,23 +902,11 @@ extern void default_banner(void);
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#define USERGS_SYSRET32 \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
-
#ifdef CONFIG_X86_32
#define GET_CR0_INTO_EAX \
push %ecx; push %edx; \
call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
pop %edx; pop %ecx
-
-#define ENABLE_INTERRUPTS_SYSEXIT \
- PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
- CLBR_NONE, \
- jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-
-
#else /* !CONFIG_X86_32 */
/*
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 31247b5bff7c..77db5616a473 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -70,9 +70,14 @@ struct pv_info {
#endif
int paravirt_enabled;
+ unsigned int features; /* valid only if paravirt_enabled is set */
const char *name;
};
+#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
+/* Supported features */
+#define PV_SUPPORTED_RTC (1<<0)
+
struct pv_init_ops {
/*
* Patch may replace one of the defined code sequences with
@@ -157,15 +162,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
-#ifdef CONFIG_X86_32
- /*
- * Atomically enable interrupts and return to userspace. This
- * is only used in 32-bit kernels. 64-bit kernels use
- * usergs_sysret32 instead.
- */
- void (*irq_enable_sysexit)(void);
-#endif
-
/*
* Switch to usermode gs and return to 64-bit usermode using
* sysret. Only used in 64-bit kernels to return to 64-bit
@@ -174,14 +170,6 @@ struct pv_cpu_ops {
*/
void (*usergs_sysret64)(void);
- /*
- * Switch to usermode gs and return to 32-bit usermode using
- * sysret. Used to return to 32-on-64 compat processes.
- * Other usermode register state, including %esp, must already
- * be restored.
- */
- void (*usergs_sysret32)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
@@ -215,14 +203,6 @@ struct pv_irq_ops {
#endif
};
-struct pv_apic_ops {
-#ifdef CONFIG_X86_LOCAL_APIC
- void (*startup_ipi_hook)(int phys_apicid,
- unsigned long start_eip,
- unsigned long start_esp);
-#endif
-};
-
struct pv_mmu_ops {
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
@@ -274,12 +254,6 @@ struct pv_mmu_ops {
pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
- void (*pte_update_defer)(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep);
- void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp);
- void (*pmd_update_defer)(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
@@ -354,7 +328,6 @@ struct paravirt_patch_template {
struct pv_time_ops pv_time_ops;
struct pv_cpu_ops pv_cpu_ops;
struct pv_irq_ops pv_irq_ops;
- struct pv_apic_ops pv_apic_ops;
struct pv_mmu_ops pv_mmu_ops;
struct pv_lock_ops pv_lock_ops;
};
@@ -364,7 +337,6 @@ extern struct pv_init_ops pv_init_ops;
extern struct pv_time_ops pv_time_ops;
extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
-extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;
extern struct pv_lock_ops pv_lock_ops;
@@ -402,10 +374,8 @@ extern struct pv_lock_ops pv_lock_ops;
__visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \
asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name))
-unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
-unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
const void *target, u16 tgt_clobbers,
unsigned long addr, u16 site_clobbers,
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index fa1195dae425..46873fbd44e1 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -151,11 +151,11 @@ extern struct list_head pci_mmcfg_list;
#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
/*
- * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
- * on their northbrige except through the * %eax register. As such, you MUST
- * NOT use normal IOMEM accesses, you need to only use the magic mmio-config
- * accessor functions.
- * In fact just use pci_config_*, nothing else please.
+ * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
+ * %eax. No other source or target registers may be used. The following
+ * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's
+ * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1,
+ * "MMIO Configuration Coding Requirements".
*/
static inline unsigned char mmio_config_readb(void __iomem *pos)
{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 867da5bbb4a3..0687c4748b8f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -19,6 +19,13 @@
#include <asm/x86_init.h>
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_checkwx(void);
+
+#ifdef CONFIG_DEBUG_WX
+#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
+#else
+#define debug_checkwx() do { } while (0)
+#endif
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -62,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pmd_clear(pmd) native_pmd_clear(pmd)
#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
-#define pmd_update(mm, addr, ptep) do { } while (0)
-#define pmd_update_defer(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -142,12 +146,12 @@ static inline unsigned long pte_pfn(pte_t pte)
static inline unsigned long pmd_pfn(pmd_t pmd)
{
- return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
static inline unsigned long pud_pfn(pud_t pud)
{
- return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
@@ -158,20 +162,22 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
static inline int pmd_trans_huge(pmd_t pmd)
{
- return pmd_val(pmd) & _PAGE_PSE;
+ return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
}
static inline int has_transparent_hugepage(void)
{
return cpu_has_pse;
}
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pmd_devmap(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_DEVMAP);
+}
+#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -248,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
+}
+
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
pmdval_t v = native_pmd_val(pmd);
@@ -267,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
@@ -277,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DEVMAP);
+}
+
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_PSE);
@@ -318,6 +339,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
}
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
@@ -379,7 +410,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
return __pgprot(preservebits | addbits);
}
-#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+#define pte_pgprot(x) __pgprot(pte_flags(x))
+#define pmd_pgprot(x) __pgprot(pmd_flags(x))
+#define pud_pgprot(x) __pgprot(pud_flags(x))
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
@@ -446,6 +479,13 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t a)
+{
+ return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
+}
+#endif
+
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -502,14 +542,15 @@ static inline int pmd_none(pmd_t pmd)
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
- return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+ return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
+#define pmd_page(pmd) \
+ pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -570,14 +611,15 @@ static inline int pud_present(pud_t pud)
static inline unsigned long pud_page_vaddr(pud_t pud)
{
- return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+ return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+#define pud_page(pud) \
+ pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
/* Find an entry in the second-level page table.. */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -710,14 +752,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
* updates should either be sets, clears, or set_pte_atomic for P->P
* transitions, which means this hook should only be called for user PTEs.
* This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush. The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
+ * requires a subsequent TLB flush.
*/
#define pte_update(mm, addr, ptep) do { } while (0)
-#define pte_update_defer(mm, addr, ptep) do { } while (0)
#endif
/*
@@ -795,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
@@ -809,9 +842,7 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- pmd_t pmd = native_pmdp_get_and_clear(pmdp);
- pmd_update(mm, addr, pmdp);
- return pmd;
+ return native_pmdp_get_and_clear(pmdp);
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -819,7 +850,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
- pmd_update(mm, addr, pmdp);
}
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 13f310bfc09a..4432ab7f407c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,10 +22,11 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -46,7 +47,6 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
@@ -85,8 +85,11 @@
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define __HAVE_ARCH_PTE_DEVMAP
#else
#define _PAGE_NX (_AT(pteval_t, 0))
+#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
@@ -209,10 +212,10 @@ enum page_cache_mode {
#include <linux/types.h>
-/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
-/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
@@ -276,14 +279,40 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
}
#endif
+static inline pudval_t pud_pfn_mask(pud_t pud)
+{
+ if (native_pud_val(pud) & _PAGE_PSE)
+ return PHYSICAL_PUD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pudval_t pud_flags_mask(pud_t pud)
+{
+ return ~pud_pfn_mask(pud);
+}
+
static inline pudval_t pud_flags(pud_t pud)
{
- return native_pud_val(pud) & PTE_FLAGS_MASK;
+ return native_pud_val(pud) & pud_flags_mask(pud);
+}
+
+static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
+{
+ if (native_pmd_val(pmd) & _PAGE_PSE)
+ return PHYSICAL_PMD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pmdval_t pmd_flags_mask(pmd_t pmd)
+{
+ return ~pmd_pfn_mask(pmd);
}
static inline pmdval_t pmd_flags(pmd_t pmd)
{
- return native_pmd_val(pmd) & PTE_FLAGS_MASK;
+ return native_pmd_val(pmd) & pmd_flags_mask(pmd);
}
static inline pte_t native_make_pte(pteval_t val)
@@ -337,20 +366,18 @@ static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
}
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
+ pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
- unsigned long val;
- val = pgprot_val(pgprot);
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
return new;
}
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
+ pgprotval_t val = pgprot_val(pgprot);
pgprot_t new;
- unsigned long val;
- val = pgprot_val(pgprot);
pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
((val & _PAGE_PAT_LARGE) >>
(_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 7249e6d0902d..5973a2f3db3d 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -55,6 +55,7 @@ enum sst_audio_device_id_mrfld {
PIPE_MEDIA0_IN = 0x8F,
PIPE_MEDIA1_IN = 0x90,
PIPE_MEDIA2_IN = 0x91,
+ PIPE_MEDIA3_IN = 0x9C,
PIPE_RSVD = 0xFF,
};
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d8ce3ec816ab..c57fd1ea9689 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
}
/**
- * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * arch_wb_cache_pmem - write back a cache range with CLWB
* @vaddr: virtual start address
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
* instruction. This function requires explicit ordering with an
- * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
+ * arch_wmb_pmem() call.
*/
-static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
{
u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vaddr = (void __force *)addr;
void *vend = vaddr + size;
void *p;
@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
len = copy_from_iter_nocache(vaddr, bytes, i);
if (__iter_needs_pmem_wb(i))
- __arch_wb_cache_pmem(vaddr, bytes);
+ arch_wb_cache_pmem(addr, bytes);
return len;
}
@@ -132,13 +133,8 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
{
void *vaddr = (void __force *)addr;
- /* TODO: implement the zeroing via non-temporal writes */
- if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
- clear_page(vaddr);
- else
- memset(vaddr, 0, size);
-
- __arch_wb_cache_pmem(vaddr, size);
+ memset(vaddr, 0, size);
+ arch_wb_cache_pmem(addr, size);
}
static inline bool __arch_has_wmb_pmem(void)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index b12f81022a6b..01bcde84d3e4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
/*
* must be macros to avoid header recursion hell
*/
-#define init_task_preempt_count(p) do { \
- task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
} while (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 19577dd325fa..20c11d1aa4cc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -11,7 +11,7 @@ struct vm86;
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
-#include <asm/sigcontext.h>
+#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeature.h>
#include <asm/page.h>
@@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid
#define paravirt_enabled() 0
+#define paravirt_has(x) 0
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
@@ -556,12 +557,12 @@ static inline unsigned int cpuid_edx(unsigned int op)
}
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
+static __always_inline void rep_nop(void)
{
asm volatile("rep; nop" ::: "memory");
}
-static inline void cpu_relax(void)
+static __always_inline void cpu_relax(void)
{
rep_nop();
}
@@ -765,7 +766,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
* Return saved PC of a blocked thread.
* What is this good for? it will be always the scheduler or ret_from_fork.
*/
-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7a6bed5c08bc..fdcc04020636 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -4,6 +4,15 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
+#ifdef CONFIG_KVM_GUEST
+extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -91,10 +100,5 @@ struct pvclock_vsyscall_time_info {
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
-#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size);
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e711ba88..9f92c180ed2f 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the registers saving trunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * struct __qspinlock *l = (void *)lock;
+ * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ "lock cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif /* CONFIG_64BIT */
#endif
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index a82c4f1b4d83..2cb1cc253d51 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type);
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+void run_crash_ipi_callback(struct pt_regs *regs);
#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..e6cd2c489dbb 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -1,79 +1,8 @@
#ifndef _ASM_X86_SIGCONTEXT_H
#define _ASM_X86_SIGCONTEXT_H
-#include <uapi/asm/sigcontext.h>
-
-#ifdef __i386__
-struct sigcontext {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned long di;
- unsigned long si;
- unsigned long bp;
- unsigned long sp;
- unsigned long bx;
- unsigned long dx;
- unsigned long cx;
- unsigned long ax;
- unsigned long trapno;
- unsigned long err;
- unsigned long ip;
- unsigned short cs, __csh;
- unsigned long flags;
- unsigned long sp_at_signal;
- unsigned short ss, __ssh;
+/* This is a legacy header - all kernel code includes <uapi/asm/sigcontext.h> directly. */
- /*
- * fpstate is really (struct _fpstate *) or (struct _xstate *)
- * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
- * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the definition of
- * (struct _fpx_sw_bytes)
- */
- void __user *fpstate; /* zero when no FPU/extended context */
- unsigned long oldmask;
- unsigned long cr2;
-};
-#else /* __i386__ */
-struct sigcontext {
- unsigned long r8;
- unsigned long r9;
- unsigned long r10;
- unsigned long r11;
- unsigned long r12;
- unsigned long r13;
- unsigned long r14;
- unsigned long r15;
- unsigned long di;
- unsigned long si;
- unsigned long bp;
- unsigned long bx;
- unsigned long dx;
- unsigned long ax;
- unsigned long cx;
- unsigned long sp;
- unsigned long ip;
- unsigned long flags;
- unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short __pad0;
- unsigned long err;
- unsigned long trapno;
- unsigned long oldmask;
- unsigned long cr2;
+#include <uapi/asm/sigcontext.h>
- /*
- * fpstate is really (struct _fpstate *) or (struct _xstate *)
- * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
- * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
- * of extended memory layout. See comments at the definition of
- * (struct _fpx_sw_bytes)
- */
- void __user *fpstate; /* zero when no FPU/extended context */
- unsigned long reserved1[8];
-};
-#endif /* !__i386__ */
#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 1f3175bb994e..34edd1650bae 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_SIGFRAME_H
#define _ASM_X86_SIGFRAME_H
-#include <asm/sigcontext.h>
+#include <uapi/asm/sigcontext.h>
#include <asm/siginfo.h>
#include <asm/ucontext.h>
#include <linux/compat.h>
@@ -9,8 +9,6 @@
#ifdef CONFIG_X86_32
#define sigframe_ia32 sigframe
#define rt_sigframe_ia32 rt_sigframe
-#define sigcontext_ia32 sigcontext
-#define _fpstate_ia32 _fpstate
#define ucontext_ia32 ucontext
#else /* !CONFIG_X86_32 */
@@ -24,7 +22,7 @@
struct sigframe_ia32 {
u32 pretcode;
int sig;
- struct sigcontext_ia32 sc;
+ struct sigcontext_32 sc;
/*
* fpstate is unused. fpstate is moved/allocated after
* retcode[] below. This movement allows to have the FP state and the
@@ -33,7 +31,7 @@ struct sigframe_ia32 {
* the offset of extramask[] in the sigframe and thus prevent any
* legacy application accessing/modifying it.
*/
- struct _fpstate_ia32 fpstate_unused;
+ struct _fpstate_32 fpstate_unused;
#ifdef CONFIG_IA32_EMULATION
unsigned int extramask[_COMPAT_NSIG_WORDS-1];
#else /* !CONFIG_IA32_EMULATION */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index c481be78fcf1..2138c9ae19ee 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -34,7 +34,7 @@ extern void do_signal(struct pt_regs *regs);
#define __ARCH_HAS_SA_RESTORER
-#include <asm/sigcontext.h>
+#include <uapi/asm/sigcontext.h>
#ifdef __i386__
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 222a6a3ca2b5..dfcf0727623b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,15 +21,6 @@
extern int smp_num_siblings;
extern unsigned int num_processors;
-static inline bool cpu_has_ht_siblings(void)
-{
- bool has_siblings = false;
-#ifdef CONFIG_SMP
- has_siblings = cpu_has_ht && smp_num_siblings > 1;
-#endif
- return has_siblings;
-}
-
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
/* cpus sharing the last level cache: */
@@ -74,9 +65,6 @@ struct smp_ops {
extern void set_cpu_sibling_map(int cpu);
#ifdef CONFIG_SMP
-#ifndef CONFIG_PARAVIRT
-#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
-#endif
extern struct smp_ops smp_ops;
static inline void smp_send_stop(void)
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index d1793f06854d..8e9dbe7b73a1 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -15,6 +15,7 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
bool misc_enable_saved;
+ struct saved_msrs saved_msrs;
struct desc_ptr gdt_desc;
struct desc_ptr idt;
u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 7ebf0ebe4e68..6136a18152af 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -24,6 +24,7 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4, cr8;
u64 misc_enable;
bool misc_enable_saved;
+ struct saved_msrs saved_msrs;
unsigned long efer;
u16 gdt_pad; /* Unused */
struct desc_ptr gdt_desc;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index d7f3b3b78ac3..751bf4b7bf11 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,12 +79,12 @@ do { \
#else /* CONFIG_X86_32 */
/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
#define __EXTRA_CLOBBER \
, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15"
+ "r12", "r13", "r14", "r15", "flags"
#ifdef CONFIG_CC_STACKPROTECTOR
#define __switch_canary \
@@ -100,7 +100,11 @@ do { \
#define __switch_canary_iparam
#endif /* CC_STACKPROTECTOR */
-/* Save restore flags to clear handle leaking NT */
+/*
+ * There is no need to save or restore flags, because flags are always
+ * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
+ * has no effect.
+ */
#define switch_to(prev, next, last) \
asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d6a756ae04c8..999b7cd2e78c 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -20,9 +20,21 @@
#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
-typedef void (*sys_call_ptr_t)(void);
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
extern const sys_call_ptr_t sys_call_table[];
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#define __NR_syscall_compat_max __NR_syscall_max
+#define IA32_NR_syscalls NR_syscalls
+#endif
+
+#if defined(CONFIG_IA32_EMULATION)
+extern const sys_call_ptr_t ia32_sys_call_table[];
+#endif
+
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
* This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..c7b551028740 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,9 +57,7 @@ struct thread_info {
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
- int saved_preempt_count;
mm_segment_t addr_limit;
- void __user *sysenter_return;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
};
@@ -69,7 +67,6 @@ struct thread_info {
.task = &tsk, \
.flags = 0, \
.cpu = 0, \
- .saved_preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
}
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
index 173dd3ba108c..0f492fc50bce 100644
--- a/arch/x86/include/asm/trace/mpx.h
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -11,7 +11,7 @@
TRACE_EVENT(mpx_bounds_register_exception,
TP_PROTO(void *addr_referenced,
- const struct bndreg *bndreg),
+ const struct mpx_bndreg *bndreg),
TP_ARGS(addr_referenced, bndreg),
TP_STRUCT__entry(
@@ -44,7 +44,7 @@ TRACE_EVENT(mpx_bounds_register_exception,
TRACE_EVENT(bounds_exception_mpx,
- TP_PROTO(const struct bndcsr *bndcsr),
+ TP_PROTO(const struct mpx_bndcsr *bndcsr),
TP_ARGS(bndcsr),
TP_STRUCT__entry(
@@ -116,7 +116,8 @@ TRACE_EVENT(mpx_new_bounds_table,
/*
* This gets used outside of MPX-specific code, so we need a stub.
*/
-static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr)
+static inline
+void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr)
{
}
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a8df874f3e88..a4a30e4b2d34 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
* limit, not add it to the address).
*/
if (__builtin_constant_p(size))
- return addr > limit - size;
+ return unlikely(addr > limit - size);
/* Arbitrary sizes? Be careful about overflow */
addr += size;
- if (addr < size)
+ if (unlikely(addr < size))
return true;
- return addr > limit;
+ return unlikely(addr > limit);
}
#define __range_not_ok(addr, size, limit) \
@@ -134,6 +134,9 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
+#define __uaccess_begin() stac()
+#define __uaccess_end() clac()
+
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
@@ -182,7 +185,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "=a" (__ret_gu), "=r" (__val_gu) \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
- __ret_gu; \
+ __builtin_expect(__ret_gu, 0); \
})
#define __put_user_x(size, x, ptr, __ret_pu) \
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
#ifdef CONFIG_X86_32
#define __put_user_asm_u64(x, addr, err, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
".section .fixup,\"ax\"\n" \
"4: movl %3,%0\n" \
" jmp 3b\n" \
@@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "A" (x), "r" (addr), "i" (errret), "0" (err))
#define __put_user_asm_ex_u64(x, addr) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
_ASM_EXTABLE_EX(1b, 2b) \
_ASM_EXTABLE_EX(2b, 3b) \
: : "A" (x), "r" (addr))
@@ -278,7 +281,7 @@ extern void __put_user_8(void);
__put_user_x(X, __pu_val, ptr, __ret_pu); \
break; \
} \
- __ret_pu; \
+ __builtin_expect(__ret_pu, 0); \
})
#define __put_user_size(x, ptr, size, retval, errret) \
@@ -304,6 +307,10 @@ do { \
} \
} while (0)
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __put_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -358,9 +365,9 @@ do { \
} while (0)
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
@@ -370,6 +377,10 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __get_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -400,17 +411,21 @@ do { \
#define __put_user_nocheck(x, ptr, size) \
({ \
int __pu_err; \
+ __uaccess_begin(); \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
- __pu_err; \
+ __uaccess_end(); \
+ __builtin_expect(__pu_err, 0); \
})
#define __get_user_nocheck(x, ptr, size) \
({ \
int __gu_err; \
unsigned long __gu_val; \
+ __uaccess_begin(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ __uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
- __gu_err; \
+ __builtin_expect(__gu_err, 0); \
})
/* FIXME: this hack is definitely wrong -AK */
@@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; };
* aliasing issues.
*/
#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %"rtype"1,%2\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" jmp 2b\n" \
@@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; };
*/
#define uaccess_try do { \
current_thread_info()->uaccess_err = 0; \
- stac(); \
+ __uaccess_begin(); \
barrier();
#define uaccess_catch(err) \
- clac(); \
+ __uaccess_end(); \
(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
} while (0)
@@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
+ __uaccess_begin(); \
switch (size) { \
case 1: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 2: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 4: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void)
if (!IS_ENABLED(CONFIG_X86_64)) \
__cmpxchg_wrong_size(); \
\
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void)
default: \
__cmpxchg_wrong_size(); \
} \
+ __uaccess_end(); \
*__uval = __old; \
__ret; \
})
@@ -745,5 +762,39 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
#undef __copy_from_user_overflow
#undef __copy_to_user_overflow
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ *
+ * Caller must use pagefault_enable/disable, or run in interrupt context,
+ * and also do a uaccess_ok() check
+ */
+#define __copy_from_user_nmi __copy_from_user_inatomic
+
+/*
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
+ */
+#define user_access_begin() __uaccess_begin()
+#define user_access_end() __uaccess_end()
+
+#define unsafe_put_user(x, ptr) \
+({ \
+ int __pu_err; \
+ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ __builtin_expect(__pu_err, 0); \
+})
+
+#define unsafe_get_user(x, ptr) \
+({ \
+ int __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __builtin_expect(__gu_err, 0); \
+})
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f2f9b39b274a..b89c34c4019b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
- case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ case 1:
+ __uaccess_begin();
+ __get_user_asm(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
+ __uaccess_end();
return ret;
- case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ case 2:
+ __uaccess_begin();
+ __get_user_asm(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
- case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ case 4:
+ __uaccess_begin();
+ __get_user_asm(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
+ __uaccess_end();
return ret;
- case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ case 8:
+ __uaccess_begin();
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u16 *)(8 + (char *)dst),
- (u16 __user *)(8 + (char __user *)src),
- ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u64 *)(8 + (char *)dst),
- (u64 __user *)(8 + (char __user *)src),
- ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
default:
return copy_user_generic(dst, (__force void *)src, size);
@@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
- case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ case 1:
+ __uaccess_begin();
+ __put_user_asm(*(u8 *)src, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
- case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ case 2:
+ __uaccess_begin();
+ __put_user_asm(*(u16 *)src, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
- case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ case 4:
+ __uaccess_begin();
+ __put_user_asm(*(u32 *)src, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
- case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ case 8:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 10);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ }
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 16);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
- ret, "q", "", "er", 8);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "er", 8);
+ }
+ __uaccess_end();
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
@@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
switch (size) {
case 1: {
u8 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u8 __user *)src,
ret, "b", "b", "=q", 1);
if (likely(!ret))
__put_user_asm(tmp, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
}
case 2: {
u16 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u16 __user *)src,
ret, "w", "w", "=r", 2);
if (likely(!ret))
__put_user_asm(tmp, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
}
case 4: {
u32 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u32 __user *)src,
ret, "l", "k", "=r", 4);
if (likely(!ret))
__put_user_asm(tmp, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
}
case 8: {
u64 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u64 __user *)src,
ret, "q", "", "=r", 8);
if (likely(!ret))
__put_user_asm(tmp, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
}
default:
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index a00ad8f2a657..ea7074784cc4 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -609,7 +609,7 @@ struct uv_cpu_nmi_s {
DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
-#define uv_hub_nmi (uv_cpu_nmi.hub)
+#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub)
#define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu))
#define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub)
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 8021bd28c0f1..deabaf9759b6 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -22,11 +22,12 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
+ long sym_pvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
long sym___kernel_vsyscall;
- long sym_VDSO32_SYSENTER_RETURN;
+ long sym_int80_landing_pad;
};
#ifdef CONFIG_X86_64
@@ -38,13 +39,7 @@ extern const struct vdso_image vdso_image_x32;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
-extern const struct vdso_image vdso_image_32_int80;
-#ifdef CONFIG_COMPAT
-extern const struct vdso_image vdso_image_32_syscall;
-#endif
-extern const struct vdso_image vdso_image_32_sysenter;
-
-extern const struct vdso_image *selected_vdso32;
+extern const struct vdso_image vdso_image_32;
#endif
extern void __init init_vdso_image(const struct vdso_image *image);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca61aee..14c63c7e8337 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,8 @@
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
-
+#define SECONDARY_EXEC_PCOMMIT 0x00200000
+#define SECONDARY_EXEC_TSC_SCALING 0x02000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
@@ -167,6 +168,8 @@ enum vmcs_field {
VMWRITE_BITMAP = 0x00002028,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
@@ -416,6 +419,7 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 48d34d28f5a6..1ae89a2721d6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_PLATFORM_H
#define _ASM_X86_PLATFORM_H
-#include <asm/pgtable_types.h>
#include <asm/bootparam.h>
struct mpc_bus;
@@ -83,13 +82,11 @@ struct x86_init_paging {
* struct x86_init_timers - platform specific timer setup
* @setup_perpcu_clockev: set up the per cpu clock event device for the
* boot cpu
- * @tsc_pre_init: platform function called before TSC init
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
*/
struct x86_init_timers {
void (*setup_percpu_clockev)(void);
- void (*tsc_pre_init)(void);
void (*timer_init)(void);
void (*wallclock_init)(void);
};
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 4c20dd333412..3bcdcc84259d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op)
}
static inline int
-HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
+HYPERVISOR_platform_op(struct xen_platform_op *op)
{
- platform_op->interface_version = XENPF_INTERFACE_VERSION;
- return _hypercall1(int, dom0_op, platform_op);
+ op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, op);
}
static inline int
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d866959e5685..8b2d4bea9962 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void)
}
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num);
+void xen_arch_unregister_cpu(int num);
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 0679e11d2cf7..f5fb840b43e8 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -12,7 +12,7 @@
#include <asm/pgtable.h>
#include <xen/interface/xen.h>
-#include <xen/grant_table.h>
+#include <xen/interface/grant_table.h>
#include <xen/features.h>
/* Xen machine address */
@@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr;
extern unsigned long xen_p2m_size;
extern unsigned long xen_max_p2m_pfn;
+extern int xen_alloc_p2m_entry(unsigned long pfn);
+
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr);
#define xen_unmap(cookie) iounmap((cookie))
static inline bool xen_arch_need_swiotlb(struct device *dev,
- unsigned long pfn,
- unsigned long bfn)
+ phys_addr_t phys,
+ dma_addr_t dev_addr)
{
return false;
}
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 5a08bc8bff33..c54beb44c4c1 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -553,7 +553,7 @@ do { \
if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
xor_speed(&xor_block_sse_pf64); \
- } else if (cpu_has_mmx) { \
+ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
} else { \
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index f0412c50c47b..7956412d09bd 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -153,6 +153,12 @@
/* MSR used to provide vcpu index */
#define HV_X64_MSR_VP_INDEX 0x40000002
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET 0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME 0x40000010
+
/* MSR used to read the per-partition time reference counter */
#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
@@ -251,4 +257,108 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
__s64 tsc_offset;
} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT (16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1 (0x1)
+
+#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SINT_MASKED (1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
+
+#define HV_SYNIC_STIMER_COUNT (4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE (256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+ HVMSG_NONE = 0x00000000,
+
+ /* Memory access messages. */
+ HVMSG_UNMAPPED_GPA = 0x80000000,
+ HVMSG_GPA_INTERCEPT = 0x80000001,
+
+ /* Timer notification messages. */
+ HVMSG_TIMER_EXPIRED = 0x80000010,
+
+ /* Error messages. */
+ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+ HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
+ HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
+
+ /* Trace buffer complete messages. */
+ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
+
+ /* Platform-specific processor intercept messages. */
+ HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
+ HVMSG_X64_MSR_INTERCEPT = 0x80010001,
+ HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
+ HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
+ HVMSG_X64_APIC_EOI = 0x80010004,
+ HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+ __u8 asu8;
+ struct {
+ __u8 msg_pending:1;
+ __u8 reserved:7;
+ };
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+ __u32 asu32;
+ struct {
+ __u32 id:24;
+ __u32 reserved:8;
+ } u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+ __u32 message_type;
+ __u8 payload_size;
+ union hv_message_flags message_flags;
+ __u8 reserved[2];
+ union {
+ __u64 sender;
+ union hv_port_id port;
+ };
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+ struct hv_message_header header;
+ union {
+ __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+ } u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+ struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+ __u32 timer_index;
+ __u32 reserved;
+ __u64 expiration_time; /* When the timer expired */
+ __u64 delivery_time; /* When the message was delivered */
+};
+
+#define HV_STIMER_ENABLE (1ULL << 0)
+#define HV_STIMER_PERIODIC (1ULL << 1)
+#define HV_STIMER_LAZY (1ULL << 2)
+#define HV_STIMER_AUTOENABLE (1ULL << 3)
+#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
+
#endif
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 76880ede9a35..2184943341bf 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -2,7 +2,7 @@
#define _UAPI_ASM_X86_MCE_H
#include <linux/types.h>
-#include <asm/ioctls.h>
+#include <linux/ioctl.h>
/* Fields are zero when not available */
struct mce {
@@ -16,7 +16,7 @@ struct mce {
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
__u8 severity;
- __u8 usable_addr;
+ __u8 pad;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 40836a9a7250..d485232f1e9f 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -1,221 +1,360 @@
#ifndef _UAPI_ASM_X86_SIGCONTEXT_H
#define _UAPI_ASM_X86_SIGCONTEXT_H
+/*
+ * Linux signal context definitions. The sigcontext includes a complex
+ * hierarchy of CPU and FPU state, available to user-space (on the stack) when
+ * a signal handler is executed.
+ *
+ * As over the years this ABI grew from its very simple roots towards
+ * supporting more and more CPU state organically, some of the details (which
+ * were rather clever hacks back in the days) became a bit quirky by today.
+ *
+ * The current ABI includes flexible provisions for future extensions, so we
+ * won't have to grow new quirks for quite some time. Promise!
+ */
+
#include <linux/compiler.h>
#include <linux/types.h>
-#define FP_XSTATE_MAGIC1 0x46505853U
-#define FP_XSTATE_MAGIC2 0x46505845U
-#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
/*
- * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame
- * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes
- * are used to extended the fpstate pointer in the sigcontext, which now
- * includes the extended state information along with fpstate information.
+ * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame
+ * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes are
+ * used to extend the fpstate pointer in the sigcontext, which now includes the
+ * extended state information along with fpstate information.
+ *
+ * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a
+ * sw_reserved.extended_size bytes large extended context area present. (The
+ * last 32-bit word of this extended area (at the
+ * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to
+ * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.)
*
- * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
- * area and FP_XSTATE_MAGIC2 at the end of memory layout
- * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
- * extended state information in the memory layout pointed by the fpstate
- * pointer in sigcontext.
+ * This extended area typically grows with newer CPUs that have larger and
+ * larger XSAVE areas.
*/
struct _fpx_sw_bytes {
- __u32 magic1; /* FP_XSTATE_MAGIC1 */
- __u32 extended_size; /* total size of the layout referred by
- * fpstate pointer in the sigcontext.
- */
- __u64 xfeatures;
- /* feature bit mask (including fp/sse/extended
- * state) that is present in the memory
- * layout.
- */
- __u32 xstate_size; /* actual xsave state size, based on the
- * features saved in the layout.
- * 'extended_size' will be greater than
- * 'xstate_size'.
- */
- __u32 padding[7]; /* for future use. */
+ /*
+ * If set to FP_XSTATE_MAGIC1 then this is an xstate context.
+ * 0 if a legacy frame.
+ */
+ __u32 magic1;
+
+ /*
+ * Total size of the fpstate area:
+ *
+ * - if magic1 == 0 then it's sizeof(struct _fpstate)
+ * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate)
+ * plus extensions (if any)
+ */
+ __u32 extended_size;
+
+ /*
+ * Feature bit mask (including FP/SSE/extended state) that is present
+ * in the memory layout:
+ */
+ __u64 xfeatures;
+
+ /*
+ * Actual XSAVE state size, based on the xfeatures saved in the layout.
+ * 'extended_size' is greater than 'xstate_size':
+ */
+ __u32 xstate_size;
+
+ /* For future use: */
+ __u32 padding[7];
};
-#ifdef __i386__
/*
- * As documented in the iBCS2 standard..
- *
- * The first part of "struct _fpstate" is just the normal i387
- * hardware setup, the extra "status" word is used to save the
- * coprocessor status word before entering the handler.
+ * As documented in the iBCS2 standard:
*
- * Pentium III FXSR, SSE support
- * Gareth Hughes <gareth@valinux.com>, May 2000
+ * The first part of "struct _fpstate" is just the normal i387 hardware setup,
+ * the extra "status" word is used to save the coprocessor status word before
+ * entering the handler.
*
- * The FPU state data structure has had to grow to accommodate the
- * extended FPU state required by the Streaming SIMD Extensions.
- * There is no documented standard to accomplish this at the moment.
+ * The FPU state data structure has had to grow to accommodate the extended FPU
+ * state required by the Streaming SIMD Extensions. There is no documented
+ * standard to accomplish this at the moment.
*/
+
+/* 10-byte legacy floating point register: */
struct _fpreg {
- unsigned short significand[4];
- unsigned short exponent;
+ __u16 significand[4];
+ __u16 exponent;
};
+/* 16-byte floating point register: */
struct _fpxreg {
- unsigned short significand[4];
- unsigned short exponent;
- unsigned short padding[3];
+ __u16 significand[4];
+ __u16 exponent;
+ __u16 padding[3];
};
+/* 16-byte XMM register: */
struct _xmmreg {
- unsigned long element[4];
+ __u32 element[4];
};
-struct _fpstate {
- /* Regular FPU environment */
- unsigned long cw;
- unsigned long sw;
- unsigned long tag;
- unsigned long ipoff;
- unsigned long cssel;
- unsigned long dataoff;
- unsigned long datasel;
- struct _fpreg _st[8];
- unsigned short status;
- unsigned short magic; /* 0xffff = regular FPU data only */
+#define X86_FXSR_MAGIC 0x0000
+
+/*
+ * The 32-bit FPU frame:
+ */
+struct _fpstate_32 {
+ /* Legacy FPU environment: */
+ __u32 cw;
+ __u32 sw;
+ __u32 tag;
+ __u32 ipoff;
+ __u32 cssel;
+ __u32 dataoff;
+ __u32 datasel;
+ struct _fpreg _st[8];
+ __u16 status;
+ __u16 magic; /* 0xffff: regular FPU data only */
+ /* 0x0000: FXSR FPU data */
/* FXSR FPU environment */
- unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
- unsigned long mxcsr;
- unsigned long reserved;
- struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
- struct _xmmreg _xmm[8];
- unsigned long padding1[44];
+ __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */
+ __u32 mxcsr;
+ __u32 reserved;
+ struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+ struct _xmmreg _xmm[8]; /* First 8 XMM registers */
+ union {
+ __u32 padding1[44]; /* Second 8 XMM registers plus padding */
+ __u32 padding[44]; /* Alias name for old user-space */
+ };
union {
- unsigned long padding2[12];
- struct _fpx_sw_bytes sw_reserved; /* represents the extended
- * state info */
+ __u32 padding2[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
};
};
-#define X86_FXSR_MAGIC 0x0000
-
-#ifndef __KERNEL__
/*
- * User-space might still rely on the old definition:
+ * The 64-bit FPU frame. (FXSAVE format and later)
+ *
+ * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is
+ * larger: 'struct _xstate'. Note that 'struct _xstate' embedds
+ * 'struct _fpstate' so that you can always assume the _fpstate portion
+ * exists so that you can check the magic value.
+ *
+ * Note2: Reserved fields may someday contain valuable data. Always
+ * save/restore them when you change signal frames.
*/
-struct sigcontext {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned long edi;
- unsigned long esi;
- unsigned long ebp;
- unsigned long esp;
- unsigned long ebx;
- unsigned long edx;
- unsigned long ecx;
- unsigned long eax;
- unsigned long trapno;
- unsigned long err;
- unsigned long eip;
- unsigned short cs, __csh;
- unsigned long eflags;
- unsigned long esp_at_signal;
- unsigned short ss, __ssh;
- struct _fpstate __user *fpstate;
- unsigned long oldmask;
- unsigned long cr2;
-};
-#endif /* !__KERNEL__ */
-
-#else /* __i386__ */
-
-/* FXSAVE frame */
-/* Note: reserved1/2 may someday contain valuable data. Always save/restore
- them when you change signal frames. */
-struct _fpstate {
- __u16 cwd;
- __u16 swd;
- __u16 twd; /* Note this is not the same as the
- 32bit/x87/FSAVE twd */
- __u16 fop;
- __u64 rip;
- __u64 rdp;
- __u32 mxcsr;
- __u32 mxcsr_mask;
- __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
- __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
- __u32 reserved2[12];
+struct _fpstate_64 {
+ __u16 cwd;
+ __u16 swd;
+ /* Note this is not the same as the 32-bit/x87/FSAVE twd: */
+ __u16 twd;
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8x FP registers, 16 bytes each */
+ __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */
+ __u32 reserved2[12];
union {
- __u32 reserved3[12];
- struct _fpx_sw_bytes sw_reserved; /* represents the extended
- * state information */
+ __u32 reserved3[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
};
};
-#ifndef __KERNEL__
-/*
- * User-space might still rely on the old definition:
- */
-struct sigcontext {
- __u64 r8;
- __u64 r9;
- __u64 r10;
- __u64 r11;
- __u64 r12;
- __u64 r13;
- __u64 r14;
- __u64 r15;
- __u64 rdi;
- __u64 rsi;
- __u64 rbp;
- __u64 rbx;
- __u64 rdx;
- __u64 rax;
- __u64 rcx;
- __u64 rsp;
- __u64 rip;
- __u64 eflags; /* RFLAGS */
- __u16 cs;
- __u16 gs;
- __u16 fs;
- __u16 __pad0;
- __u64 err;
- __u64 trapno;
- __u64 oldmask;
- __u64 cr2;
- struct _fpstate __user *fpstate; /* zero when no FPU context */
-#ifdef __ILP32__
- __u32 __fpstate_pad;
+#ifdef __i386__
+# define _fpstate _fpstate_32
+#else
+# define _fpstate _fpstate_64
#endif
- __u64 reserved1[8];
-};
-#endif /* !__KERNEL__ */
-
-#endif /* !__i386__ */
struct _header {
- __u64 xfeatures;
- __u64 reserved1[2];
- __u64 reserved2[5];
+ __u64 xfeatures;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
};
struct _ymmh_state {
- /* 16 * 16 bytes for each YMMH-reg */
- __u32 ymmh_space[64];
+ /* 16x YMM registers, 16 bytes each: */
+ __u32 ymmh_space[64];
};
/*
- * Extended state pointed by the fpstate pointer in the sigcontext.
- * In addition to the fpstate, information encoded in the xstate_hdr
- * indicates the presence of other extended state information
- * supported by the processor and OS.
+ * Extended state pointed to by sigcontext::fpstate.
+ *
+ * In addition to the fpstate, information encoded in _xstate::xstate_hdr
+ * indicates the presence of other extended state information supported
+ * by the CPU and kernel:
*/
struct _xstate {
- struct _fpstate fpstate;
- struct _header xstate_hdr;
- struct _ymmh_state ymmh;
- /* new processor state extensions go here */
+ struct _fpstate fpstate;
+ struct _header xstate_hdr;
+ struct _ymmh_state ymmh;
+ /* New processor state extensions go here: */
+};
+
+/*
+ * The 32-bit signal frame:
+ */
+struct sigcontext_32 {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 di;
+ __u32 si;
+ __u32 bp;
+ __u32 sp;
+ __u32 bx;
+ __u32 dx;
+ __u32 cx;
+ __u32 ax;
+ __u32 trapno;
+ __u32 err;
+ __u32 ip;
+ __u16 cs, __csh;
+ __u32 flags;
+ __u32 sp_at_signal;
+ __u16 ss, __ssh;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u32 fpstate; /* Zero when no FPU/extended context */
+ __u32 oldmask;
+ __u32 cr2;
+};
+
+/*
+ * The 64-bit signal frame:
+ */
+struct sigcontext_64 {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 di;
+ __u64 si;
+ __u64 bp;
+ __u64 bx;
+ __u64 dx;
+ __u64 ax;
+ __u64 cx;
+ __u64 sp;
+ __u64 ip;
+ __u64 flags;
+ __u16 cs;
+ __u16 gs;
+ __u16 fs;
+ __u16 __pad0;
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u64 fpstate; /* Zero when no FPU/extended context */
+ __u64 reserved1[8];
+};
+
+/*
+ * Create the real 'struct sigcontext' type:
+ */
+#ifdef __KERNEL__
+# ifdef __i386__
+# define sigcontext sigcontext_32
+# else
+# define sigcontext sigcontext_64
+# endif
+#endif
+
+/*
+ * The old user-space sigcontext definition, just in case user-space still
+ * relies on it. The kernel definition (in asm/sigcontext.h) has unified
+ * field names but otherwise the same layout.
+ */
+#ifndef __KERNEL__
+
+#define _fpstate_ia32 _fpstate_32
+#define sigcontext_ia32 sigcontext_32
+
+
+# ifdef __i386__
+struct sigcontext {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 edi;
+ __u32 esi;
+ __u32 ebp;
+ __u32 esp;
+ __u32 ebx;
+ __u32 edx;
+ __u32 ecx;
+ __u32 eax;
+ __u32 trapno;
+ __u32 err;
+ __u32 eip;
+ __u16 cs, __csh;
+ __u32 eflags;
+ __u32 esp_at_signal;
+ __u16 ss, __ssh;
+ struct _fpstate __user *fpstate;
+ __u32 oldmask;
+ __u32 cr2;
};
+# else /* __x86_64__: */
+struct sigcontext {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 rdi;
+ __u64 rsi;
+ __u64 rbp;
+ __u64 rbx;
+ __u64 rdx;
+ __u64 rax;
+ __u64 rcx;
+ __u64 rsp;
+ __u64 rip;
+ __u64 eflags; /* RFLAGS */
+ __u16 cs;
+ __u16 gs;
+ __u16 fs;
+ __u16 __pad0;
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+ struct _fpstate __user *fpstate; /* Zero when no FPU context */
+# ifdef __ILP32__
+ __u32 __fpstate_pad;
+# endif
+ __u64 reserved1[8];
+};
+# endif /* __x86_64__ */
+#endif /* !__KERNEL__ */
#endif /* _UAPI_ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h
index ad1478c4ae12..a92b0f0dc09e 100644
--- a/arch/x86/include/uapi/asm/sigcontext32.h
+++ b/arch/x86/include/uapi/asm/sigcontext32.h
@@ -1,77 +1,8 @@
#ifndef _ASM_X86_SIGCONTEXT32_H
#define _ASM_X86_SIGCONTEXT32_H
-#include <linux/types.h>
+/* This is a legacy file - all the type definitions are in sigcontext.h: */
-/* signal context for 32bit programs. */
-
-#define X86_FXSR_MAGIC 0x0000
-
-struct _fpreg {
- unsigned short significand[4];
- unsigned short exponent;
-};
-
-struct _fpxreg {
- unsigned short significand[4];
- unsigned short exponent;
- unsigned short padding[3];
-};
-
-struct _xmmreg {
- __u32 element[4];
-};
-
-/* FSAVE frame with extensions */
-struct _fpstate_ia32 {
- /* Regular FPU environment */
- __u32 cw;
- __u32 sw;
- __u32 tag; /* not compatible to 64bit twd */
- __u32 ipoff;
- __u32 cssel;
- __u32 dataoff;
- __u32 datasel;
- struct _fpreg _st[8];
- unsigned short status;
- unsigned short magic; /* 0xffff = regular FPU data only */
-
- /* FXSR FPU environment */
- __u32 _fxsr_env[6];
- __u32 mxcsr;
- __u32 reserved;
- struct _fpxreg _fxsr_st[8];
- struct _xmmreg _xmm[8]; /* It's actually 16 */
- __u32 padding[44];
- union {
- __u32 padding2[12];
- struct _fpx_sw_bytes sw_reserved;
- };
-};
-
-struct sigcontext_ia32 {
- unsigned short gs, __gsh;
- unsigned short fs, __fsh;
- unsigned short es, __esh;
- unsigned short ds, __dsh;
- unsigned int di;
- unsigned int si;
- unsigned int bp;
- unsigned int sp;
- unsigned int bx;
- unsigned int dx;
- unsigned int cx;
- unsigned int ax;
- unsigned int trapno;
- unsigned int err;
- unsigned int ip;
- unsigned short cs, __csh;
- unsigned int flags;
- unsigned int sp_at_signal;
- unsigned short ss, __ssh;
- unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
- unsigned int oldmask;
- unsigned int cr2;
-};
+#include <asm/sigcontext.h>
#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index b5d7640abc5d..8a4add8e4639 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -100,6 +100,7 @@
{ SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
{ SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
{ SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
{ SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
{ SVM_EXIT_INTR, "interrupt" }, \
{ SVM_EXIT_NMI, "nmi" }, \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..5b15d94a33f8 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -78,6 +78,7 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_PCOMMIT 65
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
- { EXIT_REASON_XRSTORS, "XRSTORS" }
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_PCOMMIT, "PCOMMIT" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ded848c20e05..e75907601a41 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
{
int count;
int x2count = 0;
+ int ret;
+ struct acpi_subtable_proc madt_proc[2];
if (!cpu_has_apic)
return -ENODEV;
@@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_LOCAL_APIC);
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_LOCAL_APIC);
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+
+ x2count = madt_proc[0].count;
+ count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24e94ce454e2..8a5cddac7d44 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map;
static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
/*
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
+ */
+static int apic_extnmi = APIC_EXTNMI_BSP;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void)
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
@@ -1378,9 +1386,11 @@ void setup_local_APIC(void)
apic_write(APIC_LVT0, value);
/*
- * only the BP should see the LINT1 NMI signal, obviously.
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
*/
- if (!cpu)
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1431,7 +1441,7 @@ enum {
};
static int x2apic_state;
-static inline void __x2apic_disable(void)
+static void __x2apic_disable(void)
{
u64 msr;
@@ -1447,7 +1457,7 @@ static inline void __x2apic_disable(void)
printk_once(KERN_INFO "x2apic disabled\n");
}
-static inline void __x2apic_enable(void)
+static void __x2apic_enable(void)
{
u64 msr;
@@ -1807,7 +1817,7 @@ int apic_version[MAX_LOCAL_APIC];
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-static inline void __smp_spurious_interrupt(u8 vector)
+static void __smp_spurious_interrupt(u8 vector)
{
u32 v;
@@ -1848,7 +1858,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-static inline void __smp_error_interrupt(struct pt_regs *regs)
+static void __smp_error_interrupt(struct pt_regs *regs)
{
u32 v;
u32 i = 0;
@@ -2270,6 +2280,7 @@ static struct {
unsigned int apic_tmict;
unsigned int apic_tdcr;
unsigned int apic_thmr;
+ unsigned int apic_cmci;
} apic_pm_state;
static int lapic_suspend(void)
@@ -2299,6 +2310,10 @@ static int lapic_suspend(void)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
local_irq_save(flags);
disable_local_APIC();
@@ -2355,10 +2370,14 @@ static void lapic_resume(void)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
if (maxlvt >= 4)
apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
@@ -2548,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg)
return 0;
}
early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f92ab36979a2..9968f30cca3e 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,6 +185,7 @@ static struct apic apic_flat = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
.send_IPI_allbutself = flat_send_IPI_allbutself,
@@ -230,17 +231,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- default_send_IPI_mask_sequence_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpumask, vector);
-}
-
static void physflat_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -248,7 +238,7 @@ static void physflat_send_IPI_allbutself(int vector)
static void physflat_send_IPI_all(int vector)
{
- physflat_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int physflat_probe(void)
@@ -292,8 +282,9 @@ static struct apic apic_physflat = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 0d96749cfcac..331a7a07c48f 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -30,6 +30,7 @@
#include <asm/e820.h>
static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
@@ -144,6 +145,7 @@ struct apic apic_noop = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
.send_IPI_allbutself = noop_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index b548fd3b764b..c80c02c6ec49 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -11,30 +11,21 @@
*
*/
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ctype.h>
#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <linux/delay.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/apic_flat_64.h>
#include <asm/pgtable.h>
+#include <asm/pci_x86.h>
-static int numachip_system __read_mostly;
+u8 numachip_system __read_mostly;
+static const struct apic apic_numachip1;
+static const struct apic apic_numachip2;
+static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
-static const struct apic apic_numachip;
-
-static unsigned int get_apic_id(unsigned long x)
+static unsigned int numachip1_get_apic_id(unsigned long x)
{
unsigned long value;
unsigned int id = (x >> 24) & 0xff;
@@ -47,7 +38,7 @@ static unsigned int get_apic_id(unsigned long x)
return id;
}
-static unsigned long set_apic_id(unsigned int id)
+static unsigned long numachip1_set_apic_id(unsigned int id)
{
unsigned long x;
@@ -55,9 +46,17 @@ static unsigned long set_apic_id(unsigned int id)
return x;
}
-static unsigned int read_xapic_id(void)
+static unsigned int numachip2_get_apic_id(unsigned long x)
+{
+ u64 mcfg;
+
+ rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
+}
+
+static unsigned long numachip2_set_apic_id(unsigned int id)
{
- return get_apic_id(apic_read(APIC_ID));
+ return id << 24;
}
static int numachip_apic_id_valid(int apicid)
@@ -68,7 +67,7 @@ static int numachip_apic_id_valid(int apicid)
static int numachip_apic_id_registered(void)
{
- return physid_isset(read_xapic_id(), phys_cpu_present_map);
+ return 1;
}
static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
@@ -76,36 +75,48 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
-static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static void numachip1_apic_icr_write(int apicid, unsigned int val)
{
- union numachip_csr_g3_ext_irq_gen int_gen;
-
- int_gen.s._destination_apic_id = phys_apicid;
- int_gen.s._vector = 0;
- int_gen.s._msgtype = APIC_DM_INIT >> 8;
- int_gen.s._index = 0;
-
- write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
+}
- int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
- int_gen.s._vector = start_rip >> 12;
+static void numachip2_apic_icr_write(int apicid, unsigned int val)
+{
+ numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
+}
- write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
+ numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
+ (start_rip >> 12));
return 0;
}
static void numachip_send_IPI_one(int cpu, int vector)
{
- union numachip_csr_g3_ext_irq_gen int_gen;
- int apicid = per_cpu(x86_cpu_to_apicid, cpu);
-
- int_gen.s._destination_apic_id = apicid;
- int_gen.s._vector = vector;
- int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
- int_gen.s._index = 0;
+ int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned int dmode;
+
+ preempt_disable();
+ local_apicid = __this_cpu_read(x86_cpu_to_apicid);
+
+ /* Send via local APIC where non-local part matches */
+ if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(apicid, vector,
+ APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+ preempt_enable();
+ return;
+ }
+ preempt_enable();
- write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+ dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED;
+ numachip_apic_icr_write(apicid, dmode | vector);
}
static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -149,9 +160,14 @@ static void numachip_send_IPI_self(int vector)
apic_write(APIC_SELF_IPI, vector);
}
-static int __init numachip_probe(void)
+static int __init numachip1_probe(void)
{
- return apic == &apic_numachip;
+ return apic == &apic_numachip1;
+}
+
+static int __init numachip2_probe(void)
+{
+ return apic == &apic_numachip2;
}
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
@@ -172,11 +188,19 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
static int __init numachip_system_init(void)
{
- if (!numachip_system)
+ /* Map the LCSR area and set up the apic_icr_write function */
+ switch (numachip_system) {
+ case 1:
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ numachip_apic_icr_write = numachip1_apic_icr_write;
+ break;
+ case 2:
+ init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
+ numachip_apic_icr_write = numachip2_apic_icr_write;
+ break;
+ default:
return 0;
-
- init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
- init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+ }
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
x86_init.pci.arch_init = pci_numachip_init;
@@ -185,21 +209,95 @@ static int __init numachip_system_init(void)
}
early_initcall(numachip_system_init);
-static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- if (!strncmp(oem_id, "NUMASC", 6)) {
- numachip_system = 1;
- return 1;
- }
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONNECT", 8) != 0))
+ return 0;
- return 0;
+ numachip_system = 1;
+
+ return 1;
+}
+
+static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONECT2", 8) != 0))
+ return 0;
+
+ numachip_system = 2;
+
+ return 1;
+}
+
+/* APIC IPIs are queued */
+static void numachip_apic_wait_icr_idle(void)
+{
}
-static const struct apic apic_numachip __refconst = {
+/* APIC NMI IPIs are queued */
+static u32 numachip_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+static const struct apic apic_numachip1 __refconst = {
.name = "NumaConnect system",
- .probe = numachip_probe,
- .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .probe = numachip1_probe,
+ .acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = online_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .vector_allocation_domain = default_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = numachip_phys_pkg_id,
+
+ .get_apic_id = numachip1_get_apic_id,
+ .set_apic_id = numachip1_set_apic_id,
+ .apic_id_mask = 0xffU << 24,
+
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
+};
+
+apic_driver(apic_numachip1);
+
+static const struct apic apic_numachip2 __refconst = {
+ .name = "NumaConnect2 system",
+ .probe = numachip2_probe,
+ .acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
@@ -221,12 +319,13 @@ static const struct apic apic_numachip __refconst = {
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = numachip_phys_pkg_id,
- .get_apic_id = get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = numachip2_get_apic_id,
+ .set_apic_id = numachip2_set_apic_id,
.apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = numachip_send_IPI_one,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
.send_IPI_allbutself = numachip_send_IPI_allbutself,
@@ -241,8 +340,8 @@ static const struct apic apic_numachip __refconst = {
.eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
};
-apic_driver(apic_numachip);
+apic_driver(apic_numachip2);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 971cf8875939..cf9bd896c12d 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
return cpuid_apic >> index_msb;
}
-static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
static void bigsmp_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector)
static void bigsmp_send_IPI_all(int vector)
{
- bigsmp_send_IPI_mask(cpu_online_mask, vector);
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
}
static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -180,7 +175,8 @@ static struct apic apic_bigsmp = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
- .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
.send_IPI_mask_allbutself = NULL,
.send_IPI_allbutself = bigsmp_send_IPI_allbutself,
.send_IPI_all = bigsmp_send_IPI_all,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4f2821527014..fdb0fbfb1197 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -529,7 +529,7 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
}
}
-void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
+static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
unsigned long flags;
struct irq_pin_list *entry;
@@ -2521,6 +2521,7 @@ void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
const struct cpumask *mask;
+ struct irq_desc *desc;
struct irq_data *idata;
struct irq_chip *chip;
@@ -2536,7 +2537,9 @@ void __init setup_ioapic_dest(void)
if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
continue;
- idata = irq_get_irq_data(irq);
+ desc = irq_to_desc(irq);
+ raw_spin_lock_irq(&desc->lock);
+ idata = irq_desc_get_irq_data(desc);
/*
* Honour affinities which have been set in early boot
@@ -2550,6 +2553,7 @@ void __init setup_ioapic_dest(void)
/* Might be lapic_chip for irq 0 */
if (chip->irq_set_affinity)
chip->irq_set_affinity(idata, mask, false);
+ raw_spin_unlock_irq(&desc->lock);
}
}
#endif
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 62071569bd50..eb45fc9b6124 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -18,6 +18,16 @@
#include <asm/proto.h>
#include <asm/ipi.h>
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+}
+
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long query_cpu;
@@ -55,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
local_irq_restore(flags);
}
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
+{
+ apic->send_IPI_mask(cpumask_of(cpu), vector);
+}
+
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 5f1feb6854af..ade25320df96 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
return arg->msi_hwirq;
}
-static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct msi_desc *desc = first_pci_msi_entry(pdev);
@@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
-static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
{
arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
}
+EXPORT_SYMBOL_GPL(pci_msi_set_desc);
static struct msi_domain_ops pci_msi_domain_ops = {
.get_hwirq = pci_msi_get_hwirq,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 7694ae6c1199..f316e34abb42 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,6 +105,7 @@ static struct apic apic_default = {
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 836d11b92811..3b670df4ba7b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -29,8 +29,9 @@ struct apic_chip_data {
};
struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask;
+static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask;
static struct irq_chip lapic_controller;
#ifdef CONFIG_X86_IO_APIC
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
return data ? &data->cfg : NULL;
}
+EXPORT_SYMBOL_GPL(irqd_cfg);
struct irq_cfg *irq_cfg(unsigned int irq)
{
@@ -116,35 +118,47 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
*/
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, err;
+ int cpu, vector;
- if (d->move_in_progress)
+ /*
+ * If there is still a move in progress or the previous move has not
+ * been cleaned up completely, tell the caller to come back later.
+ */
+ if (d->move_in_progress ||
+ cpumask_intersects(d->old_domain, cpu_online_mask))
return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
- err = -ENOSPC;
cpumask_clear(d->old_domain);
+ cpumask_clear(searched_cpumask);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
- int new_cpu, vector, offset;
+ int new_cpu, offset;
+ /* Get the possible target cpus for @mask/@cpu from the apic */
apic->vector_allocation_domain(cpu, vector_cpumask, mask);
+ /*
+ * Clear the offline cpus from @vector_cpumask for searching
+ * and verify whether the result overlaps with @mask. If true,
+ * then the call to apic->cpu_mask_to_apicid_and() will
+ * succeed as well. If not, no point in trying to find a
+ * vector in this mask.
+ */
+ cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask);
+ if (!cpumask_intersects(vector_searchmask, mask))
+ goto next_cpu;
+
if (cpumask_subset(vector_cpumask, d->domain)) {
- err = 0;
if (cpumask_equal(vector_cpumask, d->domain))
- break;
+ goto success;
/*
- * New cpumask using the vector is a proper subset of
- * the current in use mask. So cleanup the vector
- * allocation for the members that are not used anymore.
+ * Mark the cpus which are not longer in the mask for
+ * cleanup.
*/
- cpumask_andnot(d->old_domain, d->domain,
- vector_cpumask);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_and(d->domain, d->domain, vector_cpumask);
- break;
+ cpumask_andnot(d->old_domain, d->domain, vector_cpumask);
+ vector = d->cfg.vector;
+ goto update;
}
vector = current_vector;
@@ -156,45 +170,60 @@ next:
vector = FIRST_EXTERNAL_VECTOR + offset;
}
- if (unlikely(current_vector == vector)) {
- cpumask_or(d->old_domain, d->old_domain,
- vector_cpumask);
- cpumask_andnot(vector_cpumask, mask, d->old_domain);
- cpu = cpumask_first_and(vector_cpumask,
- cpu_online_mask);
- continue;
- }
+ /* If the search wrapped around, try the next cpu */
+ if (unlikely(current_vector == vector))
+ goto next_cpu;
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
+ for_each_cpu(new_cpu, vector_searchmask) {
if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (d->cfg.vector) {
+ /* Schedule the old vector for cleanup on all cpus */
+ if (d->cfg.vector)
cpumask_copy(d->old_domain, d->domain);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- }
- for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
+ for_each_cpu(new_cpu, vector_searchmask)
per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
- d->cfg.vector = vector;
- cpumask_copy(d->domain, vector_cpumask);
- err = 0;
- break;
- }
+ goto update;
- if (!err) {
- /* cache destination APIC IDs into cfg->dest_apicid */
- err = apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid);
+next_cpu:
+ /*
+ * We exclude the current @vector_cpumask from the requested
+ * @mask and try again with the next online cpu in the
+ * result. We cannot modify @mask, so we use @vector_cpumask
+ * as a temporary buffer here as it will be reassigned when
+ * calling apic->vector_allocation_domain() above.
+ */
+ cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, searched_cpumask);
+ cpu = cpumask_first_and(vector_cpumask, cpu_online_mask);
+ continue;
}
+ return -ENOSPC;
- return err;
+update:
+ /*
+ * Exclude offline cpus from the cleanup mask and set the
+ * move_in_progress flag when the result is not empty.
+ */
+ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+ d->move_in_progress = !cpumask_empty(d->old_domain);
+ d->cfg.vector = vector;
+ cpumask_copy(d->domain, vector_cpumask);
+success:
+ /*
+ * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail
+ * as we already established, that mask & d->domain & cpu_online_mask
+ * is not empty.
+ */
+ BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain,
+ &d->cfg.dest_apicid));
+ return 0;
}
static int assign_irq_vector(int irq, struct apic_chip_data *data,
@@ -224,10 +253,8 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
struct irq_desc *desc;
- unsigned long flags;
int cpu, vector;
- raw_spin_lock_irqsave(&vector_lock, flags);
BUG_ON(!data->cfg.vector);
vector = data->cfg.vector;
@@ -237,10 +264,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress)) {
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ /*
+ * If move is in progress or the old_domain mask is not empty,
+ * i.e. the cleanup IPI has not been processed yet, we need to remove
+ * the old references to desc from all cpus vector tables.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain))
return;
- }
desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
@@ -253,7 +283,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
}
}
data->move_in_progress = 0;
- raw_spin_unlock_irqrestore(&vector_lock, flags);
}
void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -274,19 +303,24 @@ void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
static void x86_vector_free_irqs(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
+ struct apic_chip_data *apic_data;
struct irq_data *irq_data;
+ unsigned long flags;
int i;
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
clear_irq_vector(virq + i, irq_data->chip_data);
- free_apic_chip_data(irq_data->chip_data);
+ apic_data = irq_data->chip_data;
+ irq_domain_reset_irq_data(irq_data);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_apic_chip_data(apic_data);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs())
legacy_irq_data[virq + i] = NULL;
#endif
- irq_domain_reset_irq_data(irq_data);
}
}
}
@@ -361,7 +395,11 @@ int __init arch_probe_nr_irqs(void)
if (nr < nr_irqs)
nr_irqs = nr;
- return nr_legacy_irqs();
+ /*
+ * We don't know if PIC is present at this point so we need to do
+ * probe() to get the right number of legacy IRQs.
+ */
+ return legacy_pic->probe();
}
#ifdef CONFIG_X86_IO_APIC
@@ -400,6 +438,8 @@ int __init arch_early_irq_init(void)
arch_init_htirq_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL));
return arch_early_ioapic_init();
}
@@ -488,14 +528,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
return -EINVAL;
err = assign_irq_vector(irq, data, dest);
- if (err) {
- if (assign_irq_vector(irq, data,
- irq_data_get_affinity_mask(irq_data)))
- pr_err("Failed to recover vector for irq %d\n", irq);
- return err;
- }
-
- return IRQ_SET_MASK_OK;
+ return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
@@ -507,20 +540,12 @@ static struct irq_chip lapic_controller = {
#ifdef CONFIG_SMP
static void __send_cleanup_vector(struct apic_chip_data *data)
{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
-
- for_each_cpu_and(i, data->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i),
- IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
+ raw_spin_lock(&vector_lock);
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
data->move_in_progress = 0;
+ if (!cpumask_empty(data->old_domain))
+ apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+ raw_spin_unlock(&vector_lock);
}
void send_cleanup_vector(struct irq_cfg *cfg)
@@ -564,12 +589,25 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
/*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
+ * Nothing to cleanup if irq migration is in progress
+ * or this cpu is not set in the cleanup mask.
*/
- if (data->move_in_progress)
+ if (data->move_in_progress ||
+ !cpumask_test_cpu(me, data->old_domain))
goto unlock;
+ /*
+ * We have two cases to handle here:
+ * 1) vector is unchanged but the target mask got reduced
+ * 2) vector and the target mask has changed
+ *
+ * #1 is obvious, but in #2 we have two vectors with the same
+ * irq descriptor: the old and the new vector. So we need to
+ * make sure that we only cleanup the old vector. The new
+ * vector has the current @vector number in the config and
+ * this cpu is part of the target mask. We better leave that
+ * one alone.
+ */
if (vector == data->cfg.vector &&
cpumask_test_cpu(me, data->domain))
goto unlock;
@@ -587,6 +625,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
}
@@ -615,12 +654,48 @@ void irq_complete_move(struct irq_cfg *cfg)
__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}
-void irq_force_complete_move(int irq)
+/*
+ * Called with @desc->lock held and interrupts disabled.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_data *irqdata = irq_desc_get_irq_data(desc);
+ struct apic_chip_data *data = apic_chip_data(irqdata);
+ struct irq_cfg *cfg = data ? &data->cfg : NULL;
- if (cfg)
- __irq_complete_move(cfg, cfg->vector);
+ if (!cfg)
+ return;
+
+ __irq_complete_move(cfg, cfg->vector);
+
+ /*
+ * This is tricky. If the cleanup of @data->old_domain has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * The cleanup cannot make progress because we hold @desc->lock. So in
+ * case @data->old_domain is not yet cleaned up, we need to drop the
+ * lock and acquire it again. @desc cannot go away, because the
+ * hotplug code holds the sparse irq lock.
+ */
+ raw_spin_lock(&vector_lock);
+ /* Clean out all offline cpus (including ourself) first. */
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
+ while (!cpumask_empty(data->old_domain)) {
+ raw_spin_unlock(&vector_lock);
+ raw_spin_unlock(&desc->lock);
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ /*
+ * Reevaluate apic_chip_data. It might have been cleared after
+ * we dropped @desc->lock.
+ */
+ data = apic_chip_data(irqdata);
+ if (!data)
+ return;
+ raw_spin_lock(&vector_lock);
+ }
+ raw_spin_unlock(&vector_lock);
}
#endif
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cc8311c4d298..aca8b75c1552 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu)
return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = {
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 662e9150ea6f..a1242e2c12e6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
+}
+
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
@@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = {
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 4a139465f1d4..624db00583f4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
@@ -888,7 +889,10 @@ void __init uv_system_init(void)
return;
}
pr_info("UV: Found %s hub\n", hub);
- map_low_mmrs();
+
+ /* We now only need to map the MMRs on UV1 */
+ if (is_uv1_hub())
+ map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 8e3d22a1af94..84a7524b202c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -43,18 +43,15 @@ void common(void) {
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
BLANK();
- OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax);
- OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx);
- OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx);
- OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx);
- OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si);
- OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di);
- OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp);
- OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp);
- OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip);
-
- BLANK();
- OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+ OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax);
+ OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx);
+ OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx);
+ OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx);
+ OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si);
+ OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di);
+ OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp);
+ OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp);
+ OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip);
BLANK();
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
@@ -68,9 +65,6 @@ void common(void) {
OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-#ifdef CONFIG_X86_32
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
-#endif
OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d8f42f902a0f..f2edafb5f24e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -23,7 +23,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
- OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4eb065c6bed2..58031303e304 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4a70fc6d400a..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -304,7 +304,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
int cpu = smp_processor_id();
/* get information required for multi-node processors */
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
+ unsigned int socket_id, core_complex_id;
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
@@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
+
+ /*
+ * Fix percpu cpu_llc_id here as LLC topology is different
+ * for Fam17h systems.
+ */
+ if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
+ return;
+
+ socket_id = (c->apicid >> bits) - 1;
+ core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3;
+
+ per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id;
#endif
}
@@ -421,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
*/
int ht_nodeid = c->initial_apicid;
- if (ht_nodeid >= 0 &&
- __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
@@ -665,9 +677,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
- if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
+ if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
value |= 0x1E;
- wrmsrl_safe(0xc0011021, value);
+ wrmsrl_safe(MSR_F15H_IC_CFG, value);
}
}
}
@@ -909,7 +921,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
void set_dr_addr_mask(unsigned long mask, int dr)
{
- if (!cpu_has_bpext)
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
return;
switch (dr) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index d8fba5c15fbd..ae20be6e483c 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -43,7 +43,7 @@ static void init_c3(struct cpuinfo_x86 *c)
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
- c->x86_capability[5] = cpuid_edx(0xC0000001);
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index de22ea7ff82f..37830de8f60a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap);
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
- unsigned long eflags;
+ unsigned long eflags = native_save_fl();
/* This should have been cleared long ago */
- raw_local_save_flags(eflags);
BUG_ON(eflags & X86_EFLAGS_AC);
if (cpu_has(c, X86_FEATURE_SMAP)) {
@@ -582,14 +581,9 @@ void cpu_detect(struct cpuinfo_x86 *c)
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
-
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xf) << 4;
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_mask = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -600,50 +594,47 @@ void cpu_detect(struct cpuinfo_x86 *c)
void get_cpu_cap(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- u32 ebx;
+ u32 eax, ebx, ecx, edx;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
}
/* Additional Intel-defined flags: level 0x00000007 */
if (c->cpuid_level >= 0x00000007) {
- u32 eax, ebx, ecx, edx;
-
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[9] = ebx;
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
}
/* Extended state features: level 0x0000000d */
if (c->cpuid_level >= 0x0000000d) {
- u32 eax, ebx, ecx, edx;
-
cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[10] = eax;
+ c->x86_capability[CPUID_D_1_EAX] = eax;
}
/* Additional Intel-defined flags: level 0x0000000F */
if (c->cpuid_level >= 0x0000000F) {
- u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=0 */
cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[11] = edx;
+ c->x86_capability[CPUID_F_0_EDX] = edx;
+
if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = ebx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[12] = edx;
+ c->x86_capability[CPUID_F_1_EDX] = edx;
+
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
@@ -655,21 +646,24 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
}
/* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = eax;
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
+ if ((eax & 0xffff0000) == 0x80000000) {
+ if (eax >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
}
if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
}
#ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -679,6 +673,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
init_scattered_cpuid_features(c);
}
@@ -1185,7 +1182,7 @@ void syscall_init(void)
* They both write to the same internal register. STAR allows to
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
@@ -1443,7 +1440,9 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
- if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
+ if (cpu_feature_enabled(X86_FEATURE_VME) ||
+ cpu_has_tsc ||
+ boot_cpu_has(X86_FEATURE_DE))
cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 98a13db5f4be..565648bc1a0a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x27: /* Penwell */
case 0x35: /* Cloverview */
+ case 0x4a: /* Merrifield */
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
@@ -444,7 +445,8 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (cpu_has_ds) {
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
unsigned int l1;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index be4febc58b94..0b6c52388cf4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -157,7 +157,7 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-unsigned short num_cache_leaves;
+static unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
information to the user. This makes some assumptions about the machine:
@@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
*
* @returns: the disabled index if used or negative value if slot free.
*/
-int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
{
unsigned int reg = 0;
@@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
*
* @return: 0 on success, error status on failure
*/
-int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
- unsigned long index)
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned slot, unsigned long index)
{
int ret = 0;
@@ -591,7 +591,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
unsigned edx;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (cpu_has_topoext)
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
cpuid_count(0x8000001d, index, &eax.full,
&ebx.full, &ecx.full, &edx);
else
@@ -637,7 +637,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
num_cache_leaves = find_num_cache_leaves(c);
} else if (c->extended_cpuid_level >= 0x80000006) {
if (cpuid_edx(0x80000006) & 0xf000)
@@ -809,7 +809,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct cacheinfo *this_leaf;
int i, sibling;
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
this_leaf = this_cpu_ci->info_list + index;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9d014b82a124..a006f4cd792b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -114,7 +114,6 @@ static struct work_struct mce_work;
static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
@@ -475,6 +474,28 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&mce_irq_work);
}
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granuality for now.
+ */
+static int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+ return 0;
+
+ /* Checks after this one are Intel-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 1;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return 0;
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return 0;
+ return 1;
+}
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -484,7 +505,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
if (!mce)
return NOTIFY_DONE;
- if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+ if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
memory_failure(pfn, MCE_VECTOR, 0);
}
@@ -522,10 +543,10 @@ static bool memory_error(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
- /*
- * coming soon
- */
- return false;
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -567,7 +588,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
- bool error_logged = false;
+ bool error_seen = false;
struct mce m;
int severity;
int i;
@@ -601,6 +622,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
(m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
+ error_seen = true;
+
mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
@@ -608,27 +631,24 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- /*
- * In the cases where we don't have a valid address after all,
- * do not add it into the ring buffer.
- */
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
- if (m.status & MCI_STATUS_ADDRV) {
+ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
-
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
- }
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
- error_logged = true;
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
+ else if (mce_usable_address(&m)) {
+ /*
+ * Although we skipped logging this, we still want
+ * to take action. Add to the pool so the registered
+ * notifiers will see it.
+ */
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
}
/*
@@ -644,7 +664,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
sync_core();
- return error_logged;
+ return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -931,23 +951,6 @@ reset:
return ret;
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
- return 1;
-}
-
static void mce_clear_state(unsigned long *toclear)
{
int i;
@@ -999,6 +1002,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
+ /* If this CPU is offline, just bail out. */
+ if (cpu_is_offline(smp_processor_id())) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return;
+ }
+ }
+
ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1089,7 +1103,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/* assuming valid severity level != 0 */
m.severity = severity;
- m.usable_addr = mce_usable_address(&m);
mce_log(&m);
@@ -1586,6 +1599,8 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
winchip_mcheck_init(c);
return 1;
break;
+ default:
+ return 0;
}
return 0;
@@ -1605,6 +1620,8 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_amd_feature_init(c);
mce_flags.overflow_recov = !!(ebx & BIT(0));
mce_flags.succor = !!(ebx & BIT(1));
+ mce_flags.smca = !!(ebx & BIT(3));
+
break;
}
@@ -2042,7 +2059,7 @@ int __init mcheck_init(void)
* Disable machine checks on suspend and shutdown. We can't really handle
* them later.
*/
-static int mce_disable_error_reporting(void)
+static void mce_disable_error_reporting(void)
{
int i;
@@ -2052,17 +2069,32 @@ static int mce_disable_error_reporting(void)
if (b->init)
wrmsrl(MSR_IA32_MCx_CTL(i), 0);
}
- return 0;
+ return;
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+ * Disabling them for just a single offlined CPU is bad, since it will
+ * inhibit reporting for all shared resources on the socket like the
+ * last level cache (LLC), the integrated memory controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return;
+
+ mce_disable_error_reporting();
}
static int mce_syscore_suspend(void)
{
- return mce_disable_error_reporting();
+ vendor_disable_error_reporting();
+ return 0;
}
static void mce_syscore_shutdown(void)
{
- mce_disable_error_reporting();
+ vendor_disable_error_reporting();
}
/*
@@ -2342,19 +2374,14 @@ static void mce_device_remove(unsigned int cpu)
static void mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
- int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
+ vendor_disable_error_reporting();
}
static void mce_reenable_cpu(void *h)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1af51b1586d7..2c5aaf8c2e2f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -503,14 +503,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
- /* Check whether a vector already exists */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
/* early Pentium M models use different method for enabling TM2 */
if (cpu_has(c, X86_FEATURE_TM2)) {
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 285c85427c32..220b1a508513 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -2,6 +2,3 @@ microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
-obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 12829c3ced3c..2233f8a76615 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -1,5 +1,9 @@
/*
* AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
+ *
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
@@ -7,34 +11,31 @@
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
- * Maintainers:
- * Andreas Herrmann <herrmann.der.user@googlemail.com>
- * Borislav Petkov <bp@alien8.de>
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
*
- * This driver allows to upgrade microcode on F10h AMD
- * CPUs and later.
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
+#define pr_fmt(fmt) "microcode: " fmt
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
+#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/initrd.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/pci.h>
+#include <asm/microcode_amd.h>
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
#include <asm/msr.h>
-#include <asm/microcode_amd.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
static struct equiv_cpu_entry *equiv_cpu_table;
@@ -47,6 +48,432 @@ struct ucode_patch {
static LIST_HEAD(pcache);
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
+static u32 ucode_new_rev;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+static struct cpio_data ucode_cpio;
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio format.
+ * See Documentation/x86/early-microcode.txt
+ */
+static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+
+static struct cpio_data __init find_ucode_in_initrd(void)
+{
+ long offset = 0;
+ char *path;
+ void *start;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *p;
+
+ /*
+ * On 32-bit, early load occurs before paging is turned on so we need
+ * to use physical addresses.
+ */
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ path = (char *)__pa_nodebug(ucode_path);
+ start = (void *)p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
+#else
+ path = ucode_path;
+ start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
+ size = boot_params.hdr.ramdisk_size;
+#endif
+
+ return find_cpio_data(path, start, size, &offset);
+}
+
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+ size_t size = 0;
+ u32 *header = (u32 *)data;
+
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return size;
+
+ size = header[2] + CONTAINER_HDR_SZ;
+ total_size -= size;
+ data += size;
+
+ while (total_size) {
+ u16 patch_size;
+
+ header = (u32 *)data;
+
+ if (header[0] != UCODE_UCODE_TYPE)
+ break;
+
+ /*
+ * Sanity-check patch size.
+ */
+ patch_size = header[1];
+ if (patch_size > PATCH_MAX_SIZE)
+ break;
+
+ size += patch_size + SECTION_HDR_SIZE;
+ data += patch_size + SECTION_HDR_SIZE;
+ total_size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+ return size;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+{
+ struct equiv_cpu_entry *eq;
+ size_t *cont_sz;
+ u32 *header;
+ u8 *data, **cont;
+ u8 (*patch)[PATCH_MAX_SIZE];
+ u16 eq_id = 0;
+ int offset, left;
+ u32 rev, eax, ebx, ecx, edx;
+ u32 *new_rev;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ cont_sz = (size_t *)__pa_nodebug(&container_size);
+ cont = (u8 **)__pa_nodebug(&container);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+ new_rev = &ucode_new_rev;
+ cont_sz = &container_size;
+ cont = &container;
+ patch = &amd_ucode_patch;
+#endif
+
+ data = ucode;
+ left = size;
+ header = (u32 *)data;
+
+ /* find equiv cpu table */
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return;
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ while (left > 0) {
+ eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
+
+ *cont = data;
+
+ /* Advance past the container header */
+ offset = header[2] + CONTAINER_HDR_SZ;
+ data += offset;
+ left -= offset;
+
+ eq_id = find_equiv_id(eq, eax);
+ if (eq_id) {
+ this_equiv_id = eq_id;
+ *cont_sz = compute_container_size(*cont, left + offset);
+
+ /*
+ * truncate how much we need to iterate over in the
+ * ucode update loop below
+ */
+ left = *cont_sz - offset;
+ break;
+ }
+
+ /*
+ * support multiple container files appended together. if this
+ * one does not have a matching equivalent cpu entry, we fast
+ * forward to the next container file.
+ */
+ while (left > 0) {
+ header = (u32 *)data;
+ if (header[0] == UCODE_MAGIC &&
+ header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
+ break;
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+
+ /* mark where the next microcode container file starts */
+ offset = data - (u8 *)ucode;
+ ucode = data;
+ }
+
+ if (!eq_id) {
+ *cont = NULL;
+ *cont_sz = 0;
+ return;
+ }
+
+ if (check_current_patch_level(&rev, true))
+ return;
+
+ while (left > 0) {
+ struct microcode_amd *mc;
+
+ header = (u32 *)data;
+ if (header[0] != UCODE_UCODE_TYPE || /* type */
+ header[1] == 0) /* size */
+ break;
+
+ mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
+
+ if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+ if (!__apply_microcode_amd(mc)) {
+ rev = mc->hdr.patch_id;
+ *new_rev = rev;
+
+ if (save_patch)
+ memcpy(patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
+ }
+ }
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+}
+
+static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
+ unsigned int family)
+{
+#ifdef CONFIG_X86_64
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+ return get_builtin_firmware(cp, fw_name);
+#else
+ return false;
+#endif
+}
+
+void __init load_ucode_amd_bsp(unsigned int family)
+{
+ struct cpio_data cp;
+ void **data;
+ size_t *size;
+
+#ifdef CONFIG_X86_32
+ data = (void **)__pa_nodebug(&ucode_cpio.data);
+ size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+ data = &ucode_cpio.data;
+ size = &ucode_cpio.size;
+#endif
+
+ cp = find_ucode_in_initrd();
+ if (!cp.data) {
+ if (!load_builtin_amd_microcode(&cp, family))
+ return;
+ }
+
+ *data = cp.data;
+ *size = cp.size;
+
+ apply_ucode_in_initrd(cp.data, cp.size, true);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit, since AP's early load occurs before paging is turned on, we
+ * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
+ * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
+ */
+void load_ucode_amd_ap(void)
+{
+ struct microcode_amd *mc;
+ size_t *usize;
+ void **ucode;
+
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+ if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
+ __apply_microcode_amd(mc);
+ return;
+ }
+
+ ucode = (void *)__pa_nodebug(&container);
+ usize = (size_t *)__pa_nodebug(&container_size);
+
+ if (!*ucode || !*usize)
+ return;
+
+ apply_ucode_in_initrd(*ucode, *usize, false);
+}
+
+static void __init collect_cpu_sig_on_bsp(void *arg)
+{
+ unsigned int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->cpu_sig.sig = cpuid_eax(0x00000001);
+}
+
+static void __init get_bsp_sig(void)
+{
+ unsigned int bsp = boot_cpu_data.cpu_index;
+ struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+
+ if (!uci->cpu_sig.sig)
+ smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+}
+#else
+void load_ucode_amd_ap(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct equiv_cpu_entry *eq;
+ struct microcode_amd *mc;
+ u32 rev, eax;
+ u16 eq_id;
+
+ /* Exit if called on the BSP. */
+ if (!cpu)
+ return;
+
+ if (!container)
+ return;
+
+ /*
+ * 64-bit runs with paging enabled, thus early==false.
+ */
+ if (check_current_patch_level(&rev, false))
+ return;
+
+ eax = cpuid_eax(0x00000001);
+ eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+
+ eq_id = find_equiv_id(eq, eax);
+ if (!eq_id)
+ return;
+
+ if (eq_id == this_equiv_id) {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc))
+ ucode_new_rev = mc->hdr.patch_id;
+ }
+
+ } else {
+ if (!ucode_cpio.data)
+ return;
+
+ /*
+ * AP has a different equivalence ID than BSP, looks like
+ * mixed-steppings silicon so go through the ucode blob anew.
+ */
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
+ }
+}
+#endif
+
+int __init save_microcode_in_initrd_amd(void)
+{
+ unsigned long cont;
+ int retval = 0;
+ enum ucode_state ret;
+ u8 *cont_va;
+ u32 eax;
+
+ if (!container)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ get_bsp_sig();
+ cont = (unsigned long)container;
+ cont_va = __va(container);
+#else
+ /*
+ * We need the physical address of the container for both bitness since
+ * boot_params.hdr.ramdisk_image is a physical address.
+ */
+ cont = __pa(container);
+ cont_va = container;
+#endif
+
+ /*
+ * Take into account the fact that the ramdisk might get relocated and
+ * therefore we need to recompute the container's position in virtual
+ * memory space.
+ */
+ if (relocated_ramdisk)
+ container = (u8 *)(__va(relocated_ramdisk) +
+ (cont - boot_params.hdr.ramdisk_image));
+ else
+ container = cont_va;
+
+ if (ucode_new_rev)
+ pr_info("microcode: updated early to new patch_level=0x%08x\n",
+ ucode_new_rev);
+
+ eax = cpuid_eax(0x00000001);
+ eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+ ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+ if (ret != UCODE_OK)
+ retval = -EINVAL;
+
+ /*
+ * This will be freed any msec now, stash patches for the current
+ * family and switch to patch cache for cpu hotplug, etc later.
+ */
+ container = NULL;
+ container_size = 0;
+
+ return retval;
+}
+
+void reload_ucode_amd(void)
+{
+ struct microcode_amd *mc;
+ u32 rev;
+
+ /*
+ * early==false because this is a syscore ->resume path and by
+ * that time paging is long enabled.
+ */
+ if (check_current_patch_level(&rev, false))
+ return;
+
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ ucode_new_rev = mc->hdr.patch_id;
+ pr_info("microcode: reload patch_level=0x%08x\n",
+ ucode_new_rev);
+ }
+ }
+}
static u16 __find_equiv_id(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -177,6 +604,53 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
return patch_size;
}
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * @rev: Use it to return the patch level. It is set to 0 in the case of
+ * error.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+bool check_current_patch_level(u32 *rev, bool early)
+{
+ u32 lvl, dummy, i;
+ bool ret = false;
+ u32 *levels;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ if (IS_ENABLED(CONFIG_X86_32) && early)
+ levels = (u32 *)__pa_nodebug(&final_levels);
+ else
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i]) {
+ lvl = 0;
+ ret = true;
+ break;
+ }
+ }
+
+ if (rev)
+ *rev = lvl;
+
+ return ret;
+}
+
int __apply_microcode_amd(struct microcode_amd *mc_amd)
{
u32 rev, dummy;
@@ -197,7 +671,7 @@ int apply_microcode_amd(int cpu)
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
- u32 rev, dummy;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -210,7 +684,8 @@ int apply_microcode_amd(int cpu)
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (check_current_patch_level(&rev, false))
+ return -1;
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
@@ -387,7 +862,7 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s
if (ret != UCODE_OK)
cleanup();
-#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
+#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
struct ucode_patch *p = find_patch(cpu);
@@ -475,7 +950,7 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
deleted file mode 100644
index e8a215a9a345..000000000000
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- * Fixes: Borislav Petkov <bp@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-
-#include <asm/cpu.h>
-#include <asm/setup.h>
-#include <asm/microcode_amd.h>
-
-/*
- * This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
- */
-static u8 *container;
-static size_t container_size;
-
-static u32 ucode_new_rev;
-u8 amd_ucode_patch[PATCH_MAX_SIZE];
-static u16 this_equiv_id;
-
-static struct cpio_data ucode_cpio;
-
-/*
- * Microcode patch container file is prepended to the initrd in cpio format.
- * See Documentation/x86/early-microcode.txt
- */
-static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
- long offset = 0;
- char *path;
- void *start;
- size_t size;
-
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- /*
- * On 32-bit, early load occurs before paging is turned on so we need
- * to use physical addresses.
- */
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- path = (char *)__pa_nodebug(ucode_path);
- start = (void *)p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-#else
- path = ucode_path;
- start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
- size = boot_params.hdr.ramdisk_size;
-#endif
-
- return find_cpio_data(path, start, size, &offset);
-}
-
-static size_t compute_container_size(u8 *data, u32 total_size)
-{
- size_t size = 0;
- u32 *header = (u32 *)data;
-
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return size;
-
- size = header[2] + CONTAINER_HDR_SZ;
- total_size -= size;
- data += size;
-
- while (total_size) {
- u16 patch_size;
-
- header = (u32 *)data;
-
- if (header[0] != UCODE_UCODE_TYPE)
- break;
-
- /*
- * Sanity-check patch size.
- */
- patch_size = header[1];
- if (patch_size > PATCH_MAX_SIZE)
- break;
-
- size += patch_size + SECTION_HDR_SIZE;
- data += patch_size + SECTION_HDR_SIZE;
- total_size -= patch_size + SECTION_HDR_SIZE;
- }
-
- return size;
-}
-
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- */
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
-{
- struct equiv_cpu_entry *eq;
- size_t *cont_sz;
- u32 *header;
- u8 *data, **cont;
- u8 (*patch)[PATCH_MAX_SIZE];
- u16 eq_id = 0;
- int offset, left;
- u32 rev, eax, ebx, ecx, edx;
- u32 *new_rev;
-
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- cont_sz = (size_t *)__pa_nodebug(&container_size);
- cont = (u8 **)__pa_nodebug(&container);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
- new_rev = &ucode_new_rev;
- cont_sz = &container_size;
- cont = &container;
- patch = &amd_ucode_patch;
-#endif
-
- data = ucode;
- left = size;
- header = (u32 *)data;
-
- /* find equiv cpu table */
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return;
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- while (left > 0) {
- eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
-
- *cont = data;
-
- /* Advance past the container header */
- offset = header[2] + CONTAINER_HDR_SZ;
- data += offset;
- left -= offset;
-
- eq_id = find_equiv_id(eq, eax);
- if (eq_id) {
- this_equiv_id = eq_id;
- *cont_sz = compute_container_size(*cont, left + offset);
-
- /*
- * truncate how much we need to iterate over in the
- * ucode update loop below
- */
- left = *cont_sz - offset;
- break;
- }
-
- /*
- * support multiple container files appended together. if this
- * one does not have a matching equivalent cpu entry, we fast
- * forward to the next container file.
- */
- while (left > 0) {
- header = (u32 *)data;
- if (header[0] == UCODE_MAGIC &&
- header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
- break;
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
-
- /* mark where the next microcode container file starts */
- offset = data - (u8 *)ucode;
- ucode = data;
- }
-
- if (!eq_id) {
- *cont = NULL;
- *cont_sz = 0;
- return;
- }
-
- /* find ucode and update if needed */
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- while (left > 0) {
- struct microcode_amd *mc;
-
- header = (u32 *)data;
- if (header[0] != UCODE_UCODE_TYPE || /* type */
- header[1] == 0) /* size */
- break;
-
- mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-
- if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
-
- if (!__apply_microcode_amd(mc)) {
- rev = mc->hdr.patch_id;
- *new_rev = rev;
-
- if (save_patch)
- memcpy(patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
- }
- }
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
-}
-
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
- unsigned int family)
-{
-#ifdef CONFIG_X86_64
- char fw_name[36] = "amd-ucode/microcode_amd.bin";
-
- if (family >= 0x15)
- snprintf(fw_name, sizeof(fw_name),
- "amd-ucode/microcode_amd_fam%.2xh.bin", family);
-
- return get_builtin_firmware(cp, fw_name);
-#else
- return false;
-#endif
-}
-
-void __init load_ucode_amd_bsp(unsigned int family)
-{
- struct cpio_data cp;
- void **data;
- size_t *size;
-
-#ifdef CONFIG_X86_32
- data = (void **)__pa_nodebug(&ucode_cpio.data);
- size = (size_t *)__pa_nodebug(&ucode_cpio.size);
-#else
- data = &ucode_cpio.data;
- size = &ucode_cpio.size;
-#endif
-
- cp = find_ucode_in_initrd();
- if (!cp.data) {
- if (!load_builtin_amd_microcode(&cp, family))
- return;
- }
-
- *data = cp.data;
- *size = cp.size;
-
- apply_ucode_in_initrd(cp.data, cp.size, true);
-}
-
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
- * which is used upon resume from suspend.
- */
-void load_ucode_amd_ap(void)
-{
- struct microcode_amd *mc;
- size_t *usize;
- void **ucode;
-
- mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
- if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
- __apply_microcode_amd(mc);
- return;
- }
-
- ucode = (void *)__pa_nodebug(&container);
- usize = (size_t *)__pa_nodebug(&container_size);
-
- if (!*ucode || !*usize)
- return;
-
- apply_ucode_in_initrd(*ucode, *usize, false);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
-}
-#else
-void load_ucode_amd_ap(void)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct equiv_cpu_entry *eq;
- struct microcode_amd *mc;
- u32 rev, eax;
- u16 eq_id;
-
- /* Exit if called on the BSP. */
- if (!cpu)
- return;
-
- if (!container)
- return;
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- uci->cpu_sig.rev = rev;
- uci->cpu_sig.sig = eax;
-
- eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
-
- eq_id = find_equiv_id(eq, eax);
- if (!eq_id)
- return;
-
- if (eq_id == this_equiv_id) {
- mc = (struct microcode_amd *)amd_ucode_patch;
-
- if (mc && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc))
- ucode_new_rev = mc->hdr.patch_id;
- }
-
- } else {
- if (!ucode_cpio.data)
- return;
-
- /*
- * AP has a different equivalence ID than BSP, looks like
- * mixed-steppings silicon so go through the ucode blob anew.
- */
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
- }
-}
-#endif
-
-int __init save_microcode_in_initrd_amd(void)
-{
- unsigned long cont;
- int retval = 0;
- enum ucode_state ret;
- u8 *cont_va;
- u32 eax;
-
- if (!container)
- return -EINVAL;
-
-#ifdef CONFIG_X86_32
- get_bsp_sig();
- cont = (unsigned long)container;
- cont_va = __va(container);
-#else
- /*
- * We need the physical address of the container for both bitness since
- * boot_params.hdr.ramdisk_image is a physical address.
- */
- cont = __pa(container);
- cont_va = container;
-#endif
-
- /*
- * Take into account the fact that the ramdisk might get relocated and
- * therefore we need to recompute the container's position in virtual
- * memory space.
- */
- if (relocated_ramdisk)
- container = (u8 *)(__va(relocated_ramdisk) +
- (cont - boot_params.hdr.ramdisk_image));
- else
- container = cont_va;
-
- if (ucode_new_rev)
- pr_info("microcode: updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
- eax = cpuid_eax(0x00000001);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-
- ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
- if (ret != UCODE_OK)
- retval = -EINVAL;
-
- /*
- * This will be freed any msec now, stash patches for the current
- * family and switch to patch cache for cpu hotplug, etc later.
- */
- container = NULL;
- container_size = 0;
-
- return retval;
-}
-
-void reload_ucode_amd(void)
-{
- struct microcode_amd *mc;
- u32 rev, eax;
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- mc = (struct microcode_amd *)amd_ucode_patch;
-
- if (mc && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("microcode: reload patch_level=0x%08x\n",
- ucode_new_rev);
- }
- }
-}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9e3f3c7dd5d7..faec7120c508 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -5,6 +5,12 @@
* 2006 Shaohua Li <shaohua.li@intel.com>
* 2013-2015 Borislav Petkov <bp@alien8.de>
*
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
* This driver allows to upgrade microcode on x86 processors.
*
* This program is free software; you can redistribute it and/or
@@ -13,34 +19,39 @@
* 2 of the License, or (at your option) any later version.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define pr_fmt(fmt) "microcode: " fmt
#include <linux/platform_device.h>
+#include <linux/syscore_ops.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
+#include <linux/firmware.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/syscore_ops.h>
-#include <asm/microcode.h>
-#include <asm/processor.h>
+#include <asm/microcode_intel.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode_amd.h>
#include <asm/perf_event.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION "2.00"
+#define MICROCODE_VERSION "2.01"
static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr;
-module_param(dis_ucode_ldr, bool, 0);
+static bool dis_ucode_ldr;
+
+static int __init disable_loader(char *str)
+{
+ dis_ucode_ldr = true;
+ return 1;
+}
+__setup("dis_ucode_ldr", disable_loader);
/*
* Synchronization.
@@ -68,6 +79,150 @@ struct cpu_info_ctx {
int err;
};
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+ struct builtin_fw *b_fw;
+
+ for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+ if (!strcmp(name, b_fw->name)) {
+ cd->size = b_fw->size;
+ cd->data = b_fw->data;
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+void __init load_ucode_bsp(void)
+{
+ int vendor;
+ unsigned int family;
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ load_ucode_intel_bsp();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ load_ucode_amd_bsp(family);
+ break;
+ default:
+ break;
+ }
+}
+
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
+void load_ucode_ap(void)
+{
+ int vendor, family;
+
+ if (check_loader_disabled_ap())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ load_ucode_amd_ap();
+ break;
+ default:
+ break;
+ }
+}
+
+int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (c->x86 >= 6)
+ save_microcode_in_initrd_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0x10)
+ save_microcode_in_initrd_amd();
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void reload_early_microcode(void)
+{
+ int vendor, family;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd();
+ break;
+ default:
+ break;
+ }
+}
+
static void collect_cpu_info_local(void *arg)
{
struct cpu_info_ctx *ctx = arg;
@@ -210,9 +365,6 @@ static void __exit microcode_dev_exit(void)
{
misc_deregister(&microcode_dev);
}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-MODULE_ALIAS("devname:cpu/microcode");
#else
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while (0)
@@ -463,20 +615,6 @@ static struct notifier_block mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
-#ifdef MODULE
-/* Autoload on Intel and AMD systems */
-static const struct x86_cpu_id __initconst microcode_id[] = {
-#ifdef CONFIG_MICROCODE_INTEL
- { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
-#ifdef CONFIG_MICROCODE_AMD
- { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, microcode_id);
-#endif
-
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
@@ -487,9 +625,9 @@ static struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-static int __init microcode_init(void)
+int __init microcode_init(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
if (paravirt_enabled() || dis_ucode_ldr)
@@ -560,35 +698,4 @@ static int __init microcode_init(void)
return error;
}
-module_init(microcode_init);
-
-static void __exit microcode_exit(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- microcode_dev_exit();
-
- unregister_hotcpu_notifier(&mc_cpu_notifier);
- unregister_syscore_ops(&mc_syscore_ops);
-
- sysfs_remove_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- subsys_interface_unregister(&mc_cpu_interface);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- platform_device_unregister(microcode_pdev);
-
- microcode_ops = NULL;
-
- if (c->x86_vendor == X86_VENDOR_AMD)
- exit_amd_microcode();
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-}
-module_exit(microcode_exit);
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
deleted file mode 100644
index 8ebc421d6299..000000000000
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * X86 CPU microcode early update for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- * (C) 2015 Borislav Petkov <bp@alien8.de>
- *
- * This driver allows to early upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- * Software Developer's Manual.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <linux/firmware.h>
-#include <asm/microcode.h>
-#include <asm/microcode_intel.h>
-#include <asm/microcode_amd.h>
-#include <asm/processor.h>
-#include <asm/cmdline.h>
-
-static bool __init check_loader_disabled_bsp(void)
-{
-#ifdef CONFIG_X86_32
- const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *opt = "dis_ucode_ldr";
- const char *option = (const char *)__pa_nodebug(opt);
- bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
-
-#else /* CONFIG_X86_64 */
- const char *cmdline = boot_command_line;
- const char *option = "dis_ucode_ldr";
- bool *res = &dis_ucode_ldr;
-#endif
-
- if (cmdline_find_option_bool(cmdline, option))
- *res = true;
-
- return *res;
-}
-
-extern struct builtin_fw __start_builtin_fw[];
-extern struct builtin_fw __end_builtin_fw[];
-
-bool get_builtin_firmware(struct cpio_data *cd, const char *name)
-{
-#ifdef CONFIG_FW_LOADER
- struct builtin_fw *b_fw;
-
- for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
- if (!strcmp(name, b_fw->name)) {
- cd->size = b_fw->size;
- cd->data = b_fw->data;
- return true;
- }
- }
-#endif
- return false;
-}
-
-void __init load_ucode_bsp(void)
-{
- int vendor;
- unsigned int family;
-
- if (check_loader_disabled_bsp())
- return;
-
- if (!have_cpuid_p())
- return;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- load_ucode_intel_bsp();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_bsp(family);
- break;
- default:
- break;
- }
-}
-
-static bool check_loader_disabled_ap(void)
-{
-#ifdef CONFIG_X86_32
- return *((bool *)__pa_nodebug(&dis_ucode_ldr));
-#else
- return dis_ucode_ldr;
-#endif
-}
-
-void load_ucode_ap(void)
-{
- int vendor, family;
-
- if (check_loader_disabled_ap())
- return;
-
- if (!have_cpuid_p())
- return;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- load_ucode_intel_ap();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_ap();
- break;
- default:
- break;
- }
-}
-
-int __init save_microcode_in_initrd(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- if (c->x86 >= 6)
- save_microcode_in_initrd_intel();
- break;
- case X86_VENDOR_AMD:
- if (c->x86 >= 0x10)
- save_microcode_in_initrd_amd();
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-void reload_early_microcode(void)
-{
- int vendor, family;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- reload_ucode_intel();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- reload_ucode_amd();
- break;
- default:
- break;
- }
-}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 969dc17eb1b4..ee81c544ee0d 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -4,27 +4,800 @@
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
#include <asm/msr.h>
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
+static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+static struct mc_saved_data {
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *ucode_ptr, *new_mc = NULL;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
+
+ new_rev = uci->cpu_sig.rev;
+
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ ret = has_newer_microcode(ucode_ptr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
+ }
+
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ uci->mc = (struct microcode_intel *)new_mc;
+ return UCODE_OK;
+}
+
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
+{
+ int i;
+
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
+}
+
+#ifdef CONFIG_X86_32
+static void
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+ struct mc_saved_data *mc_saved_data)
+{
+ int i;
+ struct microcode_intel ***mc_saved;
+
+ mc_saved = (struct microcode_intel ***)
+ __pa_nodebug(&mc_saved_data->mc_saved);
+ for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+ struct microcode_intel *p;
+
+ p = *(struct microcode_intel **)
+ __pa_nodebug(mc_saved_data->mc_saved + i);
+ mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ }
+}
+#endif
+
+static enum ucode_state
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int count = mc_saved_data->mc_saved_count;
+
+ if (!mc_saved_data->mc_saved) {
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
+
+ return load_microcode_early(mc_saved_tmp, count, uci);
+ } else {
+#ifdef CONFIG_X86_32
+ microcode_phys(mc_saved_tmp, mc_saved_data);
+ return load_microcode_early(mc_saved_tmp, count, uci);
+#else
+ return load_microcode_early(mc_saved_data->mc_saved,
+ count, uci);
+#endif
+ }
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+ unsigned long sig)
+{
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ unsigned long data_size = get_datasize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ fam = x86_family(sig);
+ model = x86_model(sig);
+
+ fam_ucode = x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return UCODE_OK;
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ return UCODE_NFOUND;
+
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ fam_ucode = x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return UCODE_OK;
+
+ ext_sig++;
+ }
+ return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+ struct microcode_intel **mc_saved_src,
+ unsigned int mc_saved_count)
+{
+ int i, j;
+ struct microcode_intel **saved_ptr;
+ int ret;
+
+ if (!mc_saved_count)
+ return -EINVAL;
+
+ /*
+ * Copy new microcode data.
+ */
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
+ return -ENOMEM;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
+ if (!mc_saved_src[i]) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
+ }
+
+ /*
+ * Point to newly saved microcode.
+ */
+ mc_saved_data->mc_saved = saved_ptr;
+ mc_saved_data->mc_saved_count = mc_saved_count;
+
+ return 0;
+
+err:
+ for (j = 0; j <= i; j++)
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
+
+ return ret;
+}
+
+/*
+ * A microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has matching signature and newer revision compared to an existing
+ * patch mc_saved.
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
+ */
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (!find_matching_signature(ucode_ptr, sig, pf))
+ continue;
+
+ found = 1;
+
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
+
+ /*
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
+ */
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+
+ return num_saved;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+ void *data, size_t size,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ u8 *ucode_ptr = data;
+ unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+ int i;
+
+ while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+
+ if (leftover < sizeof(mc_header))
+ break;
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > leftover ||
+ microcode_sanity_check(ucode_ptr, 0) < 0)
+ break;
+
+ leftover -= mc_size;
+
+ /*
+ * Since APs with same family and model as the BSP may boot in
+ * the platform, we need to find and save microcode patches
+ * with the same family and model as the BSP.
+ */
+ if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+ UCODE_OK) {
+ ucode_ptr += mc_size;
+ continue;
+ }
+
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
+
+ ucode_ptr += mc_size;
+ }
+
+ if (leftover) {
+ state = UCODE_ERROR;
+ goto out;
+ }
+
+ if (mc_saved_count == 0) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+ mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+ return state;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig;
+ unsigned int eax, ebx, ecx, edx;
+
+ csig.sig = 0;
+ csig.pf = 0;
+ csig.rev = 0;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(csig.sig);
+ model = x86_model(csig.sig);
+
+ if ((model >= 5) || (family > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ csig.rev = val[1];
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+static void show_saved_mc(void)
+{
+#ifdef DEBUG
+ int i, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+
+ if (mc_saved_data.mc_saved_count == 0) {
+ pr_debug("no microcode data saved.\n");
+ return;
+ }
+ pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
+
+ for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ int ext_sigcount;
+ struct extended_signature *ext_sig;
+
+ mc_saved_header = (struct microcode_header_intel *)
+ mc_saved_data.mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+ date = mc_saved_header->date;
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
+ i, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+
+ }
+#endif
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+/*
+ * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * Please make sure this mc should be a valid microcode patch before calling
+ * this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count_init;
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+ int ret = 0;
+ int i;
+
+ /*
+ * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+ * hotplug.
+ */
+ mutex_lock(&x86_cpu_microcode_mutex);
+
+ mc_saved_count_init = mc_saved_data.mc_saved_count;
+ mc_saved_count = mc_saved_data.mc_saved_count;
+ mc_saved = mc_saved_data.mc_saved;
+
+ if (mc_saved && mc_saved_count)
+ memcpy(mc_saved_tmp, mc_saved,
+ mc_saved_count * sizeof(struct microcode_intel *));
+ /*
+ * Save the microcode patch mc in mc_save_tmp structure if it's a newer
+ * version.
+ */
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
+
+ /*
+ * Save the mc_save_tmp in global mc_saved_data.
+ */
+ ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+ if (ret) {
+ pr_err("Cannot save microcode patch.\n");
+ goto out;
+ }
+
+ show_saved_mc();
+
+ /*
+ * Free old saved microcode data.
+ */
+ if (mc_saved) {
+ for (i = 0; i < mc_saved_count_init; i++)
+ kfree(mc_saved[i]);
+ kfree(mc_saved);
+ }
+
+out:
+ mutex_unlock(&x86_cpu_microcode_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+{
+#ifdef CONFIG_X86_64
+ unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+ char name[30];
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ return get_builtin_firmware(cp, name);
+#else
+ return false;
+#endif
+}
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
+{
+ struct cpio_data cd;
+ long offset = 0;
+#ifdef CONFIG_X86_32
+ char *p = (char *)__pa_nodebug(ucode_name);
+#else
+ char *p = ucode_name;
+#endif
+
+ cd.data = NULL;
+ cd.size = 0;
+
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data) {
+ if (!load_builtin_intel_microcode(&cd))
+ return UCODE_ERROR;
+ }
+
+ return get_matching_model_microcode(0, start, cd.data, cd.size,
+ mc_saved_data, initrd, uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ int cpu = smp_processor_id();
+
+ pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ cpu,
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Keep microcode patch number in
+ * mc_saved_data.mc_saved and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush global tlb. We only do this in x86_64 where paging has been enabled
+ * already and PGE should be enabled as well.
+ */
+static inline void flush_tlb_early(void)
+{
+ __native_flush_tlb_global_irq_disabled();
+}
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
+{
+ struct microcode_intel *mc_intel;
+ unsigned int val[2];
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return 0;
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) mc_intel->bits,
+ (unsigned long) mc_intel->bits >> 16 >> 16);
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ if (val[1] != mc_intel->hdr.rev)
+ return -1;
+
+#ifdef CONFIG_X86_64
+ /* Flush global tlb. This is precaution. */
+ flush_tlb_early();
+#endif
+ uci->cpu_sig.rev = val[1];
+
+ if (early)
+ print_ucode(uci);
+ else
+ print_ucode_info(uci, mc_intel->hdr.date);
+
+ return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd_intel(void)
+{
+ unsigned int count = mc_saved_data.mc_saved_count;
+ struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+ int ret = 0;
+
+ if (count == 0)
+ return ret;
+
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ ret = save_microcode(&mc_saved_data, mc_saved, count);
+ if (ret)
+ pr_err("Cannot save microcode patches from initrd.\n");
+
+ show_saved_mc();
+
+ return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
+{
+ struct ucode_cpu_info uci;
+ enum ucode_state ret;
+
+ collect_cpu_info_early(&uci);
+
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
+}
+
+void __init load_ucode_intel_bsp(void)
+{
+ u64 start, size;
+#ifdef CONFIG_X86_32
+ struct boot_params *p;
+
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
+#else
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
+#endif
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct mc_saved_data *mc_saved_data_p;
+ struct ucode_cpu_info uci;
+ unsigned long *mc_saved_in_initrd_p;
+ unsigned long initrd_start_addr;
+ enum ucode_state ret;
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p;
+
+ mc_saved_in_initrd_p =
+ (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+ mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+ initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+ initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+ mc_saved_data_p = &mc_saved_data;
+ mc_saved_in_initrd_p = mc_saved_in_initrd;
+ initrd_start_addr = initrd_start;
+#endif
+
+ /*
+ * If there is no valid ucode previously saved in memory, no need to
+ * update ucode on this AP.
+ */
+ if (mc_saved_data_p->mc_saved_count == 0)
+ return;
+
+ collect_cpu_info_early(&uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
+}
+
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci;
+ enum ucode_state ret;
+
+ if (!mc_saved_data.mc_saved_count)
+ return;
+
+ collect_cpu_info_early(&uci);
+
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, false);
+}
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
@@ -264,7 +1037,7 @@ static struct microcode_ops microcode_intel_ops = {
struct microcode_ops * __init init_intel_microcode(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
deleted file mode 100644
index 37ea89c11520..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ /dev/null
@@ -1,808 +0,0 @@
-/*
- * Intel CPU microcode early update for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This allows to early upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- * Software Developer's Manual.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-/*
- * This needs to be before all headers so that pr_debug in printk.h doesn't turn
- * printk calls into no_printk().
- *
- *#define DEBUG
- */
-
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-#include <linux/cpu.h>
-#include <asm/msr.h>
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/setup.h>
-
-#undef pr_fmt
-#define pr_fmt(fmt) "microcode: " fmt
-
-static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-static struct mc_saved_data {
- unsigned int mc_saved_count;
- struct microcode_intel **mc_saved;
-} mc_saved_data;
-
-static enum ucode_state
-load_microcode_early(struct microcode_intel **saved,
- unsigned int num_saved, struct ucode_cpu_info *uci)
-{
- struct microcode_intel *ucode_ptr, *new_mc = NULL;
- struct microcode_header_intel *mc_hdr;
- int new_rev, ret, i;
-
- new_rev = uci->cpu_sig.rev;
-
- for (i = 0; i < num_saved; i++) {
- ucode_ptr = saved[i];
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- ret = has_newer_microcode(ucode_ptr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- new_rev);
- if (!ret)
- continue;
-
- new_rev = mc_hdr->rev;
- new_mc = ucode_ptr;
- }
-
- if (!new_mc)
- return UCODE_NFOUND;
-
- uci->mc = (struct microcode_intel *)new_mc;
- return UCODE_OK;
-}
-
-static inline void
-copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
- unsigned long off, int num_saved)
-{
- int i;
-
- for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp,
- struct mc_saved_data *mc_saved_data)
-{
- int i;
- struct microcode_intel ***mc_saved;
-
- mc_saved = (struct microcode_intel ***)
- __pa_nodebug(&mc_saved_data->mc_saved);
- for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
- struct microcode_intel *p;
-
- p = *(struct microcode_intel **)
- __pa_nodebug(mc_saved_data->mc_saved + i);
- mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
- }
-}
-#endif
-
-static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
- unsigned long initrd_start, struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data->mc_saved_count;
-
- if (!mc_saved_data->mc_saved) {
- copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
-
- return load_microcode_early(mc_saved_tmp, count, uci);
- } else {
-#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mc_saved_data);
- return load_microcode_early(mc_saved_tmp, count, uci);
-#else
- return load_microcode_early(mc_saved_data->mc_saved,
- count, uci);
-#endif
- }
-}
-
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- */
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned int fam, model;
- unsigned int fam_ucode, model_ucode;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
-
- fam = __x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = __x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return UCODE_NFOUND;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = __x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
-
- ext_sig++;
- }
- return UCODE_NFOUND;
-}
-
-static int
-save_microcode(struct mc_saved_data *mc_saved_data,
- struct microcode_intel **mc_saved_src,
- unsigned int mc_saved_count)
-{
- int i, j;
- struct microcode_intel **saved_ptr;
- int ret;
-
- if (!mc_saved_count)
- return -EINVAL;
-
- /*
- * Copy new microcode data.
- */
- saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
- if (!saved_ptr)
- return -ENOMEM;
-
- for (i = 0; i < mc_saved_count; i++) {
- struct microcode_header_intel *mc_hdr;
- struct microcode_intel *mc;
- unsigned long size;
-
- if (!mc_saved_src[i]) {
- ret = -EINVAL;
- goto err;
- }
-
- mc = mc_saved_src[i];
- mc_hdr = &mc->hdr;
- size = get_totalsize(mc_hdr);
-
- saved_ptr[i] = kmalloc(size, GFP_KERNEL);
- if (!saved_ptr[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
- memcpy(saved_ptr[i], mc, size);
- }
-
- /*
- * Point to newly saved microcode.
- */
- mc_saved_data->mc_saved = saved_ptr;
- mc_saved_data->mc_saved_count = mc_saved_count;
-
- return 0;
-
-err:
- for (j = 0; j <= i; j++)
- kfree(saved_ptr[j]);
- kfree(saved_ptr);
-
- return ret;
-}
-
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- * patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
- u8 *ucode_ptr, unsigned int num_saved)
-{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- unsigned int sig, pf;
- int found = 0, i;
-
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- for (i = 0; i < num_saved; i++) {
- mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
-
- if (!find_matching_signature(ucode_ptr, sig, pf))
- continue;
-
- found = 1;
-
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
-
- /*
- * Found an older ucode saved earlier. Replace it with
- * this newer one.
- */
- mc_saved[i] = (struct microcode_intel *)ucode_ptr;
- break;
- }
-
- /* Newly detected microcode, save it to memory. */
- if (i >= num_saved && !found)
- mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
-
- return num_saved;
-}
-
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static enum ucode_state __init
-get_matching_model_microcode(int cpu, unsigned long start,
- void *data, size_t size,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
-{
- u8 *ucode_ptr = data;
- unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
- int i;
-
- while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
-
- if (leftover < sizeof(mc_header))
- break;
-
- mc_header = (struct microcode_header_intel *)ucode_ptr;
-
- mc_size = get_totalsize(mc_header);
- if (!mc_size || mc_size > leftover ||
- microcode_sanity_check(ucode_ptr, 0) < 0)
- break;
-
- leftover -= mc_size;
-
- /*
- * Since APs with same family and model as the BSP may boot in
- * the platform, we need to find and save microcode patches
- * with the same family and model as the BSP.
- */
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
- UCODE_OK) {
- ucode_ptr += mc_size;
- continue;
- }
-
- mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
-
- ucode_ptr += mc_size;
- }
-
- if (leftover) {
- state = UCODE_ERROR;
- goto out;
- }
-
- if (mc_saved_count == 0) {
- state = UCODE_NFOUND;
- goto out;
- }
-
- for (i = 0; i < mc_saved_count; i++)
- mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
-
- mc_saved_data->mc_saved_count = mc_saved_count;
-out:
- return state;
-}
-
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig;
- unsigned int eax, ebx, ecx, edx;
-
- csig.sig = 0;
- csig.pf = 0;
- csig.rev = 0;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = __x86_family(csig.sig);
- model = x86_model(csig.sig);
-
- if ((model >= 5) || (family > 6)) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- csig.rev = val[1];
-
- uci->cpu_sig = csig;
- uci->valid = 1;
-
- return 0;
-}
-
-#ifdef DEBUG
-static void show_saved_mc(void)
-{
- int i, j;
- unsigned int sig, pf, rev, total_size, data_size, date;
- struct ucode_cpu_info uci;
-
- if (mc_saved_data.mc_saved_count == 0) {
- pr_debug("no microcode data saved.\n");
- return;
- }
- pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
-
- collect_cpu_info_early(&uci);
-
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
- pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
-
- for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
- struct microcode_header_intel *mc_saved_header;
- struct extended_sigtable *ext_header;
- int ext_sigcount;
- struct extended_signature *ext_sig;
-
- mc_saved_header = (struct microcode_header_intel *)
- mc_saved_data.mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
- date = mc_saved_header->date;
-
- pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
- i, sig, pf, rev, total_size,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- continue;
-
- ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
- for (j = 0; j < ext_sigcount; j++) {
- sig = ext_sig->sig;
- pf = ext_sig->pf;
-
- pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
- j, sig, pf);
-
- ext_sig++;
- }
-
- }
-}
-#else
-static inline void show_saved_mc(void)
-{
-}
-#endif
-
-#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
-static DEFINE_MUTEX(x86_cpu_microcode_mutex);
-/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
- */
-int save_mc_for_early(u8 *mc)
-{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count_init;
- unsigned int mc_saved_count;
- struct microcode_intel **mc_saved;
- int ret = 0;
- int i;
-
- /*
- * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
- * hotplug.
- */
- mutex_lock(&x86_cpu_microcode_mutex);
-
- mc_saved_count_init = mc_saved_data.mc_saved_count;
- mc_saved_count = mc_saved_data.mc_saved_count;
- mc_saved = mc_saved_data.mc_saved;
-
- if (mc_saved && mc_saved_count)
- memcpy(mc_saved_tmp, mc_saved,
- mc_saved_count * sizeof(struct microcode_intel *));
- /*
- * Save the microcode patch mc in mc_save_tmp structure if it's a newer
- * version.
- */
- mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
-
- /*
- * Save the mc_save_tmp in global mc_saved_data.
- */
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
- if (ret) {
- pr_err("Cannot save microcode patch.\n");
- goto out;
- }
-
- show_saved_mc();
-
- /*
- * Free old saved microcode data.
- */
- if (mc_saved) {
- for (i = 0; i < mc_saved_count_init; i++)
- kfree(mc_saved[i]);
- kfree(mc_saved);
- }
-
-out:
- mutex_unlock(&x86_cpu_microcode_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(save_mc_for_early);
-#endif
-
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
-{
-#ifdef CONFIG_X86_64
- unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
- unsigned int family, model, stepping;
- char name[30];
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- family = __x86_family(eax);
- model = x86_model(eax);
- stepping = eax & 0xf;
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
-
- return get_builtin_firmware(cp, name);
-#else
- return false;
-#endif
-}
-
-static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
- unsigned long start, unsigned long size,
- struct ucode_cpu_info *uci)
-{
- struct cpio_data cd;
- long offset = 0;
-#ifdef CONFIG_X86_32
- char *p = (char *)__pa_nodebug(ucode_name);
-#else
- char *p = ucode_name;
-#endif
-
- cd.data = NULL;
- cd.size = 0;
-
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data) {
- if (!load_builtin_intel_microcode(&cd))
- return UCODE_ERROR;
- }
-
- return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, initrd, uci);
-}
-
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
-{
- int cpu = smp_processor_id();
-
- pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu,
- uci->cpu_sig.rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
- struct ucode_cpu_info uci;
-
- if (delay_ucode_info) {
- collect_cpu_info_early(&uci);
- print_ucode_info(&uci, current_mc_date);
- delay_ucode_info = 0;
- }
-}
-
-/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_intel;
- int *delay_ucode_info_p;
- int *current_mc_date_p;
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return;
-
- delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
- current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
-
- *delay_ucode_info_p = 1;
- *current_mc_date_p = mc_intel->hdr.date;
-}
-#else
-
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
- __native_flush_tlb_global_irq_disabled();
-}
-
-static inline void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_intel;
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return;
-
- print_ucode_info(uci, mc_intel->hdr.date);
-}
-#endif
-
-static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
-{
- struct microcode_intel *mc_intel;
- unsigned int val[2];
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return 0;
-
- /* write microcode via MSR 0x79 */
- native_wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc_intel->hdr.rev)
- return -1;
-
-#ifdef CONFIG_X86_64
- /* Flush global tlb. This is precaution. */
- flush_tlb_early();
-#endif
- uci->cpu_sig.rev = val[1];
-
- if (early)
- print_ucode(uci);
- else
- print_ucode_info(uci, mc_intel->hdr.date);
-
- return 0;
-}
-
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
- */
-int __init save_microcode_in_initrd_intel(void)
-{
- unsigned int count = mc_saved_data.mc_saved_count;
- struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- int ret = 0;
-
- if (count == 0)
- return ret;
-
- copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
- ret = save_microcode(&mc_saved_data, mc_saved, count);
- if (ret)
- pr_err("Cannot save microcode patches from initrd.\n");
-
- show_saved_mc();
-
- return ret;
-}
-
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *initrd,
- unsigned long start, unsigned long size)
-{
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- collect_cpu_info_early(&uci);
-
- ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
- if (ret != UCODE_OK)
- return;
-
- ret = load_microcode(mc_saved_data, initrd, start, &uci);
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, true);
-}
-
-void __init load_ucode_intel_bsp(void)
-{
- u64 start, size;
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- start = p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-
- _load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- start, size);
-#else
- start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
- size = boot_params.hdr.ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
-#endif
-}
-
-void load_ucode_intel_ap(void)
-{
- struct mc_saved_data *mc_saved_data_p;
- struct ucode_cpu_info uci;
- unsigned long *mc_saved_in_initrd_p;
- unsigned long initrd_start_addr;
- enum ucode_state ret;
-#ifdef CONFIG_X86_32
- unsigned long *initrd_start_p;
-
- mc_saved_in_initrd_p =
- (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
- mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
- initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
-#else
- mc_saved_data_p = &mc_saved_data;
- mc_saved_in_initrd_p = mc_saved_in_initrd;
- initrd_start_addr = initrd_start;
-#endif
-
- /*
- * If there is no valid ucode previously saved in memory, no need to
- * update ucode on this AP.
- */
- if (mc_saved_data_p->mc_saved_count == 0)
- return;
-
- collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
-
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, true);
-}
-
-void reload_ucode_intel(void)
-{
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- if (!mc_saved_data.mc_saved_count)
- return;
-
- collect_cpu_info_early(&uci);
-
- ret = load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, false);
-}
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index 1883d252ff7d..b96896bcbdaf 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -25,7 +25,6 @@
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 70d7c93f4550..0d98503c2245 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -593,9 +593,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct range range_new[RANGE_NUM];
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
unsigned long range_sums_new;
- static int nr_range_new;
+ int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3b533cf37c74..c870af161008 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (cpu_has_mtrr)
+ if (boot_cpu_has(X86_FEATURE_MTRR))
get_fixed_ranges(mtrr_state.fixed_ranges);
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index f891b4750f04..5c3d149ee91c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -682,7 +682,7 @@ void __init mtrr_bp_init(void)
phys_addr = 32;
- if (cpu_has_mtrr) {
+ if (boot_cpu_has(X86_FEATURE_MTRR)) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 66dd3fe99b82..1b443db2db50 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
@@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event)
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
+
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
}
if (event->attr.precise_ip > precise)
@@ -1175,7 +1178,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN)
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1326,7 +1329,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* XXX assumes any ->del() called during a TXN will only be on
* an event added during that same TXN.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN)
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
return;
/*
@@ -1531,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs)
{
struct device_attribute *d;
struct perf_pmu_events_attr *pmu_attr;
+ int offset = 0;
int i, j;
for (i = 0; attrs[i]; i++) {
@@ -1539,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs)
/* str trumps id */
if (pmu_attr->event_str)
continue;
- if (x86_pmu.event_map(i))
+ if (x86_pmu.event_map(i + offset))
continue;
for (j = i; attrs[j]; j++)
@@ -1547,6 +1551,14 @@ static void __init filter_events(struct attribute **attrs)
/* Check the shifted attr. */
i--;
+
+ /*
+ * event_map() is index based, the attrs array is organized
+ * by increasing event index. If we shift the events, then
+ * we need to compensate for the event_map(), otherwise
+ * we are looking up the wrong event in the map
+ */
+ offset++;
}
}
@@ -1748,11 +1760,22 @@ static inline void x86_pmu_read(struct perf_event *event)
* Start group events scheduling transaction
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
*/
-static void x86_pmu_start_txn(struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
+
+ cpuc->txn_flags = txn_flags;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_disable(pmu);
- __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
__this_cpu_write(cpu_hw_events.n_txn, 0);
}
@@ -1763,7 +1786,16 @@ static void x86_pmu_start_txn(struct pmu *pmu)
*/
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
- __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
+ unsigned int txn_flags;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ txn_flags = cpuc->txn_flags;
+ cpuc->txn_flags = 0;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
/*
* Truncate collected array by the number of events added in this
* transaction. See x86_pmu_add() and x86_pmu_*_txn().
@@ -1786,6 +1818,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
int assign[X86_PMC_IDX_MAX];
int n, ret;
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+ cpuc->txn_flags = 0;
+ return 0;
+ }
+
n = cpuc->n_events;
if (!x86_pmu_initialized())
@@ -1801,7 +1840,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
- cpuc->group_flag &= ~PERF_EVENT_TXN;
+ cpuc->txn_flags = 0;
perf_pmu_enable(pmu);
return 0;
}
@@ -2223,12 +2262,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
ss_base = get_segment_base(regs->ss);
fp = compat_ptr(ss_base + regs->bp);
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 8))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
if (bytes != 0)
break;
@@ -2238,6 +2284,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
perf_callchain_store(entry, cs_base + frame.return_address);
fp = compat_ptr(ss_base + frame.next_frame);
}
+ pagefault_enable();
return 1;
}
#else
@@ -2275,12 +2322,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (perf_callchain_user32(regs, entry))
return;
+ pagefault_disable();
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+ if (!access_ok(VERIFY_READ, fp, 16))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
if (bytes != 0)
break;
@@ -2288,8 +2342,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
break;
perf_callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
+ fp = (void __user *)frame.next_frame;
}
+ pagefault_enable();
}
/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 165be83a7fa4..7bb61e32fb29 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
@@ -14,17 +14,7 @@
#include <linux/perf_event.h>
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- unsigned int _msr = (msr); \
- u64 _val = (val); \
- trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
- (unsigned long long)(_val)); \
- native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
-} while (0)
-#endif
+/* To enable MSR tracing please use the generic trace points. */
/*
* | NHM/WSM | SNB |
@@ -196,7 +186,7 @@ struct cpu_hw_events {
int n_excl; /* the number of exclusive events */
- unsigned int group_flag;
+ unsigned int txn_flags;
int is_fake;
/*
@@ -318,6 +308,10 @@ struct cpu_hw_events {
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
/* Like UEVENT_CONSTRAINT, but match flags too */
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
@@ -387,7 +381,7 @@ struct cpu_hw_events {
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
- INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
@@ -589,7 +583,8 @@ struct x86_pmu {
bts_active :1,
pebs :1,
pebs_active :1,
- pebs_broken :1;
+ pebs_broken :1,
+ pebs_prec_dist :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
@@ -627,6 +622,7 @@ struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
u64 lbr_info[MAX_LBR_ENTRIES];
+ int tos;
int lbr_callstack_users;
int lbr_stack_state;
};
@@ -906,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void);
void intel_pmu_lbr_init_skl(void);
+void intel_pmu_lbr_init_knl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 1cee5d2d7ece..58610539b048 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_ACCESS) ] = 0,
[ C(RESULT_MISS) ] = 0,
},
[ C(OP_PREFETCH) ] = {
@@ -160,7 +160,7 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel)
if (offset)
return offset;
- if (!cpu_has_perfctr_core)
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
offset = index;
else
offset = index << 1;
@@ -652,7 +652,7 @@ static __initconst const struct x86_pmu amd_pmu = {
static int __init amd_core_pmu_init(void)
{
- if (!cpu_has_perfctr_core)
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
return 0;
switch (boot_cpu_data.x86) {
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index cc6cedb8f25d..49742746a6c9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -523,10 +523,10 @@ static int __init amd_uncore_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
goto fail_nodev;
- if (!cpu_has_topoext)
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
goto fail_nodev;
- if (cpu_has_perfctr_nb) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_nb) {
ret = -ENOMEM;
@@ -540,7 +540,7 @@ static int __init amd_uncore_init(void)
ret = 0;
}
- if (cpu_has_perfctr_l2) {
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_l2) {
ret = -ENOMEM;
@@ -583,10 +583,11 @@ fail_online:
/* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
amd_uncore_nb = amd_uncore_l2 = NULL;
- if (cpu_has_perfctr_l2)
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_L2))
perf_pmu_unregister(&amd_l2_pmu);
fail_l2:
- if (cpu_has_perfctr_nb)
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
perf_pmu_unregister(&amd_nb_pmu);
if (amd_uncore_l2)
free_percpu(amd_uncore_l2);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f63360be2238..fed2ab1f1065 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7,
+ MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7,
+ MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -232,7 +240,7 @@ static struct event_constraint intel_hsw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
@@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
/*
* Use from PMIs where the LBRs are already disabled.
*/
@@ -1916,7 +1960,8 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx, u64 config)
{
- int alt_idx;
+ int alt_idx = idx;
+
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
@@ -2475,6 +2520,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+ * shadowing effects. One drawback is that it can be
+ * only programmed on counter 1, but that seems like an
+ * acceptable trade off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
{
unsigned long flags = x86_pmu.free_running_flags;
@@ -2815,14 +2898,12 @@ static void intel_pmu_cpu_starting(int cpu)
return;
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
- void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
-
for_each_cpu(i, topology_sibling_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- *onln = cpuc->shared_regs;
+ cpuc->kfree_on_online[0] = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
@@ -3332,6 +3413,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_gen_event_constraints;
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
break;
@@ -3431,7 +3513,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_ivb_event_constraints;
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
if (boot_cpu_data.x86_model == 62)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
@@ -3464,7 +3547,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3499,7 +3583,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_bdw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3511,6 +3596,24 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 87: /* Knights Landing Xeon Phi */
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ pr_cont("Knights Landing events, ");
+ break;
+
case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
x86_pmu.late_ack = true;
@@ -3521,7 +3624,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index d1c0f254afbe..2cad71d1b14c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -495,6 +495,19 @@ static int bts_event_init(struct perf_event *event)
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
+ /*
+ * BTS leaks kernel addresses even when CPL0 tracing is
+ * disabled, so disallow intel_bts driver for unprivileged
+ * users on paranoid systems since it provides trace data
+ * to the user in a zero-copy fashion.
+ *
+ * Note that the default paranoia setting permits unprivileged
+ * users to profile the kernel.
+ */
+ if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
ret = x86_reserve_hardware();
if (ret) {
x86_del_exclusive(x86_lbr_exclusive_bts);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8ed391..a316ca96f1b6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
if (event->attach_state & PERF_ATTACH_TASK)
- return perf_cgroup_from_task(event->hw.target);
+ return perf_cgroup_from_task(event->hw.target, event->ctx);
return event->cgrp;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
new file mode 100644
index 000000000000..75a38b5a2e26
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cstate.c
@@ -0,0 +1,694 @@
+/*
+ * perf_event_intel_cstate.c: support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file export cstate related free running (read-only) counters
+ * for perf. These counters may be use simultaneously by other tools,
+ * such as turbostat. However, it still make sense to implement them
+ * in perf. Because we can conveniently collect them together with
+ * other events, and allow to use them from tools without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ * - 'cstate_core': The counter is available for each physical core.
+ * The counters include CORE_C*_RESIDENCY.
+ * - 'cstate_pkg': The counter is available for each physical package.
+ * The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer.s Manual Vol3b.
+ *
+ * Model specific counters:
+ * MSR_CORE_C1_RES: CORE C1 Residency Counter
+ * perf code: 0x00
+ * Available model: SLM,AMT
+ * Scope: Core (each processor core has a MSR)
+ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ * perf code: 0x03
+ * Available model: SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
+ * perf code: 0x00
+ * Available model: SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
+ * perf code: 0x03
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
+ * perf code: 0x04
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
+ * perf code: 0x05
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ * perf code: 0x06
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+struct perf_cstate_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
+};
+
+
+/* cstate_core PMU */
+
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_id {
+ /*
+ * cstate_core events
+ */
+ PERF_CSTATE_CORE_C1_RES = 0,
+ PERF_CSTATE_CORE_C3_RES,
+ PERF_CSTATE_CORE_C6_RES,
+ PERF_CSTATE_CORE_C7_RES,
+
+ PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+bool test_core(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES ||
+ idx == PERF_CSTATE_CORE_C7_RES)
+ return true;
+ break;
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_CSTATE_CORE_C1_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+ [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, },
+ [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, },
+ [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, },
+ [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+ .name = "events",
+ .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+ &format_attr_core_event.attr,
+ NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+ .name = "format",
+ .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+ .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+ &core_events_attr_group,
+ &core_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+/* cstate_core PMU end */
+
+
+/* cstate_pkg PMU */
+
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_id {
+ /*
+ * cstate_pkg events
+ */
+ PERF_CSTATE_PKG_C2_RES = 0,
+ PERF_CSTATE_PKG_C3_RES,
+ PERF_CSTATE_PKG_C6_RES,
+ PERF_CSTATE_PKG_C7_RES,
+ PERF_CSTATE_PKG_C8_RES,
+ PERF_CSTATE_PKG_C9_RES,
+ PERF_CSTATE_PKG_C10_RES,
+
+ PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+bool test_pkg(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES ||
+ idx == PERF_CSTATE_CORE_C7_RES)
+ return true;
+ break;
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_CSTATE_PKG_C2_RES ||
+ idx == PERF_CSTATE_PKG_C3_RES ||
+ idx == PERF_CSTATE_PKG_C6_RES ||
+ idx == PERF_CSTATE_PKG_C7_RES)
+ return true;
+ break;
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ case 69: /* 22nm Haswell ULT */
+ if (idx == PERF_CSTATE_PKG_C2_RES ||
+ idx == PERF_CSTATE_PKG_C3_RES ||
+ idx == PERF_CSTATE_PKG_C6_RES ||
+ idx == PERF_CSTATE_PKG_C7_RES ||
+ idx == PERF_CSTATE_PKG_C8_RES ||
+ idx == PERF_CSTATE_PKG_C9_RES ||
+ idx == PERF_CSTATE_PKG_C10_RES)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+ [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, },
+ [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, },
+ [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, },
+ [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, },
+ [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, },
+ [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, },
+ [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+ .name = "events",
+ .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+ &format_attr_pkg_event.attr,
+ NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+ .name = "format",
+ .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+ &pkg_events_attr_group,
+ &pkg_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+/* cstate_pkg PMU end*/
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (pmu == &cstate_core_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+ else if (pmu == &cstate_pkg_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+ else
+ return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+ int ret = 0;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (event->pmu == &cstate_core_pmu) {
+ if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+ return -EINVAL;
+ if (!core_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = core_msr[cfg].msr;
+ } else if (event->pmu == &cstate_pkg_pmu) {
+ if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+ return -EINVAL;
+ if (!pkg_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = pkg_msr[cfg].msr;
+ } else
+ return -ENOENT;
+
+ /* must be done before validate_group */
+ event->hw.config = cfg;
+ event->hw.idx = -1;
+
+ return ret;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+ u64 val;
+
+ rdmsrl(event->hw.event_base, val);
+ return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ new_raw_count = cstate_pmu_read_counter(event);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+ local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+ if (mode & PERF_EF_START)
+ cstate_pmu_event_start(event, mode);
+
+ return 0;
+}
+
+static void cstate_cpu_exit(int cpu)
+{
+ int i, id, target;
+
+ /* cpu exit for cstate core */
+ if (has_cstate_core) {
+ id = topology_core_id(cpu);
+ target = -1;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (id == topology_core_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &cstate_core_cpu_mask);
+ WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
+ if (target >= 0)
+ perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+ }
+
+ /* cpu exit for cstate pkg */
+ if (has_cstate_pkg) {
+ id = topology_physical_package_id(cpu);
+ target = -1;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+ WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
+ if (target >= 0)
+ perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+ }
+}
+
+static void cstate_cpu_init(int cpu)
+{
+ int i, id;
+
+ /* cpu init for cstate core */
+ if (has_cstate_core) {
+ id = topology_core_id(cpu);
+ for_each_cpu(i, &cstate_core_cpu_mask) {
+ if (id == topology_core_id(i))
+ break;
+ }
+ if (i >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+ }
+
+ /* cpu init for cstate pkg */
+ if (has_cstate_pkg) {
+ id = topology_physical_package_id(cpu);
+ for_each_cpu(i, &cstate_pkg_cpu_mask) {
+ if (id == topology_physical_package_id(i))
+ break;
+ }
+ if (i >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+ }
+}
+
+static int cstate_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ break;
+ case CPU_STARTING:
+ cstate_cpu_init(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ break;
+ case CPU_DOWN_PREPARE:
+ cstate_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there is no available events.
+ */
+static bool cstate_probe_msr(struct perf_cstate_msr *msr,
+ struct attribute **events_attrs,
+ int max_event_nr)
+{
+ int i, j = 0;
+ u64 val;
+
+ /* Probe the cstate events. */
+ for (i = 0; i < max_event_nr; i++) {
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
+ }
+
+ /* List remaining events in the sysfs attrs. */
+ for (i = 0; i < max_event_nr; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
+ }
+ events_attrs[j] = NULL;
+
+ return (j > 0) ? true : false;
+}
+
+static int __init cstate_init(void)
+{
+ /* SLM has different MSR for PKG C6 */
+ switch (boot_cpu_data.x86_model) {
+ case 55:
+ case 76:
+ case 77:
+ pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+ }
+
+ if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
+ has_cstate_core = true;
+
+ if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
+ has_cstate_pkg = true;
+
+ return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static void __init cstate_cpumask_init(void)
+{
+ int cpu;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu)
+ cstate_cpu_init(cpu);
+
+ __perf_cpu_notifier(cstate_cpu_notifier);
+
+ cpu_notifier_register_done();
+}
+
+static struct pmu cstate_core_pmu = {
+ .attr_groups = core_attr_groups,
+ .name = "cstate_core",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add, /* must have */
+ .del = cstate_pmu_event_del, /* must have */
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu cstate_pkg_pmu = {
+ .attr_groups = pkg_attr_groups,
+ .name = "cstate_pkg",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add, /* must have */
+ .del = cstate_pmu_event_del, /* must have */
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static void __init cstate_pmus_register(void)
+{
+ int err;
+
+ if (has_cstate_core) {
+ err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+ if (WARN_ON(err))
+ pr_info("Failed to register PMU %s error %d\n",
+ cstate_core_pmu.name, err);
+ }
+
+ if (has_cstate_pkg) {
+ err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+ if (WARN_ON(err))
+ pr_info("Failed to register PMU %s error %d\n",
+ cstate_pkg_pmu.name, err);
+ }
+}
+
+static int __init cstate_pmu_init(void)
+{
+ int err;
+
+ if (cpu_has_hypervisor)
+ return -ENODEV;
+
+ err = cstate_init();
+ if (err)
+ return err;
+
+ cstate_cpumask_init();
+
+ cstate_pmus_register();
+
+ return 0;
+}
+
+device_initcall(cstate_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 84f236ab96b0..10602f0a438f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -510,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void)
u64 flags;
};
struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
+ struct bts_record *at, *base, *top;
struct perf_output_handle handle;
struct perf_event_header header;
struct perf_sample_data data;
+ unsigned long skip = 0;
struct pt_regs regs;
if (!event)
@@ -522,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void)
if (!x86_pmu.bts_active)
return 0;
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
+ base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
- if (top <= at)
+ if (top <= base)
return 0;
memset(&regs, 0, sizeof(regs));
@@ -535,16 +536,43 @@ int intel_pmu_drain_bts_buffer(void)
perf_sample_data_init(&data, 0, event->hw.last_period);
/*
+ * BTS leaks kernel addresses in branches across the cpl boundary,
+ * such as traps or system calls, so unless the user is asking for
+ * kernel tracing (and right now it's not possible), we'd need to
+ * filter them out. But first we need to count how many of those we
+ * have in the current batch. This is an extra O(n) pass, however,
+ * it's much faster than the other one especially considering that
+ * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+ * alloc_bts_buffer()).
+ */
+ for (at = base; at < top; at++) {
+ /*
+ * Note that right now *this* BTS code only works if
+ * attr::exclude_kernel is set, but let's keep this extra
+ * check here in case that changes.
+ */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ skip++;
+ }
+
+ /*
* Prepare a generic sample, i.e. fill in the invariant fields.
* We will overwrite the from and to address before we output
* the sample.
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at)))
+ if (perf_output_begin(&handle, event, header.size *
+ (top - base - skip)))
return 1;
- for (; at < top; at++) {
+ for (at = base; at < top; at++) {
+ /* Filter out any records that contain kernel addresses. */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ continue;
+
data.ip = at->from;
data.addr = at->to;
@@ -592,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
};
@@ -658,6 +688,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -672,6 +704,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -690,9 +724,10 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
struct event_constraint intel_skl_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+ /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
@@ -1073,6 +1108,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
void *at;
u64 pebs_status;
+ /*
+ * fmt0 does not have a status bitfield (does not use
+ * perf_record_nhm format)
+ */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
if (base == NULL)
return NULL;
@@ -1158,7 +1200,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = (top - at) / x86_pmu.pebs_record_size;
+ n = top - at;
if (n <= 0)
return;
@@ -1202,12 +1244,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
pebs_status = p->status & cpuc->pebs_enabled;
pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+ /*
+ * On some CPUs the PEBS status can be zero when PEBS is
+ * racing with clearing of GLOBAL_STATUS.
+ *
+ * Normally we would drop that record, but in the
+ * case when there is only a single active PEBS event
+ * we can assume it's for that event.
+ */
+ if (!pebs_status && cpuc->pebs_enabled &&
+ !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+ pebs_status = cpuc->pebs_enabled;
+
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
- if (WARN(bit >= x86_pmu.max_pebs_events,
- "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
- (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
- *(unsigned long long *)cpuc->active_mask))
+ if (bit >= x86_pmu.max_pebs_events)
continue;
/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index b2c9475b7ff2..653f88d25987 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -42,6 +42,13 @@ static enum {
#define LBR_FAR_BIT 8 /* do not capture far branches */
#define LBR_CALL_STACK_BIT 9 /* enable call stack */
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */
+
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
#define LBR_JCC (1 << LBR_JCC_BIT)
@@ -52,6 +59,7 @@ static enum {
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -151,10 +159,10 @@ static void __intel_pmu_lbr_enable(bool pmi)
* No need to reprogram LBR_SELECT in a PMI, as it
* did not change.
*/
- if (cpuc->lbr_sel && !pmi) {
- lbr_select = cpuc->lbr_sel->config;
+ if (cpuc->lbr_sel)
+ lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+ if (!pmi && cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, lbr_select);
- }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
orig_debugctl = debugctl;
@@ -239,7 +247,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
}
mask = x86_pmu.lbr_nr - 1;
- tos = intel_pmu_lbr_tos();
+ tos = task_ctx->tos;
for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
@@ -247,6 +255,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ wrmsrl(x86_pmu.lbr_tos, tos);
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -270,6 +279,7 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID;
}
@@ -420,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
+ bool need_info = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
@@ -427,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int out = 0;
int num = x86_pmu.lbr_nr;
- if (cpuc->lbr_sel->config & LBR_CALL_STACK)
- num = tos;
+ if (cpuc->lbr_sel) {
+ need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ num = tos;
+ }
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
@@ -440,7 +454,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
- if (lbr_format == LBR_FORMAT_INFO) {
+ if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
@@ -555,6 +569,8 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
mask |= X86_BR_IND_JMP;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -586,6 +602,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if (v != LBR_IGN)
mask |= v;
}
+
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
@@ -596,6 +613,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
*/
reg->config = mask ^ x86_pmu.lbr_sel_mask;
+ if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+ (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ reg->config |= LBR_NO_INFO;
+
return 0;
}
@@ -890,6 +912,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
};
static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -905,6 +928,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
| LBR_RETURN | LBR_CALL_STACK,
[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
};
/* core */
@@ -1022,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void)
*/
pr_cont("8-deep LBR, ");
}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ pr_cont("8-deep LBR, ");
+}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 42169283448b..c0bbd1033b7c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -27,6 +27,7 @@
#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>
+#include <asm/intel_pt.h>
#include "perf_event.h"
#include "intel_pt.h"
@@ -139,9 +140,6 @@ static int __init pt_pmu_hw_init(void)
long i;
attrs = NULL;
- ret = -ENODEV;
- if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
- goto fail;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
cpuid_count(20, i,
@@ -1125,11 +1123,23 @@ static int pt_event_init(struct perf_event *event)
return 0;
}
+void cpu_emergency_stop_pt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (pt->handle.event)
+ pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
static __init int pt_init(void)
{
int ret, cpu, prior_warn = 0;
BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ return -ENODEV;
+
get_online_cpus();
for_each_online_cpu(cpu) {
u64 ctl;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 81431c0f0614..24a351ad628d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -63,7 +63,7 @@
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
#define NR_RAPL_DOMAINS 0x4
-static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@@ -107,19 +107,13 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-#define RAPL_EVENT_DESC(_name, _config) \
-{ \
- .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
- .config = _config, \
-}
-
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
-#define RAPL_EVENT_ATTR_STR(_name, v, str) \
-static struct perf_pmu_events_attr event_attr_##v = { \
- .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \
- .id = 0, \
- .event_str = str, \
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
};
struct rapl_pmu {
@@ -411,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
-static ssize_t rapl_sysfs_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct perf_pmu_events_attr *pmu_attr = \
- container_of(attr, struct perf_pmu_events_attr, attr);
-
- if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
-
- return 0;
-}
-
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 560e5255b15e..3bf41d413775 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
static bool pcidrv_registered;
struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+ struct pci2phy_map *map;
+ int phys_id = -1;
+
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(bus)) {
+ phys_id = map->pbus_to_physid[bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+ struct pci2phy_map *map, *alloc = NULL;
+ int i;
+
+ lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment)
+ goto end;
+ }
+
+ if (!alloc) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ raw_spin_lock(&pci2phy_map_lock);
+
+ if (!alloc)
+ return NULL;
+
+ goto lookup;
+ }
+
+ map = alloc;
+ alloc = NULL;
+ map->segment = segment;
+ for (i = 0; i < 256; i++)
+ map->pbus_to_physid[i] = -1;
+ list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+ kfree(alloc);
+ return map;
+}
+
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -809,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
int phys_id;
bool first_box = false;
- phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
@@ -830,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
* each box has a different function id.
*/
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ /* Knights Landing uses a common PCI device ID for multiple instances of
+ * an uncore PMU device type. There is only one entry per device type in
+ * the knl_uncore_pci_ids table inspite of multiple devices present for
+ * some device types. Hence PCI device idx would be 0 for all devices.
+ * So increment pmu pointer to point to an unused array element.
+ */
+ if (boot_cpu_data.x86_model == 87)
+ while (pmu->func_id >= 0)
+ pmu++;
if (pmu->func_id < 0)
pmu->func_id = pdev->devfn;
else
@@ -856,9 +919,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ int i, cpu, phys_id;
bool last_box = false;
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
@@ -911,6 +975,7 @@ static int __init uncore_pci_init(void)
case 63: /* Haswell-EP */
ret = hswep_uncore_pci_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
ret = bdx_uncore_pci_init();
break;
@@ -927,6 +992,12 @@ static int __init uncore_pci_init(void)
case 61: /* Broadwell */
ret = bdw_uncore_pci_init();
break;
+ case 87: /* Knights Landing */
+ ret = knl_uncore_pci_init();
+ break;
+ case 94: /* SkyLake */
+ ret = skl_uncore_pci_init();
+ break;
default:
return 0;
}
@@ -1232,9 +1303,13 @@ static int __init uncore_cpu_init(void)
case 63: /* Haswell-EP */
hswep_uncore_cpu_init();
break;
+ case 79: /* BDX-EP */
case 86: /* BDX-DE */
bdx_uncore_cpu_init();
break;
+ case 87: /* Knights Landing */
+ knl_uncore_cpu_init();
+ break;
default:
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 72c54c2e5b1a..a7086b862156 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
};
+struct pci2phy_map {
+ struct list_head list;
+ int segment;
+ int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
@@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
extern struct event_constraint uncore_constraint_empty;
@@ -326,8 +336,10 @@ int snb_uncore_pci_init(void);
int ivb_uncore_pci_init(void);
int hsw_uncore_pci_init(void);
int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
/* perf_event_intel_uncore_snbep.c */
int snbep_uncore_pci_init(void);
@@ -338,6 +350,8 @@ int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index f78574b3cb55..2bd030ddd0db 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -8,6 +8,7 @@
#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -417,18 +418,28 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
}
}
-static int snb_pci2phy_map_init(int devid)
+int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
- int bus;
+ struct pci2phy_map *map;
+ int bus, segment;
dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
if (!dev)
return -ENOTTY;
bus = dev->bus->number;
-
- uncore_pcibus_to_physid[bus] = 0;
+ segment = pci_domain_nr(dev->bus);
+
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ pci_dev_put(dev);
+ return -ENOMEM;
+ }
+ map->pbus_to_physid[bus] = 0;
+ raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
@@ -514,6 +525,14 @@ static const struct pci_device_id bdw_uncore_pci_ids[] = {
{ /* end: all zeroes */ },
};
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
static struct pci_driver snb_uncore_pci_driver = {
.name = "snb_uncore",
.id_table = snb_uncore_pci_ids,
@@ -534,6 +553,11 @@ static struct pci_driver bdw_uncore_pci_driver = {
.id_table = bdw_uncore_pci_ids,
};
+static struct pci_driver skl_uncore_pci_driver = {
+ .name = "skl_uncore",
+ .id_table = skl_uncore_pci_ids,
+};
+
struct imc_uncore_pci_dev {
__u32 pci_id;
struct pci_driver *driver;
@@ -548,6 +572,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
+ IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */
{ /* end marker */ }
};
@@ -600,6 +625,11 @@ int bdw_uncore_pci_init(void)
return imc_uncore_pci_init();
}
+int skl_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 694510a887dc..33acb884ccf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -209,31 +209,98 @@
#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET 0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
+#define KNL_UCLK_MSR_PMON_CTL0 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454
+#define KNL_PMON_FIXED_CTL_EN 0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+ KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_CBO_PMON_CTL_TID_EN | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -315,8 +382,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe
static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+ pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
}
static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
@@ -1087,7 +1155,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid;
+ int i, bus, nodeid, segment;
+ struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1106,16 +1175,27 @@ static int snbep_pci2phy_map_init(int devid)
err = pci_read_config_dword(ubox_dev, 0x54, &config);
if (err)
break;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
/*
* every three bits in the Node ID mapping register maps
* to a particular node.
*/
for (i = 0; i < 8; i++) {
if (nodeid == ((config >> (3 * i)) & 0x7)) {
- uncore_pcibus_to_physid[bus] = i;
+ map->pbus_to_physid[bus] = i;
break;
}
}
+ raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1123,13 +1203,17 @@ static int snbep_pci2phy_map_init(int devid)
* For PCI bus with no UBOX device, find the next bus
* that has UBOX device and use its mapping.
*/
- i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (uncore_pcibus_to_physid[bus] >= 0)
- i = uncore_pcibus_to_physid[bus];
- else
- uncore_pcibus_to_physid[bus] = i;
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
+ raw_spin_unlock(&pci2phy_map_lock);
}
pci_dev_put(ubox_dev);
@@ -1712,6 +1796,419 @@ int ivbep_uncore_pci_init(void)
}
/* end of IvyTown uncore support */
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link3.attr,
+ &format_attr_filter_state4.attr,
+ &format_attr_filter_local.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_nnm.attr,
+ &format_attr_filter_opc3.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+ EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x4)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+ return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = knl_cha_hw_config,
+ .get_constraint = knl_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 38,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = KNL_CHA_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = knl_uncore_cha_constraints,
+ .ops = &knl_uncore_cha_ops,
+ .format_group = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+ &format_attr_event2.attr,
+ &format_attr_use_occ_ctr.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh6.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+ &knl_uncore_ubox,
+ &knl_uncore_cha,
+ &knl_uncore_pcu,
+ NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+ == UNCORE_FIXED_EVENT)
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | KNL_PMON_FIXED_CTL_EN);
+ else
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = knl_uncore_imc_enable_box,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .enable_event = knl_uncore_imc_enable_event,
+ .disable_event = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+ .name = "imc_uclk",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+ .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+ .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+ .name = "edc_uclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+ .name = "edc_eclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+ .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+ .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = knl_uncore_m2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
+ .ops = &snbep_uncore_pci_ops,
+ .format_group = &knl_uncore_irp_format_group,
+};
+
+enum {
+ KNL_PCI_UNCORE_MC_UCLK,
+ KNL_PCI_UNCORE_MC_DCLK,
+ KNL_PCI_UNCORE_EDC_UCLK,
+ KNL_PCI_UNCORE_EDC_ECLK,
+ KNL_PCI_UNCORE_M2PCIE,
+ KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+ [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
+ [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
+ [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
+ [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
+ [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
+ [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
+ NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ * PCI Device ID Uncore PMU Devices
+ * ----------------------------------
+ * 0x7841 MC0 UClk, MC1 UClk
+ * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ * 0x7817 M2PCIe
+ * 0x7814 IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+ { /* MC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+ },
+ { /* MC DClk Channel */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* EDC UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* M2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+ .name = "knl_uncore",
+ .id_table = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+ int ret;
+
+ /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+ ret = snb_pci2phy_map_init(0x7814); /* IRP */
+ if (ret)
+ return ret;
+ ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+ if (ret)
+ return ret;
+ uncore_pci_uncores = knl_pci_uncores;
+ uncore_pci_driver = &knl_uncore_pci_driver;
+ return 0;
+}
+
+/* end of KNL uncore support */
+
/* Haswell-EP uncore support */
static struct attribute *hswep_uncore_ubox_formats_attr[] = {
&format_attr_event.attr,
@@ -2322,7 +2819,7 @@ int hswep_uncore_pci_init(void)
}
/* end of Haswell-EP uncore support */
-/* BDX-DE uncore support */
+/* BDX uncore support */
static struct intel_uncore_type bdx_uncore_ubox = {
.name = "ubox",
@@ -2344,13 +2841,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
EVENT_CONSTRAINT_END
};
static struct intel_uncore_type bdx_uncore_cbox = {
.name = "cbox",
.num_counters = 4,
- .num_boxes = 8,
+ .num_boxes = 24,
.perf_ctr_bits = 48,
.event_ctl = HSWEP_C0_MSR_PMON_CTL0,
.perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
@@ -2363,9 +2861,24 @@ static struct intel_uncore_type bdx_uncore_cbox = {
.format_group = &hswep_uncore_cbox_format_group,
};
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
static struct intel_uncore_type *bdx_msr_uncores[] = {
&bdx_uncore_ubox,
&bdx_uncore_cbox,
+ &bdx_uncore_sbox,
&hswep_uncore_pcu,
NULL,
};
@@ -2380,7 +2893,7 @@ void bdx_uncore_cpu_init(void)
static struct intel_uncore_type bdx_uncore_ha = {
.name = "ha",
.num_counters = 4,
- .num_boxes = 1,
+ .num_boxes = 2,
.perf_ctr_bits = 48,
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
@@ -2388,7 +2901,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
static struct intel_uncore_type bdx_uncore_imc = {
.name = "imc",
.num_counters = 5,
- .num_boxes = 2,
+ .num_boxes = 8,
.perf_ctr_bits = 48,
.fixed_ctr_bits = 48,
.fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
@@ -2408,6 +2921,19 @@ static struct intel_uncore_type bdx_uncore_irp = {
.format_group = &snbep_uncore_format_group,
};
+static struct intel_uncore_type bdx_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
@@ -2416,6 +2942,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
EVENT_CONSTRAINT_END
};
@@ -2429,26 +2957,77 @@ static struct intel_uncore_type bdx_uncore_r2pcie = {
SNBEP_UNCORE_PCI_COMMON_INIT(),
};
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
enum {
BDX_PCI_UNCORE_HA,
BDX_PCI_UNCORE_IMC,
BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_QPI,
BDX_PCI_UNCORE_R2PCIE,
+ BDX_PCI_UNCORE_R3QPI,
};
static struct intel_uncore_type *bdx_pci_uncores[] = {
[BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
[BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
[BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi,
[BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi,
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = {
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
{ /* Home Agent 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
},
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+ },
{ /* MC0 Channel 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
@@ -2457,14 +3036,74 @@ static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
},
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+ },
{ /* IRP */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
},
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+ },
{ /* R2PCIe */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
},
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+ },
+ { /* QPI Port 2 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+ },
{ /* end: all zeroes */ }
};
@@ -2484,4 +3123,4 @@ int bdx_uncore_pci_init(void)
return 0;
}
-/* end of BDX-DE uncore support */
+/* end of BDX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
index f32ac13934f2..ec863b9a9f78 100644
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -163,10 +163,9 @@ again:
goto again;
delta = now - prev;
- if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
- delta <<= 32;
- delta >>= 32; /* sign extend */
- }
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+ delta = sign_extend64(delta, 31);
+
local64_add(now - prev, &event->count);
}
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 136ac74dee82..819d94982e07 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -33,28 +33,27 @@ static int __init x86_rdrand_setup(char *s)
__setup("nordrand", x86_rdrand_setup);
/*
- * Force a reseed cycle; we are architecturally guaranteed a reseed
- * after no more than 512 128-bit chunks of random data. This also
- * acts as a test of the CPU capability.
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check.
+ * If it fails, it is simple to disable RDRAND here.
*/
-#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
+#define SANITY_CHECK_LOOPS 8
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
- int i, count, ok;
+ int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
- return; /* Nothing to do */
+ return;
- for (count = i = 0; i < RESEED_LOOP; i++) {
- ok = rdrand_long(&tmp);
- if (ok)
- count++;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (!rdrand_long(&tmp)) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ printk_once(KERN_WARNING "rdrand: disabled\n");
+ return;
+ }
}
-
- if (count != RESEED_LOOP)
- clear_cpu_cap(c, X86_FEATURE_RDRAND);
#endif
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 608fb26c7254..8cb57df9398d 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,32 +31,12 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
- { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
- { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
- { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 },
- { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 },
- { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
- { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
- { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
- { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
- { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
- { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
- { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
- { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
- { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 3fa0e5ad86b4..252da7aceca6 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -12,7 +12,7 @@ static void early_init_transmeta(struct cpuinfo_x86 *c)
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
}
@@ -82,7 +82,7 @@ static void init_transmeta(struct cpuinfo_x86 *c)
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
- c->x86_capability[0] = cpuid_edx(0x00000001);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index bd3507da39f0..2836de390f95 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -58,28 +58,6 @@ static void cpuid_smp_cpuid(void *cmd_block)
&cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
}
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -132,7 +110,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
- .llseek = cpuid_seek,
+ .llseek = no_seek_end_llseek,
.read = cpuid_read,
.open = cpuid_open,
};
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 74ca2fe7a0b3..58f34319b29a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -35,6 +35,7 @@
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>
+#include <asm/intel_pt.h>
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
@@ -75,8 +76,6 @@ struct crash_memmap_data {
unsigned int type;
};
-int in_crash_kexec;
-
/*
* This is used to VMCLEAR all VMCSs loaded on the
* processor. And when loading kvm_intel module, the
@@ -127,12 +126,16 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
disable_local_APIC();
}
static void kdump_nmi_shootdown_cpus(void)
{
- in_crash_kexec = 1;
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
@@ -172,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a102564d08eb..569c1e4f96fe 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -911,7 +911,7 @@ void __init finish_e820_parsing(void)
}
}
-static inline const char *e820_type_to_string(int e820_type)
+static const char *e820_type_to_string(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f9cc682e561..bca14c899137 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -547,6 +547,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_CHV_IDS(&chv_stolen_funcs),
INTEL_SKL_IDS(&gen9_stolen_funcs),
INTEL_BXT_IDS(&gen9_stolen_funcs),
+ INTEL_KBL_IDS(&gen9_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
@@ -584,7 +585,7 @@ static void __init intel_graphics_stolen(int num, int slot, int func)
static void __init force_disable_hpet(int num, int slot, int func)
{
#ifdef CONFIG_HPET_TIMER
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
#endif
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index eec40f595ab9..21bf92490a7b 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -195,14 +195,14 @@ static __init void early_serial_init(char *s)
#ifdef CONFIG_PCI
static void mem32_serial_out(unsigned long addr, int offset, int value)
{
- u32 *vaddr = (u32 *)addr;
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
writel(value, vaddr + offset);
}
static unsigned int mem32_serial_in(unsigned long addr, int offset)
{
- u32 *vaddr = (u32 *)addr;
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
/* shift implied by pointer type */
return readl(vaddr + offset);
}
@@ -316,7 +316,7 @@ static struct console early_serial_console = {
.index = -1,
};
-static inline void early_console_register(struct console *con, int keep_early)
+static void early_console_register(struct console *con, int keep_early)
{
if (con->index != -1) {
printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d14e9ac3235a..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
*/
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
#include <linux/sched.h>
+#include <linux/init.h>
/*
* Initialize the TS bit in CR0 according to the style of context-switches
@@ -12,7 +15,7 @@
*/
static void fpu__init_cpu_ctx_switch(void)
{
- if (!cpu_has_eager_fpu)
+ if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
stts();
else
clts();
@@ -143,9 +146,18 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
-/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+/* Get alignment of the TYPE. */
+#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+ BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
+ TYPE_ALIGN(TYPE)))
/*
* We append the 'struct fpu' to the task_struct:
@@ -188,7 +200,7 @@ static void __init fpu__init_task_struct_size(void)
*/
static void __init fpu__init_system_xstate_size_legacy(void)
{
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -261,24 +273,56 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
{
- if (!strcmp(s, "on"))
- eagerfpu = ENABLE;
- else if (!strcmp(s, "off"))
- eagerfpu = DISABLE;
- else if (!strcmp(s, "auto"))
- eagerfpu = AUTO;
- return 1;
+ /* Support all xfeatures known to us */
+ if (eagerfpu != DISABLE)
+ return XCNTXT_MASK;
+
+ /* Warning of xfeatures being disabled for no eagerfpu mode */
+ if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+ pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+ xfeatures_mask & XFEATURE_MASK_EAGER);
+ }
+
+ /* Return a mask that masks out all features requiring eagerfpu mode */
+ return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
-__setup("eagerfpu=", eager_fpu_setup);
/*
* Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ * FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ * switching. Should the kernel boot with noxsaveopt, we support MPX
+ * with eager FPU switching at a higher cost.
*/
static void __init fpu__init_system_ctx_switch(void)
{
- static bool on_boot_cpu = 1;
+ static bool on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -286,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(current->thread.fpu.fpstate_active);
current_thread_info()->status = 0;
- /* Auto enable eagerfpu for xsaveopt */
- if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+ if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
- if (xfeatures_mask & XSTATE_EAGER) {
- if (eagerfpu == DISABLE) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XSTATE_EAGER);
- xfeatures_mask &= ~XSTATE_EAGER;
- } else {
- eagerfpu = ENABLE;
- }
- }
+ if (xfeatures_mask & XFEATURE_MASK_EAGER)
+ eagerfpu = ENABLE;
if (eagerfpu == ENABLE)
setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -307,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
}
/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ /*
+ * No need to check "eagerfpu=auto" again, since it is the
+ * initial default.
+ */
+ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+ eagerfpu = DISABLE;
+ fpu__clear_eager_fpu_features();
+ } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+ eagerfpu = ENABLE;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpu__init_parse_early_param();
fpu__init_system_early_generic(c);
/*
@@ -335,72 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_ctx_switch();
}
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FPU);
- return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
- if (strlen(s))
- return 0;
-
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
- setup_clear_cpu_cap(X86_FEATURE_MPX);
-
- return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
- return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
-
- return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index dc60810c1c74..0bc3490420c5 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -66,7 +66,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
* presence of FP and SSE state.
*/
if (cpu_has_xsave)
- fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
return ret;
}
@@ -326,7 +326,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
* presence of FP.
*/
if (cpu_has_xsave)
- fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
return ret;
}
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 50ec9af1bd51..31c6a60505e6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -56,7 +56,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
if (use_fxsr()) {
struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
struct user_i387_ia32_struct env;
- struct _fpstate_ia32 __user *fp = buf;
+ struct _fpstate_32 __user *fp = buf;
convert_from_fxsr(&env, tsk);
@@ -107,7 +107,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
* header as well as change any contents in the memory layout.
* xrestore as part of sigreturn will capture all the changes.
*/
- xfeatures |= XSTATE_FPSSE;
+ xfeatures |= XFEATURE_MASK_FPSSE;
err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
@@ -165,7 +165,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
if (!static_cpu_has(X86_FEATURE_FPU))
return fpregs_soft_get(current, NULL, 0,
sizeof(struct user_i387_ia32_struct), NULL,
- (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
+ (struct _fpstate_32 __user *) buf) ? -1 : 1;
if (fpregs_active()) {
/* Save the live register state to the user directly. */
@@ -207,7 +207,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
* layout and not enabled by the OS.
*/
if (fx_only)
- header->xfeatures = XSTATE_FPSSE;
+ header->xfeatures = XFEATURE_MASK_FPSSE;
else
header->xfeatures &= (xfeatures_mask & xfeatures);
}
@@ -230,7 +230,7 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_
{
if (use_xsave()) {
if ((unsigned long)buf % 64 || fx_only) {
- u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+ u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_fxregs(buf);
} else {
@@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
*/
void fpu__init_prepare_fx_sw_frame(void)
{
- int fsave_header_size = sizeof(struct fregs_state);
int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
- if (config_enabled(CONFIG_X86_32))
- size += fsave_header_size;
-
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
fx_sw_reserved.xfeatures = xfeatures_mask;
fx_sw_reserved.xstate_size = xstate_size;
- if (config_enabled(CONFIG_IA32_EMULATION)) {
+ if (config_enabled(CONFIG_IA32_EMULATION) ||
+ config_enabled(CONFIG_X86_32)) {
+ int fsave_header_size = sizeof(struct fregs_state);
+
fx_sw_reserved_ia32 = fx_sw_reserved;
- fx_sw_reserved_ia32.extended_size += fsave_header_size;
+ fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
}
}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 62fc001c7846..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -31,12 +31,29 @@ static const char *xfeature_names[] =
*/
u64 xfeatures_mask __read_mostly;
-static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
-static unsigned int xstate_sizes[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
-/* The number of supported xfeatures in xfeatures_mask: */
-static unsigned int xfeatures_nr;
+/*
+ * Clear all of the X86_FEATURE_* bits that are unavailable
+ * when the CPU has no XSAVE support.
+ */
+void fpu__xstate_clear_all_cpu_caps(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+}
/*
* Return whether the system supports a given xfeature.
@@ -53,7 +70,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
/*
* So we use FLS here to be able to print the most advanced
* feature that was requested but is missing. So if a driver
- * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+ * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
* missing AVX feature - this is the most informative message
* to users:
*/
@@ -112,7 +129,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
/*
* FP is in init state
*/
- if (!(xfeatures & XSTATE_FP)) {
+ if (!(xfeatures & XFEATURE_MASK_FP)) {
fx->cwd = 0x37f;
fx->swd = 0;
fx->twd = 0;
@@ -125,7 +142,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
/*
* SSE is in init state
*/
- if (!(xfeatures & XSTATE_SSE))
+ if (!(xfeatures & XFEATURE_MASK_SSE))
memset(&fx->xmm_space[0], 0, 256);
/*
@@ -169,25 +186,43 @@ void fpu__init_cpu_xstate(void)
}
/*
+ * Note that in the future we will likely need a pair of
+ * functions here: one for user xstates and the other for
+ * system xstates. For now, they are the same.
+ */
+static int xfeature_enabled(enum xfeature xfeature)
+{
+ return !!(xfeatures_mask & (1UL << xfeature));
+}
+
+/*
* Record the offsets and sizes of various xstates contained
* in the XSAVE state memory layout.
- *
- * ( Note that certain features might be non-present, for them
- * we'll have 0 offset and 0 size. )
*/
static void __init setup_xstate_features(void)
{
- u32 eax, ebx, ecx, edx, leaf;
-
- xfeatures_nr = fls64(xfeatures_mask);
-
- for (leaf = 2; leaf < xfeatures_nr; leaf++) {
- cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
-
- xstate_offsets[leaf] = ebx;
- xstate_sizes[leaf] = eax;
+ u32 eax, ebx, ecx, edx, i;
+ /* start at the beginnning of the "extended state" */
+ unsigned int last_good_offset = offsetof(struct xregs_state,
+ extended_state_area);
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ xstate_offsets[i] = ebx;
+ xstate_sizes[i] = eax;
+ /*
+ * In our xstate size checks, we assume that the
+ * highest-numbered xstate feature has the
+ * highest offset in the buffer. Ensure it does.
+ */
+ WARN_ONCE(last_good_offset > xstate_offsets[i],
+ "x86/fpu: misordered xstate at %d\n", last_good_offset);
+ last_good_offset = xstate_offsets[i];
- printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax);
+ printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax);
}
}
@@ -204,14 +239,14 @@ static void __init print_xstate_feature(u64 xstate_mask)
*/
static void __init print_xstate_features(void)
{
- print_xstate_feature(XSTATE_FP);
- print_xstate_feature(XSTATE_SSE);
- print_xstate_feature(XSTATE_YMM);
- print_xstate_feature(XSTATE_BNDREGS);
- print_xstate_feature(XSTATE_BNDCSR);
- print_xstate_feature(XSTATE_OPMASK);
- print_xstate_feature(XSTATE_ZMM_Hi256);
- print_xstate_feature(XSTATE_Hi16_ZMM);
+ print_xstate_feature(XFEATURE_MASK_FP);
+ print_xstate_feature(XFEATURE_MASK_SSE);
+ print_xstate_feature(XFEATURE_MASK_YMM);
+ print_xstate_feature(XFEATURE_MASK_BNDREGS);
+ print_xstate_feature(XFEATURE_MASK_BNDCSR);
+ print_xstate_feature(XFEATURE_MASK_OPMASK);
+ print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
+ print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
}
/*
@@ -233,8 +268,8 @@ static void __init setup_xstate_comp(void)
xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
if (!cpu_has_xsaves) {
- for (i = 2; i < xfeatures_nr; i++) {
- if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (xfeature_enabled(i)) {
xstate_comp_offsets[i] = xstate_offsets[i];
xstate_comp_sizes[i] = xstate_sizes[i];
}
@@ -242,15 +277,16 @@ static void __init setup_xstate_comp(void)
return;
}
- xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
+ FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = 2; i < xfeatures_nr; i++) {
- if (test_bit(i, (unsigned long *)&xfeatures_mask))
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (xfeature_enabled(i))
xstate_comp_sizes[i] = xstate_sizes[i];
else
xstate_comp_sizes[i] = 0;
- if (i > 2)
+ if (i > FIRST_EXTENDED_XFEATURE)
xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ xstate_comp_sizes[i-1];
@@ -262,7 +298,7 @@ static void __init setup_xstate_comp(void)
*/
static void __init setup_init_fpu_buf(void)
{
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -290,27 +326,280 @@ static void __init setup_init_fpu_buf(void)
copy_xregs_to_kernel_booting(&init_fpstate.xsave);
}
+static int xfeature_is_supervisor(int xfeature_nr)
+{
+ /*
+ * We currently do not support supervisor states, but if
+ * we did, we could find out like this.
+ *
+ * SDM says: If state component i is a user state component,
+ * ECX[0] return 0; if state component i is a supervisor
+ * state component, ECX[0] returns 1.
+ u32 eax, ebx, ecx, edx;
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx;
+ return !!(ecx & 1);
+ */
+ return 0;
+}
+/*
+static int xfeature_is_user(int xfeature_nr)
+{
+ return !xfeature_is_supervisor(xfeature_nr);
+}
+*/
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do { \
+ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
+ WARN_ON(nr >= XFEATURE_MAX); \
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ /*
+ * The value returned by ECX[1] indicates the alignment
+ * of state component i when the compacted format
+ * of the extended region of an XSAVE area is used
+ */
+ return !!(ecx & 2);
+}
+
+static int xfeature_uncompacted_offset(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+static int xfeature_size(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+/*
+ * 'XSAVES' implies two different things:
+ * 1. saving of supervisor/system state
+ * 2. using the compacted format
+ *
+ * Use this function when dealing with the compacted format so
+ * that it is obvious which aspect of 'XSAVES' is being handled
+ * by the calling code.
+ */
+static int using_compacted_format(void)
+{
+ return cpu_has_xsaves;
+}
+
+static void __xstate_dump_leaves(void)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+ static int should_dump = 1;
+
+ if (!should_dump)
+ return;
+ should_dump = 0;
+ /*
+ * Dump out a few leaves past the ones that we support
+ * just in case there are some goodies up there
+ */
+ for (i = 0; i < XFEATURE_MAX + 10; i++) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ XSTATE_CPUID, i, eax, ebx, ecx, edx);
+ }
+}
+
+#define XSTATE_WARN_ON(x) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \
+ if ((nr == nr_macro) && \
+ WARN_ONCE(sz != sizeof(__struct), \
+ "%s: struct is %zu bytes, cpu state %d bytes\n", \
+ __stringify(nr_macro), sizeof(__struct), sz)) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+/*
+ * We have a C struct for each 'xstate'. We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static void check_xstate_against_struct(int nr)
+{
+ /*
+ * Ask the CPU for the size of the state.
+ */
+ int sz = xfeature_size(nr);
+ /*
+ * Match each CPU state with the corresponding software
+ * structure.
+ */
+ XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
+ XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
+ XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
+ XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
+
+ /*
+ * Make *SURE* to add any feature numbers in below if
+ * there are "holes" in the xsave state component
+ * numbers.
+ */
+ if ((nr < XFEATURE_YMM) ||
+ (nr >= XFEATURE_MAX)) {
+ WARN_ONCE(1, "no structure for xstate: %d\n", nr);
+ XSTATE_WARN_ON(1);
+ }
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be. We are recalculating
+ * it to be safe.
+ */
+static void do_extra_xstate_size_checks(void)
+{
+ int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+
+ check_xstate_against_struct(i);
+ /*
+ * Supervisor state components can be managed only by
+ * XSAVES, which is compacted-format only.
+ */
+ if (!using_compacted_format())
+ XSTATE_WARN_ON(xfeature_is_supervisor(i));
+
+ /* Align from the end of the previous feature */
+ if (xfeature_is_aligned(i))
+ paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+ /*
+ * The offset of a given state in the non-compacted
+ * format is given to us in a CPUID leaf. We check
+ * them for being ordered (increasing offsets) in
+ * setup_xstate_features().
+ */
+ if (!using_compacted_format())
+ paranoid_xstate_size = xfeature_uncompacted_offset(i);
+ /*
+ * The compacted-format offset always depends on where
+ * the previous state ended.
+ */
+ paranoid_xstate_size += xfeature_size(i);
+ }
+ XSTATE_WARN_ON(paranoid_xstate_size != xstate_size);
+}
+
/*
* Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ *
+ * Note the SDM's wording here. "sub-function 0" only enumerates
+ * the size of the *user* states. If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * Note that we do not currently set any bits on IA32_XSS so
+ * 'XCR0 | IA32_XSS == XCR0' for now.
*/
-static void __init init_xstate_size(void)
+static unsigned int __init calculate_xstate_size(void)
{
unsigned int eax, ebx, ecx, edx;
- int i;
+ unsigned int calculated_xstate_size;
if (!cpu_has_xsaves) {
+ /*
+ * - CPUID function 0DH, sub-function 0:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVE instruction for an XSAVE area
+ * containing all the *user* state components
+ * corresponding to bits currently set in XCR0.
+ */
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
- return;
+ calculated_xstate_size = ebx;
+ } else {
+ /*
+ * - CPUID function 0DH, sub-function 1:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVES instruction for an XSAVE area
+ * containing all the state components
+ * corresponding to bits currently set in
+ * XCR0 | IA32_XSS.
+ */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ calculated_xstate_size = ebx;
}
+ return calculated_xstate_size;
+}
- xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
- for (i = 2; i < 64; i++) {
- if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
- cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
- xstate_size += eax;
- }
- }
+/*
+ * Will the runtime-enumerated 'xstate_size' fit in the init
+ * task's statically-allocated buffer?
+ */
+static bool is_supported_xstate_size(unsigned int test_xstate_size)
+{
+ if (test_xstate_size <= sizeof(union fpregs_state))
+ return true;
+
+ pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
+ sizeof(union fpregs_state), test_xstate_size);
+ return false;
+}
+
+static int init_xstate_size(void)
+{
+ /* Recompute the context size for enabled features: */
+ unsigned int possible_xstate_size = calculate_xstate_size();
+
+ /* Ensure we have the space to store all enabled: */
+ if (!is_supported_xstate_size(possible_xstate_size))
+ return -EINVAL;
+
+ /*
+ * The size is OK, we are definitely going to use xsave,
+ * make it known to the world that we need more space.
+ */
+ xstate_size = possible_xstate_size;
+ do_extra_xstate_size_checks();
+ return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it. Disable it.
+ */
+static void fpu__init_disable_system_xstate(void)
+{
+ xfeatures_mask = 0;
+ cr4_clear_bits(X86_CR4_OSXSAVE);
+ fpu__xstate_clear_all_cpu_caps();
}
/*
@@ -320,7 +609,8 @@ static void __init init_xstate_size(void)
void __init fpu__init_system_xstate(void)
{
unsigned int eax, ebx, ecx, edx;
- static int on_boot_cpu = 1;
+ static int on_boot_cpu __initdata = 1;
+ int err;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -338,26 +628,28 @@ void __init fpu__init_system_xstate(void)
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xfeatures_mask = eax + ((u64)edx << 32);
- if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+ if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
BUG();
}
- /* Support only the state known to the OS: */
- xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+ xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
-
- /* Recompute the context size for enabled features: */
- init_xstate_size();
+ err = init_xstate_size();
+ if (err) {
+ /* something went wrong, boot without any XSAVE support */
+ fpu__init_disable_system_xstate();
+ return;
+ }
update_regset_xstate_info(xstate_size, xfeatures_mask);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp();
- pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+ pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
xfeatures_mask,
xstate_size,
cpu_has_xsaves ? "compacted" : "standard");
@@ -388,7 +680,7 @@ void fpu__resume_cpu(void)
* Inputs:
* xstate: the thread's storage area for all FPU data
* xstate_feature: state which is defined in xsave.h (e.g.
- * XSTATE_FP, XSTATE_SSE, etc...)
+ * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
* Output:
* address of the state in the xsave area, or NULL if the
* field is not present in the xsave buffer.
@@ -402,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return NULL;
- xsave = &current->thread.fpu.state.xsave;
/*
* We should not ever be requesting features that we
* have not enabled. Remember that pcntxt_mask is
@@ -439,8 +730,8 @@ EXPORT_SYMBOL_GPL(get_xsave_addr);
* Note that this only works on the current task.
*
* Inputs:
- * @xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP,
- * XSTATE_SSE, etc...)
+ * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ * XFEATURE_MASK_SSE, etc...)
* Output:
* address of the state in the xsave area or NULL if the state
* is not present or is in its 'init state'.
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8b7b0a51e742..29408d6d6626 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -105,14 +105,14 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
{
unsigned char replaced[MCOUNT_INSN_SIZE];
+ ftrace_expected = old_code;
+
/*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
+ * Note:
+ * We are paranoid about modifying text, as if a bug was to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read and modify the code with probe_kernel_*(), and make
+ * sure what we read is what we expected it to be before modifying it.
*/
/* read the text we want to modify */
@@ -154,6 +154,8 @@ int ftrace_make_nop(struct module *mod,
if (addr == MCOUNT_ADDR)
return ftrace_modify_code_direct(rec->ip, old, new);
+ ftrace_expected = NULL;
+
/* Normal cases use add_brk_on_nop */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
@@ -220,6 +222,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
+ ftrace_expected = NULL;
return -EINVAL;
}
@@ -314,6 +317,8 @@ static int add_break(unsigned long ip, const char *old)
if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
+ ftrace_expected = old;
+
/* Make sure it is what we expect it to be */
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
@@ -413,6 +418,8 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
+ ftrace_expected = nop;
+
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
@@ -556,6 +563,7 @@ void ftrace_replace_code(int enable)
run_sync();
report = "updating code";
+ count = 0;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
@@ -563,11 +571,13 @@ void ftrace_replace_code(int enable)
ret = add_update(rec, enable);
if (ret)
goto remove_breakpoints;
+ count++;
}
run_sync();
report = "removing breakpoints";
+ count = 0;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
@@ -575,6 +585,7 @@ void ftrace_replace_code(int enable)
ret = finish_update(rec, enable);
if (ret)
goto remove_breakpoints;
+ count++;
}
run_sync();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f129a9af6357..2c0f3407bd1f 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -192,5 +192,13 @@ void __init x86_64_start_reservations(char *real_mode_data)
reserve_ebda_region();
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ default:
+ break;
+ }
+
start_kernel();
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0e2d96ffd158..6bc9ae24b6d2 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -152,7 +152,7 @@ ENTRY(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
-#ifdef CONFIG_MICROCODE_EARLY
+#ifdef CONFIG_MICROCODE
/* Early load ucode on BSP. */
call load_ucode_bsp
#endif
@@ -311,12 +311,11 @@ ENTRY(startup_32_smp)
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#ifdef CONFIG_MICROCODE_EARLY
+#ifdef CONFIG_MICROCODE
/* Early load ucode on AP. */
call load_ucode_ap
#endif
-
default_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8a73f2..ffdc0e860390 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -65,6 +65,9 @@ startup_64:
* tables and then reload them.
*/
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
@@ -174,6 +177,9 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
@@ -288,6 +294,8 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
+#include "verify_cpu.S"
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 88b4da373081..b8e6ff5cd5d0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -37,10 +37,10 @@
*/
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
-u8 hpet_msi_disable;
+bool hpet_msi_disable;
#ifdef CONFIG_PCI_MSI
-static unsigned long hpet_num_timers;
+static unsigned int hpet_num_timers;
#endif
static void __iomem *hpet_virt_address;
@@ -86,9 +86,9 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-int boot_hpet_disable;
-int hpet_force_user;
-static int hpet_verbose;
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
static int __init hpet_setup(char *str)
{
@@ -98,11 +98,11 @@ static int __init hpet_setup(char *str)
if (next)
*next++ = 0;
if (!strncmp("disable", str, 7))
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
if (!strncmp("force", str, 5))
- hpet_force_user = 1;
+ hpet_force_user = true;
if (!strncmp("verbose", str, 7))
- hpet_verbose = 1;
+ hpet_verbose = true;
str = next;
}
return 1;
@@ -111,7 +111,7 @@ __setup("hpet=", hpet_setup);
static int __init disable_hpet(char *str)
{
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
return 1;
}
__setup("nohpet", disable_hpet);
@@ -124,7 +124,7 @@ static inline int is_hpet_capable(void)
/*
* HPET timer interrupt enable / disable
*/
-static int hpet_legacy_int_enabled;
+static bool hpet_legacy_int_enabled;
/**
* is_hpet_enabled - check whether the hpet timer interrupt is enabled
@@ -230,7 +230,7 @@ static struct clock_event_device hpet_clockevent;
static void hpet_stop_counter(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ u32 cfg = hpet_readl(HPET_CFG);
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -272,7 +272,7 @@ static void hpet_enable_legacy_int(void)
cfg |= HPET_CFG_LEGACY;
hpet_writel(cfg, HPET_CFG);
- hpet_legacy_int_enabled = 1;
+ hpet_legacy_int_enabled = true;
}
static void hpet_legacy_clockevent_register(void)
@@ -983,7 +983,7 @@ void hpet_disable(void)
cfg = *hpet_boot_cfg;
else if (hpet_legacy_int_enabled) {
cfg &= ~HPET_CFG_LEGACY;
- hpet_legacy_int_enabled = 0;
+ hpet_legacy_int_enabled = false;
}
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
@@ -1121,8 +1121,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
static void hpet_disable_rtc_channel(void)
{
- unsigned long cfg;
- cfg = hpet_readl(HPET_T1_CFG);
+ u32 cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T1_CFG);
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 50a3fad5b89f..2bcfb5f2bc44 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -300,6 +300,10 @@ static int arch_build_bp_info(struct perf_event *bp)
return -EINVAL;
if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
/*
* It's impossible to use a range breakpoint to fake out
* user vs kernel detection because bp_len - 1 can't
@@ -307,8 +311,6 @@ static int arch_build_bp_info(struct perf_event *bp)
* breakpoints, then we'll have to check for kprobe-blacklisted
* addresses anywhere in the range.
*/
- if (!cpu_has_bpext)
- return -EOPNOTSUPP;
info->mask = bp->attr.bp_len - 1;
info->len = X86_BREAKPOINT_LEN_1;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 16cb827a5b27..be22f5a2192e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -295,16 +295,11 @@ static void unmask_8259A(void)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void init_8259A(int auto_eoi)
+static int probe_8259A(void)
{
unsigned long flags;
unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
unsigned char new_val;
-
- i8259A_auto_eoi = auto_eoi;
-
- raw_spin_lock_irqsave(&i8259A_lock, flags);
-
/*
* Check to see if we have a PIC.
* Mask all except the cascade and read
@@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi)
* have a PIC, we will read 0xff as opposed to the
* value we wrote.
*/
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
outb(probe_val, PIC_MASTER_IMR);
new_val = inb(PIC_MASTER_IMR);
if (new_val != probe_val) {
printk(KERN_INFO "Using NULL legacy PIC\n");
legacy_pic = &null_legacy_pic;
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
- return;
}
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return nr_legacy_irqs();
+}
+
+static void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ i8259A_auto_eoi = auto_eoi;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
@@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
}
+static int legacy_pic_probe(void)
+{
+ return 0;
+}
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
@@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = {
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
.irq_pending = legacy_pic_irq_pending_noop,
.make_irq = legacy_pic_uint_noop,
};
@@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = {
.mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
+ .probe = probe_8259A,
.irq_pending = i8259A_irq_pending,
.make_irq = make_8259A_irq,
};
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f8062aaf5df9..61521dc19c10 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -462,7 +462,7 @@ void fixup_irqs(void)
* non intr-remapping case, we can't wait till this interrupt
* arrives at this cpu before completing the irq move.
*/
- irq_force_complete_move(irq);
+ irq_force_complete_move(desc);
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
@@ -470,6 +470,15 @@ void fixup_irqs(void)
}
chip = irq_data_get_irq_chip(data);
+ /*
+ * The interrupt descriptor might have been cleaned up
+ * already, but it is not yet removed from the radix tree
+ */
+ if (!chip) {
+ raw_spin_unlock(&desc->lock);
+ continue;
+ }
+
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index c767cf2bc80a..206d0b90a3ab 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -72,7 +72,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
stack_overflow_check(regs);
- if (unlikely(IS_ERR_OR_NULL(desc)))
+ if (IS_ERR_OR_NULL(desc))
return false;
generic_handle_irq_desc(desc);
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index dc5fa6a1e8d6..3512ba607361 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -1,7 +1,7 @@
/*
* x86 specific code for irq_work
*
- * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index d6178d9791db..44256a62702b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -511,26 +511,31 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
return NOTIFY_STOP;
}
-static int was_in_debug_nmi[NR_CPUS];
+static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS);
static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ int cpu;
+
switch (cmd) {
case NMI_LOCAL:
if (atomic_read(&kgdb_active) != -1) {
/* KGDB CPU roundup */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- was_in_debug_nmi[raw_smp_processor_id()] = 1;
+ cpu = raw_smp_processor_id();
+ kgdb_nmicallback(cpu, regs);
+ set_bit(cpu, was_in_debug_nmi);
touch_nmi_watchdog();
+
return NMI_HANDLED;
}
break;
case NMI_UNKNOWN:
- if (was_in_debug_nmi[raw_smp_processor_id()]) {
- was_in_debug_nmi[raw_smp_processor_id()] = 0;
+ cpu = raw_smp_processor_id();
+
+ if (__test_and_clear_bit(cpu, was_in_debug_nmi))
return NMI_HANDLED;
- }
+
break;
default:
/* do nothing */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 2c7aafa70702..72cef58693c7 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,6 +32,7 @@
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;
static int parse_no_kvmclock(char *arg)
{
@@ -44,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
+struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
+{
+ return hv_clock;
+}
+
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -92,6 +98,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}
+static cycle_t kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable) {
+ pv_time_ops.sched_clock = kvm_clock_read;
+ return;
+ }
+
+ kvm_sched_clock_offset = kvm_clock_read();
+ pv_time_ops.sched_clock = kvm_sched_clock_read;
+ set_sched_clock_stable();
+
+ printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +277,17 @@ void __init kvmclock_init(void)
memblock_free(mem, size);
return;
}
- pv_time_ops.sched_clock = kvm_clock_read;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ cpu = get_cpu();
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+ put_cpu();
+
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,23 +304,12 @@ void __init kvmclock_init(void)
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
-
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(~0);
-
- cpu = get_cpu();
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
- if (flags & PVCLOCK_COUNTS_FROM_ZERO)
- set_sched_clock_stable();
- put_cpu();
}
int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
int cpu;
- int ret;
u8 flags;
struct pvclock_vcpu_time_info *vcpu_time;
unsigned int size;
@@ -301,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void)
return 1;
}
- if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- put_cpu();
- return ret;
- }
-
put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
index ff3c3101d003..92fc1a51f994 100644
--- a/arch/x86/kernel/livepatch.c
+++ b/arch/x86/kernel/livepatch.c
@@ -20,8 +20,6 @@
#include <linux/module.h>
#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/page_types.h>
#include <asm/elf.h>
#include <asm/livepatch.h>
@@ -38,12 +36,10 @@
int klp_write_module_reloc(struct module *mod, unsigned long type,
unsigned long loc, unsigned long value)
{
- int ret, numpages, size = 4;
- bool readonly;
+ size_t size = 4;
unsigned long val;
- unsigned long core = (unsigned long)mod->module_core;
- unsigned long core_ro_size = mod->core_ro_size;
- unsigned long core_size = mod->core_size;
+ unsigned long core = (unsigned long)mod->core_layout.base;
+ unsigned long core_size = mod->core_layout.size;
switch (type) {
case R_X86_64_NONE:
@@ -70,21 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
/* loc does not point to any symbol inside the module */
return -EINVAL;
- if (loc < core + core_ro_size)
- readonly = true;
- else
- readonly = false;
-
- /* determine if the relocation spans a page boundary */
- numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
-
- if (readonly)
- set_memory_rw(loc & PAGE_MASK, numpages);
-
- ret = probe_kernel_write((void *)loc, &val, size);
-
- if (readonly)
- set_memory_ro(loc & PAGE_MASK, numpages);
-
- return ret;
+ return probe_kernel_write((void *)loc, &val, size);
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..ba7fbba9831b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
+#ifdef CONFIG_KEXEC_VERIFY_SIG
int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
unsigned long kernel_len)
{
@@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
return image->fops->verify_sig(kernel, kernel_len);
}
+#endif
/*
* Apply purgatory relocations.
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index 94ea120fa21f..87e1762e2bca 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -278,6 +278,12 @@ trace:
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
call *ftrace_trace_function
restore_mcount_regs
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 113e70784854..64f9616f93f1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -45,28 +45,6 @@
static struct class *msr_class;
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file_inode(file);
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case SEEK_SET:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case SEEK_CUR:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -194,7 +172,7 @@ static int msr_open(struct inode *inode, struct file *file)
*/
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
- .llseek = msr_seek,
+ .llseek = no_seek_end_llseek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 697f90db0e37..8a2cdd736fa4 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,6 +29,7 @@
#include <asm/mach_traps.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/reboot.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -231,7 +232,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
#endif
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
@@ -255,8 +256,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason, smp_processor_id());
show_regs(regs);
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
@@ -297,7 +306,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Do you have a strange power saving mode enabled?\n");
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
}
@@ -348,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs)
return;
}
- /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
- raw_spin_lock(&nmi_reason_lock);
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
reason = x86_platform.get_nmi_reason();
if (reason & NMI_REASON_MASK) {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c2130aef3f9d..f08ac28b8136 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -74,16 +74,6 @@ void __init default_banner(void)
/* Undefined instruction for dealing with missing ops pointers. */
static const unsigned char ud2a[] = { 0x0f, 0x0b };
-unsigned paravirt_patch_nop(void)
-{
- return 0;
-}
-
-unsigned paravirt_patch_ignore(unsigned len)
-{
- return len;
-}
-
struct branch {
unsigned char opcode;
u32 delta;
@@ -133,7 +123,6 @@ static void *get_call_destination(u8 type)
.pv_time_ops = pv_time_ops,
.pv_cpu_ops = pv_cpu_ops,
.pv_irq_ops = pv_irq_ops,
- .pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
#ifdef CONFIG_PARAVIRT_SPINLOCKS
.pv_lock_ops = pv_lock_ops,
@@ -152,8 +141,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
/* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
else if (opfunc == _paravirt_nop)
- /* If the operation is a nop, then nop the callsite */
- ret = paravirt_patch_nop();
+ ret = 0;
/* identity functions just return their single argument */
else if (opfunc == _paravirt_ident_32)
@@ -162,10 +150,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_64(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-#ifdef CONFIG_X86_32
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-#endif
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
@@ -220,8 +204,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret32(void);
extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
@@ -379,13 +361,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.load_sp0 = native_load_sp0,
-#if defined(CONFIG_X86_32)
- .irq_enable_sysexit = native_irq_enable_sysexit,
-#endif
#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
- .usergs_sysret32 = native_usergs_sysret32,
-#endif
.usergs_sysret64 = native_usergs_sysret64,
#endif
.iret = native_iret,
@@ -403,12 +379,6 @@ NOKPROBE_SYMBOL(native_get_debugreg);
NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
-struct pv_apic_ops pv_apic_ops = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .startup_ipi_hook = paravirt_nop,
-#endif
-};
-
#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
/* 32-bit pagetable entries */
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
@@ -444,9 +414,6 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pmd = native_set_pmd,
.set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
- .pmd_update = paravirt_nop,
- .pmd_update_defer = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
@@ -492,6 +459,5 @@ struct pv_mmu_ops pv_mmu_ops = {
EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL (pv_cpu_ops);
EXPORT_SYMBOL (pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
EXPORT_SYMBOL_GPL(pv_info);
EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index c89f50a76e97..158dc0650d5d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
@@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, restore_fl);
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa05583bc42..e70087a04cc8 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -13,9 +13,7 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
@@ -55,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
PATCH_SITE(pv_cpu_ops, swapgs);
PATCH_SITE(pv_mmu_ops, read_cr2);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0497f719977d..833b1d329c47 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -180,13 +180,13 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl);
static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
static void get_tce_space_from_tar(void);
-static struct cal_chipset_ops calgary_chip_ops = {
+static const struct cal_chipset_ops calgary_chip_ops = {
.handle_quirks = calgary_handle_quirks,
.tce_cache_blast = calgary_tce_cache_blast,
.dump_error_regs = calgary_dump_error_regs
};
-static struct cal_chipset_ops calioc2_chip_ops = {
+static const struct cal_chipset_ops calioc2_chip_ops = {
.handle_quirks = calioc2_handle_quirks,
.tce_cache_blast = calioc2_tce_cache_blast,
.dump_error_regs = calioc2_dump_error_regs
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cd99433b8ba1..6ba014c61d62 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
- if (flag & __GFP_WAIT) {
+ if (gfpflags_allow_blocking(flag)) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
if (page && page_to_phys(page) + size > dma_mask) {
dma_release_from_contiguous(dev, page, count);
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index adf0392d549a..7c577a178859 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -88,7 +88,7 @@ int __init pci_swiotlb_detect_4gb(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
return swiotlb;
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 4f00b63d7ff3..14415aff1813 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -4,10 +4,22 @@
*/
#include <linux/platform_device.h>
#include <linux/module.h>
+#include <linux/ioport.h>
+
+static int found(u64 start, u64 end, void *data)
+{
+ return 1;
+}
static __init int register_e820_pmem(void)
{
+ char *pmem = "Persistent Memory (legacy)";
struct platform_device *pdev;
+ int rc;
+
+ rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found);
+ if (rc <= 0)
+ return 0;
/*
* See drivers/nvdimm/e820.c for the implementation, this is
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 737527b40e5b..9f950917528b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
set_iopl_mask(next->iopl);
/*
- * If it were not for PREEMPT_ACTIVE we could guarantee that the
- * preempt_count of all tasks was equal here and this would not be
- * needed.
- */
- task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
- this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
- /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b35921a670b2..b9d99e0f82c4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -125,7 +125,7 @@ void release_thread(struct task_struct *dead_task)
if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
- dead_task->mm->context.ldt,
+ dead_task->mm->context.ldt->entries,
dead_task->mm->context.ldt->size);
BUG();
}
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch FS and GS.
*
- * These are even more complicated than FS and GS: they have
+ * These are even more complicated than DS and ES: they have
* 64-bit bases are that controlled by arch_prctl. Those bases
* only differ from the values in the GDT or LDT if the selector
* is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);
- /*
- * If it were not for PREEMPT_ACTIVE we could guarantee that the
- * preempt_count of all tasks was equal here and this would not be
- * needed.
- */
- task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
- this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 558f50edebca..32e9d9cbb884 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -124,21 +124,6 @@ const char *regs_query_register_name(unsigned int offset)
return NULL;
}
-static const int arg_offs_table[] = {
-#ifdef CONFIG_X86_32
- [0] = offsetof(struct pt_regs, ax),
- [1] = offsetof(struct pt_regs, dx),
- [2] = offsetof(struct pt_regs, cx)
-#else /* CONFIG_X86_64 */
- [0] = offsetof(struct pt_regs, di),
- [1] = offsetof(struct pt_regs, si),
- [2] = offsetof(struct pt_regs, dx),
- [3] = offsetof(struct pt_regs, cx),
- [4] = offsetof(struct pt_regs, r8),
- [5] = offsetof(struct pt_regs, r9)
-#endif
-};
-
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..99bfc025111d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-
-#ifdef CONFIG_X86_64
-/*
- * Initialize the generic pvclock vsyscall state. This will allocate
- * a/some page(s) for the per-vcpu pvclock information, set up a
- * fixmap mapping for the page(s)
- */
-
-int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
- int size)
-{
- int idx;
-
- WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
-
- for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
- __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
- __pa(i) + (idx*PAGE_SIZE),
- PAGE_KERNEL_VVAR);
- }
-
- return 0;
-}
-#endif
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 176a0f99d4da..cc457ff818ad 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -524,7 +524,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
*/
static void force_disable_hpet_msi(struct pci_dev *unused)
{
- hpet_msi_disable = 1;
+ hpet_msi_disable = true;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 02693dd9a079..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */
@@ -718,6 +726,7 @@ static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
@@ -780,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
smp_send_nmi_allbutself();
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
+
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
mdelay(1);
@@ -788,9 +800,35 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Leave the nmi callback set */
}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+}
+
#else /* !CONFIG_SMP */
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
/* No other CPUs to shoot down */
}
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
#endif
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index cd9685235df9..4af8d063fb36 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void)
}
#endif
+ if (paravirt_enabled() && !paravirt_has(RTC))
+ return -ENODEV;
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a3cccbfc5f77..d3d80e6d42a2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -111,6 +111,7 @@
#include <asm/mce.h>
#include <asm/alternative.h>
#include <asm/prom.h>
+#include <asm/microcode.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -480,34 +481,34 @@ static void __init memblock_x86_reserve_range_setup_data(void)
#ifdef CONFIG_KEXEC_CORE
+/* 16M alignment for crash kernel regions */
+#define CRASH_ALIGN (16 << 20)
+
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
* On 64bit, old kexec-tools need to under 896MiB.
*/
#ifdef CONFIG_X86_32
-# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20)
-# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20)
+# define CRASH_ADDR_LOW_MAX (512 << 20)
+# define CRASH_ADDR_HIGH_MAX (512 << 20)
#else
-# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20)
-# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM
+# define CRASH_ADDR_LOW_MAX (896UL << 20)
+# define CRASH_ADDR_HIGH_MAX MAXMEM
#endif
-static void __init reserve_crashkernel_low(void)
+static int __init reserve_crashkernel_low(void)
{
#ifdef CONFIG_X86_64
- const unsigned long long alignment = 16<<20; /* 16M */
- unsigned long long low_base = 0, low_size = 0;
+ unsigned long long base, low_base = 0, low_size = 0;
unsigned long total_low_mem;
- unsigned long long base;
- bool auto_set = false;
int ret;
- total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+ total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT));
+
/* crashkernel=Y,low */
- ret = parse_crashkernel_low(boot_command_line, total_low_mem,
- &low_size, &base);
- if (ret != 0) {
+ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base);
+ if (ret) {
/*
* two parts from lib/swiotlb.c:
* -swiotlb size: user-specified with swiotlb= or default.
@@ -517,52 +518,52 @@ static void __init reserve_crashkernel_low(void)
* make sure we allocate enough extra low memory so that we
* don't run out of DMA buffers for 32-bit devices.
*/
- low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20);
- auto_set = true;
+ low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
} else {
/* passed with crashkernel=0,low ? */
if (!low_size)
- return;
+ return 0;
}
- low_base = memblock_find_in_range(low_size, (1ULL<<32),
- low_size, alignment);
-
+ low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
- if (!auto_set)
- pr_info("crashkernel low reservation failed - No suitable area found.\n");
+ pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
+ (unsigned long)(low_size >> 20));
+ return -ENOMEM;
+ }
- return;
+ ret = memblock_reserve(low_base, low_size);
+ if (ret) {
+ pr_err("%s: Error reserving crashkernel low memblock.\n", __func__);
+ return ret;
}
- memblock_reserve(low_base, low_size);
pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
- (unsigned long)(low_size >> 20),
- (unsigned long)(low_base >> 20),
- (unsigned long)(total_low_mem >> 20));
+ (unsigned long)(low_size >> 20),
+ (unsigned long)(low_base >> 20),
+ (unsigned long)(total_low_mem >> 20));
+
crashk_low_res.start = low_base;
crashk_low_res.end = low_base + low_size - 1;
insert_resource(&iomem_resource, &crashk_low_res);
#endif
+ return 0;
}
static void __init reserve_crashkernel(void)
{
- const unsigned long long alignment = 16<<20; /* 16M */
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
+ unsigned long long crash_size, crash_base, total_mem;
bool high = false;
int ret;
total_mem = memblock_phys_mem_size();
/* crashkernel=XM */
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
+ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
if (ret != 0 || crash_size <= 0) {
/* crashkernel=X,high */
ret = parse_crashkernel_high(boot_command_line, total_mem,
- &crash_size, &crash_base);
+ &crash_size, &crash_base);
if (ret != 0 || crash_size <= 0)
return;
high = true;
@@ -573,11 +574,10 @@ static void __init reserve_crashkernel(void)
/*
* kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
- crash_base = memblock_find_in_range(alignment,
- high ? CRASH_KERNEL_ADDR_HIGH_MAX :
- CRASH_KERNEL_ADDR_LOW_MAX,
- crash_size, alignment);
-
+ crash_base = memblock_find_in_range(CRASH_ALIGN,
+ high ? CRASH_ADDR_HIGH_MAX
+ : CRASH_ADDR_LOW_MAX,
+ crash_size, CRASH_ALIGN);
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
@@ -587,26 +587,32 @@ static void __init reserve_crashkernel(void)
unsigned long long start;
start = memblock_find_in_range(crash_base,
- crash_base + crash_size, crash_size, 1<<20);
+ crash_base + crash_size,
+ crash_size, 1 << 20);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
- memblock_reserve(crash_base, crash_size);
+ ret = memblock_reserve(crash_base, crash_size);
+ if (ret) {
+ pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
+ return;
+ }
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
+ if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
+ memblock_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
-
- if (crash_base >= (1ULL<<32))
- reserve_crashkernel_low();
}
#else
static void __init reserve_crashkernel(void)
@@ -1042,6 +1048,8 @@ void __init setup_arch(char **cmdline_p)
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820_end_of_ram_pfn();
+ max_possible_pfn = max_pfn;
+
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
@@ -1079,8 +1087,10 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
- if (efi_enabled(EFI_BOOT))
+ if (efi_enabled(EFI_BOOT)) {
+ efi_fake_memmap();
efi_find_mirror();
+ }
/*
* The EFI specification says that boot service code won't be called
@@ -1180,7 +1190,7 @@ void __init setup_arch(char **cmdline_p)
*/
clone_pgd_range(initial_page_table,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
+ min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
tboot_probe();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index da52e6bb5c7f..cb6282c3638f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -63,6 +63,7 @@
int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
{
+ unsigned long buf_val;
void __user *buf;
unsigned int tmpflags;
unsigned int err = 0;
@@ -107,7 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
- get_user_ex(buf, &sc->fpstate);
+ get_user_ex(buf_val, &sc->fpstate);
+ buf = (void __user *)buf_val;
} get_user_catch(err);
err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
@@ -196,7 +198,7 @@ static unsigned long align_sigframe(unsigned long sp)
return sp;
}
-static inline void __user *
+static void __user *
get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
@@ -299,7 +301,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
if (current->mm->context.vdso)
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ vdso_image_32.sym___kernel_sigreturn;
else
restorer = &frame->retcode;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
@@ -363,7 +365,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_rt_sigreturn;
+ vdso_image_32.sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
@@ -688,12 +690,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
signal_setup_done(failed, ksig, stepping);
}
-#ifdef CONFIG_X86_32
-#define NR_restart_syscall __NR_restart_syscall
-#else /* !CONFIG_X86_32 */
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-#endif /* CONFIG_X86_32 */
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64)
+ return __NR_restart_syscall;
+#else /* !CONFIG_X86_32 && CONFIG_X86_64 */
+ return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall :
+ __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */
+}
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
@@ -722,7 +727,7 @@ void do_signal(struct pt_regs *regs)
break;
case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
+ regs->ax = get_nr_restart_syscall(regs);
regs->ip -= 2;
break;
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 12c8286206ce..658777cf3851 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -125,12 +125,12 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ apic->send_IPI(cpu, RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
}
void native_send_call_func_ipi(const struct cpumask *mask)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 892ee2e5ecbc..24d57f77b3c1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -304,7 +304,7 @@ do { \
static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (cpu_has_topoext) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
if (c->phys_proc_id == o->phys_proc_id &&
@@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid)
*/
#define UDELAY_10MS_DEFAULT 10000
-static unsigned int init_udelay = INT_MAX;
+static unsigned int init_udelay = UINT_MAX;
static int __init cpu_init_udelay(char *str)
{
@@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay);
static void __init smp_quirk_init_udelay(void)
{
/* if cmdline changed it from default, leave it alone */
- if (init_udelay != INT_MAX)
+ if (init_udelay != UINT_MAX)
return;
/* if modern processor, use no delay */
if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
- ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+ ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
init_udelay = 0;
-
+ return;
+ }
/* else, use legacy delay */
init_udelay = UDELAY_10MS_DEFAULT;
}
@@ -629,13 +630,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
num_starts = 0;
/*
- * Paravirt / VMI wants a startup IPI hook here to set up the
- * target processor state.
- */
- startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- stack_start);
-
- /*
* Run STARTUP IPI loop.
*/
pr_debug("#startup loops: %d\n", num_starts);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 346eec73f7db..ade185a46b1d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
- const struct bndcsr *bndcsr;
+ const struct mpx_bndcsr *bndcsr;
siginfo_t *info;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
* which is all zeros which indicates MPX was not
* responsible for the exception.
*/
- bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
if (!bndcsr)
goto exit_trap;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c3f7602cd038..3d743da828d3 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -168,21 +168,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
- * into a shift.
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
*
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * We can use khz divisor instead of mhz to keep a better precision.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
static void cyc2ns_data_init(struct cyc2ns_data *data)
{
data->cyc2ns_mul = 0;
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
data->__count = 0;
}
@@ -216,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
if (likely(data == tail)) {
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
} else {
data->__count++;
barrier();
ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
barrier();
@@ -257,12 +256,22 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul =
- DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
- cpu_khz);
- data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+ NSEC_PER_MSEC, 0);
+
+ /*
+ * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
+ * not expected to be greater than 31 due to the original published
+ * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
+ * value) - refer perf_event_mmap_page documentation in perf_event.h.
+ */
+ if (data->cyc2ns_shift == 32) {
+ data->cyc2ns_shift = 31;
+ data->cyc2ns_mul >>= 1;
+ }
+
data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
cyc2ns_write_end(cpu, data);
@@ -1176,8 +1185,6 @@ void __init tsc_init(void)
u64 lpj;
int cpu;
- x86_init.timers.tsc_pre_init();
-
if (!cpu_has_tsc) {
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index b9242bacbe59..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -34,10 +34,11 @@
#include <asm/msr-index.h>
verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+#ifndef __x86_64__
pushfl # standard way to check for cpuid
popl %eax
movl %eax,%ebx
@@ -47,30 +48,31 @@ verify_cpu:
pushfl
popl %eax
cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
+#endif
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
+ jb .Lverify_cpu_no_longmode # no cpuid 1
xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
mov $1,%di # cpu is from AMD
- jmp verify_cpu_check
+ jmp .Lverify_cpu_check
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
cmpl $0x756e6547,%ebx # GenuineIntel?
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x49656e69,%edx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x6c65746e,%ecx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
# only call IA32_MISC_ENABLE when:
# family > 6 || (family == 6 && model >= 0xd)
@@ -81,59 +83,59 @@ verify_cpu_noamd:
andl $0x0ff00f00, %eax # mask family and extended family
shrl $8, %eax
cmpl $6, %eax
- ja verify_cpu_clear_xd # family > 6, ok
- jb verify_cpu_check # family < 6, skip
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
andl $0x000f00f0, %ecx # mask model and extended model
shrl $4, %ecx
cmpl $0xd, %ecx
- jb verify_cpu_check # family == 6, model < 0xd, skip
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
movl $MSR_IA32_MISC_ENABLE, %ecx
rdmsr
btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
- jnc verify_cpu_check # only write MSR if bit was changed
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
wrmsr
-verify_cpu_check:
+.Lverify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
+ jb .Lverify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
+ je .Lverify_cpu_sse_ok
test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
+ jmp .Lverify_cpu_sse_test # try again
-verify_cpu_no_longmode:
- popfl # Restore caller passed flags
+.Lverify_cpu_no_longmode:
+ popf # Restore caller passed flags
movl $1,%eax
ret
-verify_cpu_sse_ok:
- popfl # Restore caller passed flags
+.Lverify_cpu_sse_ok:
+ popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 524619351961..e574b8546518 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+ if (pmd_trans_huge(*pmd)) {
+ struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -357,8 +361,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
tsk->thread.sp0 += 16;
- if (cpu_has_sep)
+
+ if (static_cpu_has_safe(X86_FEATURE_SEP))
tsk->thread.sysenter_cs = 0;
+
load_sp0(tss, &tsk->thread);
put_cpu();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 3839628d962e..dad5fe9633a3 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -68,7 +68,6 @@ struct x86_init_ops x86_init __initdata = {
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
- .tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
.wallclock_init = x86_init_noop,
},
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8a1d56276e1..639a6e34500c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
select ANON_INODES
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index d090ecf08809..9dc091acd5fb 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
+#include "trace/events/kvm.h"
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#ifdef __KVM_HAVE_MSI
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ * Other values - No need to retry.
+ */
+static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
+ int level)
+{
+ struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -EINVAL;
+ int idx;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /*
+ * Injection into either PIC or IOAPIC might need to scan all CPUs,
+ * which would need to be retried from thread context; when same GSI
+ * is connected to both PIC and IOAPIC, we'd have to report a
+ * partial failure here.
+ * Since there's no easy way to do this, we only support injecting MSI
+ * which is limited to 1:1 GSI mapping.
+ */
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+ e = &entries[0];
+ ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
+ irq, level);
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
return IRQ_HANDLED;
}
-#endif
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
return 0;
}
-#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
return 0;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
@@ -443,8 +473,6 @@ err:
return r;
}
-#endif
-
static int assigned_device_enable_guest_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
return 0;
}
-#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
-#endif
-#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
dev->ack_notifier.gsi = -1;
return 0;
}
-#endif
static int assign_host_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
case KVM_DEV_IRQ_HOST_INTX:
r = assigned_device_enable_host_intx(kvm, dev);
break;
-#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_HOST_MSI:
r = assigned_device_enable_host_msi(kvm, dev);
break;
-#endif
-#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_HOST_MSIX:
r = assigned_device_enable_host_msix(kvm, dev);
break;
-#endif
default:
r = -EINVAL;
}
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
case KVM_DEV_IRQ_GUEST_INTX:
r = assigned_device_enable_guest_intx(kvm, dev, irq);
break;
-#ifdef __KVM_HAVE_MSI
case KVM_DEV_IRQ_GUEST_MSI:
r = assigned_device_enable_guest_msi(kvm, dev, irq);
break;
-#endif
-#ifdef __KVM_HAVE_MSIX
case KVM_DEV_IRQ_GUEST_MSIX:
r = assigned_device_enable_guest_msix(kvm, dev, irq);
break;
-#endif
default:
r = -EINVAL;
}
@@ -826,7 +842,6 @@ out:
}
-#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
struct kvm_assigned_msix_nr *entry_nr)
{
@@ -906,7 +921,6 @@ msix_entry_out:
return r;
}
-#endif
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
-#ifdef __KVM_HAVE_MSIX
case KVM_ASSIGN_SET_MSIX_NR: {
struct kvm_assigned_msix_nr entry_nr;
r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
-#endif
case KVM_ASSIGN_SET_INTX_MASK: {
struct kvm_assigned_pci_dev assigned_dev;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2fbea2544f24..6525e926f566 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -30,7 +30,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
- xstate_bv &= XSTATE_EXTEND_MASK;
+ xstate_bv &= XFEATURE_MASK_EXTEND;
while (xstate_bv) {
if (xstate_bv & 0x1) {
u32 eax, ebx, ecx, edx, offset;
@@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void)
u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
if (!kvm_x86_ops->mpx_supported())
- xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+ xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
return xcr0;
}
@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD);
+ F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
const u32 kvm_supported_word10_x86_features =
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9cef6ae..c8eda1498121 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -2,6 +2,7 @@
#define ARCH_X86_KVM_CPUID_H
#include "x86.h"
+#include <asm/cpu.h>
int kvm_update_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
@@ -38,6 +39,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->edx & bit(X86_FEATURE_MTRR));
+}
+
static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -133,4 +142,74 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_MPX));
}
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
+
+/*
+ * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+ */
+#define BIT_NRIPS 3
+
+static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
+
+ /*
+ * NRIPS is a scattered cpuid feature, so we can't use
+ * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
+ * position 8, not 3).
+ */
+ return best && (best->edx & bit(BIT_NRIPS));
+}
+#undef BIT_NRIPS
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9da95b9daf8d..1505587d06e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
#define GET_SMSTATE(type, smbase, offset) \
({ \
type __val; \
- int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
- sizeof(__val), NULL); \
+ int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
+ sizeof(__val)); \
if (r != X86EMUL_CONTINUE) \
return X86EMUL_UNHANDLEABLE; \
__val; \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
/*
* Get back to real mode, to prepare a safe state in which to load
- * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
- * to read_std/write_std are not virtual.
- *
- * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
*/
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (emulator_has_longmode(ctxt)) {
+ struct desc_struct cs_desc;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ if (cr4 & X86_CR4_PCIDE) {
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ cr4 &= ~X86_CR4_PCIDE;
+ }
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.p = 1;
+ ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+ }
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
cr0 = ctxt->ops->get_cr(ctxt, 0);
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
- cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
if (cr4 & X86_CR4_PAE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
efer = 0;
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index a8160d2ae362..c58ba67175ac 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,13 +23,665 @@
#include "x86.h"
#include "lapic.h"
+#include "ioapic.h"
#include "hyperv.h"
#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/apicdef.h>
#include <trace/events/kvm.h>
#include "trace.h"
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ if (vector < 16 && !host)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+
+ atomic64_set(&synic->sint[sint], data);
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ if (vcpu_id >= atomic_read(&kvm->online_vcpus))
+ return NULL;
+ vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ if (!vcpu)
+ return NULL;
+ synic = vcpu_to_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+ u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *msg;
+ struct hv_message_page *msg_page;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+ gpa);
+ return;
+ }
+ msg_page = kmap_atomic(page);
+
+ msg = &msg_page->sint_message[sint];
+ msg->header.message_flags.msg_pending = 0;
+
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx, stimers_pending;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+ if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+ synic_clear_sint_msg_pending(synic, sint);
+
+ /* Try to deliver pending Hyper-V SynIC timers messages */
+ stimers_pending = 0;
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending &&
+ (stimer->config & HV_STIMER_ENABLE) &&
+ HV_STIMER_SINT(stimer->config) == sint) {
+ set_bit(stimer->index,
+ hv_vcpu->stimer_pending_bitmap);
+ stimers_pending++;
+ }
+ }
+ if (stimers_pending)
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active)
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if (data & HV_SYNIC_SIEFP_ENABLE)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if (data & HV_SYNIC_SIMP_ENABLE)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+{
+ int ret;
+
+ if (!synic->active)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vcpu_id);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vcpu_id);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config & HV_STIMER_PERIODIC) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4(15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+ config &= ~HV_STIMER_ENABLE;
+ stimer->config = config;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (stimer->count == 0)
+ stimer->config &= ~HV_STIMER_ENABLE;
+ else if (stimer->config & HV_STIMER_AUTOENABLE)
+ stimer->config |= HV_STIMER_ENABLE;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *dst_msg;
+ int r;
+ struct hv_message_page *msg_page;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return -EFAULT;
+
+ msg_page = kmap_atomic(page);
+ dst_msg = &msg_page->sint_message[sint];
+ if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+ src_msg->header.message_type) != HVMSG_NONE) {
+ dst_msg->header.message_flags.msg_pending = 1;
+ r = -EAGAIN;
+ } else {
+ memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+ src_msg->header.payload_size);
+ dst_msg->header.message_type = src_msg->header.message_type;
+ dst_msg->header.payload_size = src_msg->header.payload_size;
+ r = synic_set_irq(synic, sint);
+ if (r >= 1)
+ r = 0;
+ else if (r == 0)
+ r = -EFAULT;
+ }
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(vcpu_to_synic(vcpu),
+ HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r;
+
+ stimer->msg_pending = true;
+ r = stimer_send_msg(stimer);
+ trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config & HV_STIMER_PERIODIC))
+ stimer->config &= ~HV_STIMER_ENABLE;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config & HV_STIMER_ENABLE) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config & HV_STIMER_ENABLE) &&
+ stimer->count)
+ stimer_start(stimer);
+ else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer->timer.function = stimer_timer_callback;
+ stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Hyper-V SynIC auto EOI SINT's are
+ * not compatible with APICV, so deactivate APICV
+ */
+ kvm_vcpu_deactivate_apicv(vcpu);
+ vcpu_to_synic(vcpu)->active = true;
+ return 0;
+}
+
static bool kvm_hv_msr_partition_wide(u32 msr)
{
bool r = false;
@@ -41,6 +693,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_TIME_REF_COUNT:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
r = true;
break;
}
@@ -163,6 +816,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
data);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -171,7 +830,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
return 0;
}
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ cputime_t utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+ return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -205,6 +873,36 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -227,11 +925,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_HYPERCALL:
data = hv->hv_hypercall;
break;
- case HV_X64_MSR_TIME_REF_COUNT: {
- data =
- div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
break;
- }
case HV_X64_MSR_REFERENCE_TSC:
data = hv->hv_tsc_page;
break;
@@ -241,6 +937,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
pdata);
case HV_X64_MSR_CRASH_CTL:
return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -277,6 +976,34 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_APIC_ASSIST_PAGE:
data = hv->hv_vapic;
break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -295,7 +1022,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return kvm_hv_set_msr(vcpu, msr, data);
+ return kvm_hv_set_msr(vcpu, msr, data, host);
}
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index c7bce559f67b..60eccd4bd1d3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,9 +24,64 @@
#ifndef __ARCH_X86_KVM_HYPERV_H__
#define __ARCH_X86_KVM_HYPERV_H__
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+ struct kvm_vcpu_arch *arch;
+
+ arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+ return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+
bool kvm_hv_hypercall_enabled(struct kvm *kvm);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index f90952f64e79..b0ea42b78ccd 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -35,6 +35,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
#include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
s64 interval;
- if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -418,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s
u8 saved_mode;
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
pit_load_count(kvm, channel, val);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f79105bb5..1facfd60b04a 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
}
-static void update_handled_vectors(struct kvm_ioapic *ioapic)
-{
- DECLARE_BITMAP(handled_vectors, 256);
- int i;
-
- memset(handled_vectors, 0, sizeof(handled_vectors));
- for (i = 0; i < IOAPIC_NUM_PINS; ++i)
- __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
- memcpy(ioapic->handled_vectors, handled_vectors,
- sizeof(handled_vectors));
- smp_wmb();
-}
-
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
- e->fields.dest_id, e->fields.dest_mode)) {
+ e->fields.dest_id, e->fields.dest_mode) ||
+ (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
+ kvm_apic_pending_eoi(vcpu, e->fields.vector)))
__set_bit(e->fields.vector,
- (unsigned long *)eoi_exit_bitmap);
- if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- __set_bit(e->fields.vector,
- (unsigned long *)tmr);
- }
+ ioapic_handled_vectors);
}
}
spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
e->bits |= (u32) val;
e->fields.remote_irr = 0;
}
- update_handled_vectors(ioapic);
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->id = 0;
memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
rtc_irq_eoi_tracking_reset(ioapic);
- update_handled_vectors(ioapic);
}
static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
if (ret < 0) {
kvm->arch.vioapic = NULL;
kfree(ioapic);
+ return ret;
}
+ kvm_vcpu_request_scan_ioapic(kvm);
return ret;
}
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
ioapic->irr = 0;
ioapic->irr_delivered = 0;
- update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4e6256..2d16dc251d81 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -9,6 +9,7 @@ struct kvm;
struct kvm_vcpu;
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
- DECLARE_BITMAP(handled_vectors, 256);
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
return kvm->arch.vioapic;
}
-static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+static inline int ioapic_in_kernel(struct kvm *kvm)
{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
+ int ret;
+
+ ret = (ioapic_irqchip(kvm) != NULL);
+ return ret;
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
- u32 *tmr);
-
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
#endif
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 5c520ebf6343..a22a488b4622 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
unsigned long npages)
{
gfn_t end_gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + npages;
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
return pfn;
}
-static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
+ unsigned long npages)
{
unsigned long i;
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
gfn_t gfn, end_gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
int r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
int flags;
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
{
struct iommu_domain *domain;
gfn_t end_gfn, gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
u64 phys;
domain = kvm->arch.iommu_domain;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a1ec6a50a05a..3982b479bb5f 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
* check if there is pending interrupt from
* non-APIC source without intack.
*/
static int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
- if (kvm_apic_accept_pic_intr(v))
- return pic_irqchip(v->kvm)->output; /* PIC */
- else
+ u8 accept = kvm_apic_accept_pic_intr(v);
+
+ if (accept) {
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return pic_irqchip(v->kvm)->output;
+ } else
return 0;
}
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
return 1;
- if (kvm_apic_vid_enabled(v->kvm))
+ if (kvm_vcpu_apicv_active(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.pending;
if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v))
- return kvm_pic_read_irq(v->kvm); /* PIC */
- return -1;
+ if (kvm_cpu_has_extint(v)) {
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+ } else
+ return -1;
}
/*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
int vector;
- if (!irqchip_in_kernel(v->kvm))
+ if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
vector = kvm_cpu_get_extint(v);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 3d782a2c336a..ae5c78f2337d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
return kvm->arch.vpic;
}
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ int ret;
+
+ ret = (pic_irqchip(kvm) != NULL);
+ return ret;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ return kvm->arch.irqchip_split;
+}
+
static inline int irqchip_in_kernel(struct kvm *kvm)
{
struct kvm_pic *vpic = pic_irqchip(kvm);
+ bool ret;
+
+ ret = (vpic != NULL);
+ ret |= irqchip_split(kvm);
/* Read vpic before kvm->irq_routing. */
smp_rmb();
- return vpic != NULL;
+ return ret;
+}
+
+static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ /* Same as irqchip_in_kernel(vcpu->kvm), but with less
+ * pointer chasing and no unnecessary memory barriers.
+ */
+ return vcpu->arch.apic != NULL;
}
void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 9efff9e5b58c..8fc89efb5250 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -33,6 +33,8 @@
#include "lapic.h"
+#include "hyperv.h"
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -91,8 +93,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
{
trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
@@ -108,6 +110,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->level = 1;
irq->shorthand = 0;
}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +126,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
}
-static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm)
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
{
struct kvm_lapic_irq irq;
int r;
+ if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
+ return -EWOULDBLOCK;
+
kvm_set_msi_irq(e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +144,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
return -EWOULDBLOCK;
}
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- * Other values - No need to retry.
- */
-int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
-{
- struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
- struct kvm_kernel_irq_routing_entry *e;
- int ret = -EINVAL;
- int idx;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- /*
- * Injection into either PIC or IOAPIC might need to scan all CPUs,
- * which would need to be retried from thread context; when same GSI
- * is connected to both PIC and IOAPIC, we'd have to report a
- * partial failure here.
- * Since there's no easy way to do this, we only support injecting MSI
- * which is limited to 1:1 GSI mapping.
- */
- idx = srcu_read_lock(&kvm->irq_srcu);
- if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
- e = &entries[0];
- if (likely(e->type == KVM_IRQ_ROUTING_MSI))
- ret = kvm_set_msi_inatomic(e, kvm);
- else
- ret = -EWOULDBLOCK;
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
int kvm_request_irq_source_id(struct kvm *kvm)
{
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +179,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_in_kernel(kvm))
+ if (!ioapic_in_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -250,6 +221,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
srcu_read_unlock(&kvm->irq_srcu, idx);
}
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -1;
+
+ return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
@@ -288,6 +269,11 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
break;
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_set_sint;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
default:
goto out;
}
@@ -297,6 +283,33 @@ out:
return r;
}
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +341,72 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+ if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ /* kvm->irq_routing must be read after clearing
+ * KVM_SCAN_IOAPIC. */
+ smp_mb();
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ u32 dest_id, dest_mode;
+ bool level;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ dest_id = (entry->msi.address_lo >> 12) & 0xff;
+ dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+ level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
+ if (level && kvm_apic_match_dest(vcpu, NULL, 0,
+ dest_id, dest_mode)) {
+ u32 vector = entry->msi.data & 0xff;
+
+ __set_bit(vector,
+ ioapic_handled_vectors);
+ }
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ switch (irq->type) {
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_set_sint(irq, kvm, irq_source_id, level,
+ line_status);
+ default:
+ return -EWOULDBLOCK;
+ }
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ kvm_hv_irq_routing_update(kvm);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8d9013c5e1ee..36591faed13b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "x86.h"
#include "cpuid.h"
+#include "hyperv.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -128,11 +129,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
- return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
/* The logical map is definitely wrong if we have multiple
* modes at the same time. (Physical map is always right.)
*/
@@ -209,7 +205,7 @@ out:
if (old)
kfree_rcu(old, rcu);
- kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_make_scan_ioapic_request(kvm);
}
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -348,6 +344,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
struct kvm_lapic *apic = vcpu->arch.apic;
__kvm_apic_update_irr(pir, apic->regs);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@@ -377,7 +375,8 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
if (!apic->irr_pending)
return -1;
- kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ if (apic->vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
@@ -390,7 +389,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
+ if (unlikely(vcpu->arch.apicv_active)) {
/* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -416,7 +415,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ if (unlikely(vcpu->arch.apicv_active))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
else {
++apic->isr_count;
@@ -464,7 +463,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ if (unlikely(vcpu->arch.apicv_active))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
else {
@@ -551,15 +550,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
- int i;
-
- for (i = 0; i < 8; i++)
- apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
-}
-
static void apic_update_ppr(struct kvm_lapic *apic)
{
u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +754,65 @@ out:
return ret;
}
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ bool ret = false;
+ struct kvm_lapic *dst = NULL;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (!map)
+ goto out;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id == 0xFF)
+ goto out;
+
+ if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+ goto out;
+
+ dst = map->phys_map[irq->dest_id];
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ } else {
+ u16 cid;
+ unsigned long bitmap = 1;
+ int i, r = 0;
+
+ if (!kvm_apic_logical_map_valid(map))
+ goto out;
+
+ apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+
+ if (cid >= ARRAY_SIZE(map->logical_map))
+ goto out;
+
+ for_each_set_bit(i, &bitmap, 16) {
+ dst = map->logical_map[cid][i];
+ if (++r == 2)
+ goto out;
+ }
+
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ }
+
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Add a pending IRQ into lapic.
* Return 1 if successfully added and 0 if discarded.
@@ -781,6 +830,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
@@ -790,7 +842,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (dest_map)
__set_bit(vcpu->vcpu_id, dest_map);
- if (kvm_x86_ops->deliver_posted_interrupt)
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+ }
+
+ if (vcpu->arch.apicv_active)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
apic_set_irr(vector, apic);
@@ -868,16 +927,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
+}
+
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
- int trigger_mode;
- if (apic_test_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+ int trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
}
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -896,6 +971,9 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
+ if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
kvm_ioapic_send_eoi(apic, vector);
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
return vector;
@@ -1147,7 +1225,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
int vec = reg & APIC_VECTOR_MASK;
void *bitmap = apic->regs + APIC_ISR;
- if (kvm_x86_ops->deliver_posted_interrupt)
+ if (vcpu->arch.apicv_active)
bitmap = apic->regs + APIC_IRR;
if (apic_test_vector(vec, bitmap))
@@ -1172,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1240,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
@@ -1615,8 +1693,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
- apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
+ apic->irr_pending = vcpu->arch.apicv_active;
+ apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -1805,6 +1883,12 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
+
+ if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ apic_clear_isr(vector, apic);
+ apic_update_ppr(apic);
+ }
+
return vector;
}
@@ -1828,17 +1912,20 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
- apic->isr_count = kvm_x86_ops->hwapic_isr_update ?
+ apic->isr_count = vcpu->arch.apicv_active ?
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
- if (kvm_x86_ops->hwapic_irr_update)
+ if (vcpu->arch.apicv_active) {
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
+ }
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_rtc_eoi_tracking_restore_one(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+ vcpu->arch.apic_arb_prio = 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2009,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
- kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
@@ -1978,7 +2065,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_ICR2)
@@ -1995,7 +2082,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
if (reg == APIC_DFR || reg == APIC_ICR2) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 764037991d26..41bdb35b4b67 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
-static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->vm_has_apicv(kvm);
+ return vcpu->arch.apic && vcpu->arch.apicv_active;
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -165,8 +164,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+ return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff606f507913..95a955de5964 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte)
}
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- pfn_t pfn, unsigned access)
+ kvm_pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -311,11 +311,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_rmap_spte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
static int is_last_spte(u64 pte, int level)
{
if (level == PT_PAGE_TABLE_LEVEL)
@@ -325,7 +320,7 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
-static pfn_t spte_to_pfn(u64 pte)
+static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
@@ -540,7 +535,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
bool ret = false;
- WARN_ON(!is_rmap_spte(new_spte));
+ WARN_ON(!is_shadow_present_pte(new_spte));
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -587,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
u64 old_spte = *sptep;
if (!spte_has_volatile_bits(old_spte))
@@ -595,7 +590,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
- if (!is_rmap_spte(old_spte))
+ if (!is_shadow_present_pte(old_spte))
return 0;
pfn = spte_to_pfn(old_spte);
@@ -818,14 +813,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm->arch.indirect_shadow_pages--;
}
-static int has_wrprotected_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- int level)
+static int __has_wrprotected_page(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
- struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -834,6 +826,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
return 1;
}
+static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return __has_wrprotected_page(gfn, level, slot);
+}
+
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
unsigned long page_size;
@@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+ bool no_dirty_log)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return false;
+
+ return true;
+}
+
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@@ -858,21 +869,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
- (no_dirty_log && slot->dirty_bitmap))
+ if (!memslot_valid_for_gpte(slot, no_dirty_log))
slot = NULL;
return slot;
}
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+ bool *force_pt_level)
{
int host_level, level, max_level;
+ struct kvm_memory_slot *slot;
+
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+ *force_pt_level = !memslot_valid_for_gpte(slot, true);
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -882,43 +897,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu, large_gfn, level))
+ if (__has_wrprotected_page(large_gfn, level, slot))
break;
return level - 1;
}
/*
- * Pte mapping structures:
- *
- * If pte_list bit zero is zero, then pte_list point to the spte.
+ * About rmap_head encoding:
*
- * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
- *
- * Returns the number of pte entries before the spte was added or zero if
- * the spte was not added.
- *
+ */
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
*/
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
- unsigned long *pte_list)
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i, count = 0;
- if (!*pte_list) {
+ if (!rmap_head->val) {
rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
- *pte_list = (unsigned long)spte;
- } else if (!(*pte_list & 1)) {
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
- desc->sptes[0] = (u64 *)*pte_list;
+ desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
- *pte_list = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
count += PTE_LIST_EXT;
@@ -935,8 +949,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
}
static void
-pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
- int i, struct pte_list_desc *prev_desc)
+pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i,
+ struct pte_list_desc *prev_desc)
{
int j;
@@ -947,43 +962,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *pte_list = (unsigned long)desc->sptes[0];
+ rmap_head->val = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
- *pte_list = (unsigned long)desc->more | 1;
+ rmap_head->val = (unsigned long)desc->more | 1;
mmu_free_pte_list_desc(desc);
}
-static void pte_list_remove(u64 *spte, unsigned long *pte_list)
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
struct pte_list_desc *prev_desc;
int i;
- if (!*pte_list) {
+ if (!rmap_head->val) {
printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
BUG();
- } else if (!(*pte_list & 1)) {
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_remove: %p 1->0\n", spte);
- if ((u64 *)*pte_list != spte) {
+ if ((u64 *)rmap_head->val != spte) {
printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
BUG();
}
- *pte_list = 0;
+ rmap_head->val = 0;
} else {
rmap_printk("pte_list_remove: %p many->many\n", spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(pte_list,
- desc, i,
- prev_desc);
+ pte_list_desc_remove_entry(rmap_head,
+ desc, i, prev_desc);
return;
}
+ }
prev_desc = desc;
desc = desc->more;
}
@@ -992,28 +1007,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
}
}
-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
-{
- struct pte_list_desc *desc;
- int i;
-
- if (!*pte_list)
- return;
-
- if (!(*pte_list & 1))
- return fn((u64 *)*pte_list);
-
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
- fn(desc->sptes[i]);
- desc = desc->more;
- }
-}
-
-static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
unsigned long idx;
@@ -1021,10 +1016,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
+static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+ struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -1045,24 +1038,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct kvm_mmu_page *sp;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
- return pte_list_add(vcpu, spte, rmapp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ return pte_list_add(vcpu, spte, rmap_head);
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp);
- pte_list_remove(spte, rmapp);
+ rmap_head = gfn_to_rmap(kvm, gfn, sp);
+ pte_list_remove(spte, rmap_head);
}
/*
@@ -1082,19 +1075,26 @@ struct rmap_iterator {
*
* Returns sptep if found, NULL otherwise.
*/
-static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
{
- if (!rmap)
+ u64 *sptep;
+
+ if (!rmap_head->val)
return NULL;
- if (!(rmap & 1)) {
+ if (!(rmap_head->val & 1)) {
iter->desc = NULL;
- return (u64 *)rmap;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
iter->pos = 0;
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
/*
@@ -1104,14 +1104,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
+ u64 *sptep;
+
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
- u64 *sptep;
-
++iter->pos;
sptep = iter->desc->sptes[iter->pos];
if (sptep)
- return sptep;
+ goto out;
}
iter->desc = iter->desc->more;
@@ -1119,17 +1119,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
}
}
return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
-#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
- _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
- _spte_ = rmap_get_next(_iter_))
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1187,14 +1190,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
return flush;
@@ -1211,13 +1215,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
return flush;
@@ -1234,13 +1238,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
return flush;
@@ -1260,12 +1264,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmapp, false);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_write_protect(kvm, rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1285,12 +1289,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_clear_dirty(kvm, rmapp);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmap_head);
/* clear the first set bit */
mask &= mask - 1;
@@ -1322,28 +1326,27 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
}
return write_protected;
}
-static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- while ((sptep = rmap_get_first(*rmapp, &iter))) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ while ((sptep = rmap_get_first(rmap_head, &iter))) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
@@ -1353,14 +1356,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmapp);
+ return kvm_zap_rmapp(kvm, rmap_head);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1369,13 +1372,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
int need_flush = 0;
u64 new_spte;
pte_t *ptep = (pte_t *)data;
- pfn_t new_pfn;
+ kvm_pfn_t new_pfn;
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1413,11 +1416,11 @@ struct slot_rmap_walk_iterator {
/* output fields. */
gfn_t gfn;
- unsigned long *rmap;
+ struct kvm_rmap_head *rmap;
int level;
/* private field. */
- unsigned long *end_rmap;
+ struct kvm_rmap_head *end_rmap;
};
static void
@@ -1476,7 +1479,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long end,
unsigned long data,
int (*handler)(struct kvm *kvm,
- unsigned long *rmapp,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn,
int level,
@@ -1520,7 +1523,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn, int level,
unsigned long data))
@@ -1543,7 +1547,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1553,18 +1557,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
+ }
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn,
int level, unsigned long data)
{
@@ -1580,11 +1585,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
+ }
out:
return young;
}
@@ -1593,14 +1599,14 @@ out:
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
sp = page_header(__pa(spte));
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
@@ -1700,8 +1706,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
mmu_spte_clear_no_track(parent_pte);
}
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
struct kvm_mmu_page *sp;
@@ -1717,8 +1722,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
* this feature. See the comments in kvm_zap_obsolete_pages().
*/
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- sp->parent_ptes = 0;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -1726,7 +1729,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- pte_list_walk(&sp->parent_ptes, mark_unsync);
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
}
static void mark_unsync(u64 *spte)
@@ -1786,6 +1794,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
return (pvec->nr == KVM_PAGE_ARRAY_NR);
}
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1795,8 +1810,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (!is_shadow_present_pte(ent) || is_large_pte(ent))
- goto clear_child_bitmap;
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
child = page_header(ent & PT64_BASE_ADDR_MASK);
@@ -1805,28 +1822,21 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return -ENOSPC;
ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- goto clear_child_bitmap;
- else if (ret > 0)
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
nr_unsync_leaf += ret;
- else
+ } else
return ret;
} else if (child->unsync) {
nr_unsync_leaf++;
if (mmu_pages_add(pvec, child, i))
return -ENOSPC;
} else
- goto clear_child_bitmap;
-
- continue;
-
-clear_child_bitmap:
- __clear_bit(i, sp->unsync_child_bitmap);
- sp->unsync_children--;
- WARN_ON((int)sp->unsync_children < 0);
+ clear_unsync_child_bit(sp, i);
}
-
return nr_unsync_leaf;
}
@@ -1989,9 +1999,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
if (!sp)
return;
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
+ clear_unsync_child_bit(sp, idx);
level++;
} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
}
@@ -2033,14 +2041,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
}
-static void init_shadow_page_table(struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = 0ull;
-}
-
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
sp->write_flooding_count = 0;
@@ -2063,8 +2063,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int direct,
- unsigned access,
- u64 *parent_pte)
+ unsigned access)
{
union kvm_mmu_page_role role;
unsigned quadrant;
@@ -2096,21 +2095,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
break;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
+ if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_mmu_mark_parents_unsync(sp);
- } else if (sp->unsync)
- kvm_mmu_mark_parents_unsync(sp);
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
+
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
- if (!sp)
- return sp;
+
+ sp = kvm_mmu_alloc_page(vcpu, direct);
+
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link,
@@ -2124,7 +2120,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
account_shadowed(vcpu->kvm, sp);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
- init_shadow_page_table(sp);
+ clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -2178,7 +2174,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
{
u64 spte;
@@ -2186,12 +2183,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask;
-
- if (accessed)
- spte |= shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(vcpu, sp, sptep);
+
+ if (sp->unsync_children || sp->unsync)
+ mark_unsync(sptep);
}
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2250,17 +2249,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
mmu_page_zap_pte(kvm, sp, sp->spt + i);
}
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
- while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
drop_parent_pte(sp, sptep);
}
@@ -2456,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
-static bool kvm_is_mmio_pfn(pfn_t pfn)
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2466,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
- gfn_t gfn, pfn_t pfn, bool speculative,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
u64 spte;
@@ -2544,18 +2538,18 @@ done:
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int write_fault, int *emulate,
- int level, gfn_t gfn, pfn_t pfn, bool speculative,
- bool host_writable)
+static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
+ bool emulate = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
- if (is_rmap_spte(*sptep)) {
+ if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
@@ -2580,12 +2574,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- *emulate = 1;
+ emulate = true;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
- if (unlikely(is_mmio_spte(*sptep) && emulate))
- *emulate = 1;
+ if (unlikely(is_mmio_spte(*sptep)))
+ emulate = true;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2604,9 +2598,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
kvm_release_pfn_clean(pfn);
+
+ return emulate;
}
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -2638,9 +2634,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, access, 0, NULL,
- sp->role.level, gfn, page_to_pfn(pages[i]),
- true, true);
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
return 0;
}
@@ -2688,9 +2683,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn,
- bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
+ int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2702,9 +2696,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, &emulate, level, gfn, pfn,
- prefault, map_writable);
+ emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+ write, level, gfn, pfn, prefault,
+ map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2717,10 +2711,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
+ iterator.level - 1, 1, ACC_ALL);
- link_shadow_page(iterator.sptep, sp, true);
+ link_shadow_page(vcpu, iterator.sptep, sp);
}
}
return emulate;
@@ -2739,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
/*
* Do not cache the mmio info caused by writing the readonly gfn
@@ -2759,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+ gfn_t *gfnp, kvm_pfn_t *pfnp,
+ int *levelp)
{
- pfn_t pfn = *pfnp;
+ kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *gfnp;
int level = *levelp;
@@ -2800,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- pfn_t pfn, unsigned access, int *ret_val)
+ kvm_pfn_t pfn, unsigned access, int *ret_val)
{
bool ret = true;
@@ -2899,7 +2893,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
* If the mapping has been changed, let the vcpu fault on the
* same address again.
*/
- if (!is_rmap_spte(spte)) {
+ if (!is_shadow_present_pte(spte)) {
ret = true;
goto exit;
}
@@ -2954,7 +2948,7 @@ exit:
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2962,14 +2956,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- int force_pt_level;
- pfn_t pfn;
+ bool force_pt_level = false;
+ kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
/*
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2972,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, v, level, error_code))
return 0;
@@ -3000,11 +2992,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
- prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
-
return r;
out_unlock:
@@ -3079,8 +3069,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
- 1, ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3092,9 +3081,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
- i << 30,
- PT32_ROOT_LEVEL, 1, ACC_ALL,
- NULL);
+ i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3131,7 +3118,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL, NULL);
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3164,9 +3151,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, 0,
- ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3341,7 +3327,7 @@ exit:
return reserved;
}
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
@@ -3350,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
- if (unlikely(reserved))
+ if (WARN_ON(reserved))
return RET_MMIO_PF_BUG;
if (is_mmio_spte(spte)) {
@@ -3374,17 +3360,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
*/
return RET_MMIO_PF_RETRY;
}
-EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
-
-static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
- u32 error_code, bool direct)
-{
- int ret;
-
- ret = handle_mmio_page_fault_common(vcpu, addr, direct);
- WARN_ON(ret == RET_MMIO_PF_BUG);
- return ret;
-}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
@@ -3395,7 +3371,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gva, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -3427,7 +3403,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
- if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu)))
return false;
@@ -3435,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -3473,10 +3449,10 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
int r;
int level;
- int force_pt_level;
+ bool force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
@@ -3485,7 +3461,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ r = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -3495,20 +3471,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- if (mapping_level_dirty_bitmap(vcpu, gfn) ||
- !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
- force_pt_level = 1;
- else
- force_pt_level = 0;
-
+ force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
+ PT_DIRECTORY_LEVEL);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
- level = mapping_level(vcpu, gfn);
if (level > PT_DIRECTORY_LEVEL &&
!check_hugepage_cache_consistency(vcpu, gfn, level))
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- } else
- level = PT_PAGE_TABLE_LEVEL;
+ }
if (fast_page_fault(vcpu, gpa, level, error_code))
return 0;
@@ -3528,8 +3499,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn, prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -3706,7 +3676,7 @@ static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
int maxphyaddr, bool execonly)
{
- int pte;
+ u64 bad_mt_xwr;
rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3694,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
- for (pte = 0; pte < 64; pte++) {
- int rwx_bits = pte & 7;
- int mt = pte >> 3;
- if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
- rwx_bits == 0x2 || rwx_bits == 0x6 ||
- (rwx_bits == 0x4 && !execonly))
- rsvd_check->bad_mt_xwr |= (1ull << pte);
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
@@ -4053,10 +4025,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->inject_page_fault = kvm_inject_page_fault;
/*
- * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
- * translation of l2_gpa to l1_gpa addresses is done using the
- * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
- * functions between mmu and nested_mmu are swapped.
+ * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
if (!is_paging(vcpu)) {
g_context->nx = false;
@@ -4490,7 +4464,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
}
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
/* The caller should hold mmu-lock before calling this function. */
static bool
@@ -4584,9 +4558,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
spin_unlock(&kvm->mmu_lock);
}
-static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
{
- return __rmap_write_protect(kvm, rmapp, false);
+ return __rmap_write_protect(kvm, rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -4622,16 +4597,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- unsigned long *rmapp)
+ struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
int need_tlb_flush = 0;
- pfn_t pfn;
+ kvm_pfn_t pfn;
struct kvm_mmu_page *sp;
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e4202e41d535..55ffb7b0f95e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,13 +56,13 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
/*
- * Return values of handle_mmio_page_fault_common:
+ * Return values of handle_mmio_page_fault:
* RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
* directly.
* RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
* fault path update the mmio spte.
* RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: bug is detected.
+ * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
*/
enum {
RET_MMIO_PF_EMULATE = 1,
@@ -71,7 +71,7 @@ enum {
RET_MMIO_PF_BUG = -1
};
-int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
+int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 03d518e499a6..dcce533d420c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
hpa_t hpa;
sp = page_header(__pa(sptep));
@@ -129,7 +129,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *rev_sp;
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -150,8 +150,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!*rmapp) {
+ rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ if (!rmap_head->val) {
if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no rmap for writable spte %llx\n",
@@ -183,7 +183,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
return;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(sp->spt[i]))
+ if (!is_shadow_present_pte(sp->spt[i]))
continue;
inspect_spte_has_rmap(kvm, sp->spt + i);
@@ -192,7 +192,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
u64 *sptep;
struct rmap_iterator iter;
struct kvm_memslots *slots;
@@ -203,13 +203,14 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, sp->gfn);
- rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+ rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
+ }
}
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 9e8bf13572e6..3f8c732117ec 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -120,14 +120,22 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
-static u8 mtrr_disabled_type(void)
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
{
/*
* Intel SDM 11.11.2.2: all MTRRs are disabled when
* IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
* memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs and it is obviously undesirable to run the
+ * guest entirely with UC memory and we use WB.
*/
- return MTRR_TYPE_UNCACHABLE;
+ if (guest_cpuid_has_mtrr(vcpu))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
}
/*
@@ -267,7 +275,7 @@ static int fixed_mtrr_addr_to_seg(u64 addr)
for (seg = 0; seg < seg_num; seg++) {
mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
return seg;
}
@@ -300,7 +308,6 @@ static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
*start = range->base & PAGE_MASK;
mask = range->mask & PAGE_MASK;
- mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
/* This cannot overflow because writing to the reserved bits of
* variable MTRRs causes a #GP.
@@ -356,10 +363,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data;
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,6 +437,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
*pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
}
return 0;
@@ -670,7 +683,7 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
if (iter.mtrr_disabled)
- return mtrr_disabled_type();
+ return mtrr_disabled_type(vcpu);
/* not contained in any MTRRs. */
if (type == -1)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 736e6ab8784d..6c9fed957cce 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
unsigned pte_access;
gfn_t gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
@@ -475,8 +475,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
- gfn, pfn, true, true);
+ mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+ true, true);
return true;
}
@@ -551,12 +551,12 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate = 0;
+ int top_level, emulate;
direct_access = gw->pte_access;
@@ -587,7 +587,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access, it.sptep);
+ false, access);
}
/*
@@ -598,7 +598,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
if (sp)
- link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
+ link_shadow_page(vcpu, it.sptep, sp);
}
for (;
@@ -617,20 +617,18 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
+ true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
}
clear_sp_write_flooding_count(it.sptep);
- mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return emulate;
out_gpte_changed:
- if (sp)
- kvm_mmu_put_page(sp, it.sptep);
kvm_release_pfn_clean(pfn);
return 0;
}
@@ -696,17 +694,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- pfn_t pfn;
+ kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- int force_pt_level;
+ bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, addr, error_code,
- mmu_is_nested(vcpu));
+ r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
if (likely(r != RET_MMIO_PF_INVALID))
return r;
@@ -743,15 +740,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (walker.level >= PT_DIRECTORY_LEVEL)
- force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
- || is_self_change_mapping;
- else
- force_pt_level = 1;
- if (!force_pt_level) {
- level = min(walker.level, mapping_level(vcpu, walker.gfn));
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
+ if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
+ level = mapping_level(vcpu, walker.gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ level = min(walker.level, level);
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+ } else
+ force_pt_level = true;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f9ed1ff0632..c13a64b7d789 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -86,6 +86,7 @@ static const u32 host_save_user_msrs[] = {
MSR_FS_BASE,
#endif
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_TSC_AUX,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
@@ -135,6 +136,7 @@ struct vcpu_svm {
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
+ uint64_t tsc_aux;
u64 next_rip;
@@ -158,7 +160,8 @@ struct vcpu_svm {
unsigned long int3_rip;
u32 apf_reason;
- u64 tsc_ratio;
+ /* cached guest cpuid flags for faster access */
+ bool nrips_enabled : 1;
};
static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -211,7 +214,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
-static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -891,20 +893,9 @@ static __init int svm_hardware_setup(void)
kvm_enable_efer_bits(EFER_FFXSR);
if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- u64 max;
-
kvm_has_tsc_control = true;
-
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated needed because it will always
- * be 1 on all machines and a value of 0 is used to disable
- * tsc-scaling for the vcpu.
- */
- max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
-
- kvm_max_guest_tsc_khz = max;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
}
if (nested) {
@@ -968,68 +959,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 __scale_tsc(u64 ratio, u64 tsc)
-{
- u64 mult, frac, _tsc;
-
- mult = ratio >> 32;
- frac = ratio & ((1ULL << 32) - 1);
-
- _tsc = tsc;
- _tsc *= mult;
- _tsc += (tsc >> 32) * frac;
- _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
-
- return _tsc;
-}
-
-static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 _tsc = tsc;
-
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- _tsc = __scale_tsc(svm->tsc_ratio, tsc);
-
- return _tsc;
-}
-
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 ratio;
- u64 khz;
-
- /* Guest TSC same frequency as host TSC? */
- if (!scale) {
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
- return;
- }
-
- /* TSC scaling supported? */
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
- return;
- }
-
- khz = user_tsc_khz;
-
- /* TSC scaling required - calculate ratio */
- ratio = khz << 32;
- do_div(ratio, tsc_khz);
-
- if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
- WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
- return;
- }
- svm->tsc_ratio = ratio;
-}
-
static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1056,16 +985,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (host) {
- if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
- WARN_ON(adjustment < 0);
- adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
- }
-
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1077,16 +1000,7 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- u64 tsc;
-
- tsc = svm_scale_tsc(vcpu, rdtsc());
-
- return target_tsc - tsc;
-}
-
-static void init_vmcb(struct vcpu_svm *svm, bool init_event)
+static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1107,6 +1021,8 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
@@ -1157,8 +1073,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- if (!init_event)
- svm_set_efer(&svm->vcpu, 0);
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
@@ -1212,7 +1127,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
- init_vmcb(svm, init_event);
+ init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1233,8 +1148,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
- svm->tsc_ratio = TSC_RATIO_DEFAULT;
-
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1268,7 +1181,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- init_vmcb(svm, false);
+ init_vmcb(svm);
svm_init_osvw(&svm->vcpu);
@@ -1320,11 +1233,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
- svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, tsc_ratio);
+ wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
+ }
}
+ /* This assumes that the kernel never uses MSR_TSC_AUX */
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1642,20 +1560,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_bp_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- clr_exception_intercept(svm, DB_VECTOR);
clr_exception_intercept(svm, BP_VECTOR);
- if (svm->nmi_singlestep)
- set_exception_intercept(svm, DB_VECTOR);
-
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
} else
@@ -1761,7 +1672,6 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(&svm->vcpu);
}
if (svm->vcpu.guest_debug &
@@ -1796,6 +1706,12 @@ static int ud_interception(struct vcpu_svm *svm)
return 1;
}
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1890,7 +1806,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm, false);
+ init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2365,7 +2281,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
- nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ if (svm->nrips_enabled)
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +2978,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(svm);
- if (irqchip_in_kernel(svm->vcpu.kvm))
+ if (lapic_in_kernel(&svm->vcpu))
return r;
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
return r;
@@ -3071,8 +2989,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
- return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, host_tsc);
+ return vmcb->control.tsc_offset + host_tsc;
}
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3082,7 +2999,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, rdtsc());
+ kvm_scale_tsc(vcpu, rdtsc());
break;
}
@@ -3112,6 +3029,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->sysenter_esp;
break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+ msr_info->data = svm->tsc_aux;
+ break;
/*
* Nobody will change the following 5 values in the VMCB so we can
* safely return them on rdmsr. They will always be 0 until LBRV is
@@ -3141,6 +3063,23 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
default:
return kvm_get_msr_common(vcpu, msr_info);
}
@@ -3233,6 +3172,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp = data;
svm->vmcb->save.sysenter_esp = data;
break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+
+ /*
+ * This is rare, so we update the MSR here instead of using
+ * direct_access_msrs. Doing that would require a rdmsr in
+ * svm_vcpu_put.
+ */
+ svm->tsc_aux = data;
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ break;
case MSR_IA32_DEBUGCTLMSR:
if (!boot_cpu_has(X86_FEATURE_LBRV)) {
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
@@ -3294,24 +3245,11 @@ static int msr_interception(struct vcpu_svm *svm)
static int interrupt_window_interception(struct vcpu_svm *svm)
{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
return 1;
}
@@ -3371,6 +3309,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
[SVM_EXIT_NMI] = nmi_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -3522,6 +3461,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -3659,9 +3600,13 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
-static int svm_vm_has_apicv(struct kvm *kvm)
+static bool svm_get_enable_apicv(void)
+{
+ return false;
+}
+
+static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- return 0;
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -3754,7 +3699,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
*/
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_bp_intercept(vcpu);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3993,8 +3937,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
-
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
@@ -4098,6 +4040,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Update nrips enabled cache */
+ svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
}
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4134,7 +4080,7 @@ static int svm_get_lpage_level(void)
static bool svm_rdtscp_supported(void)
{
- return false;
+ return boot_cpu_has(X86_FEATURE_RDTSCP);
}
static bool svm_invpcid_supported(void)
@@ -4376,7 +4322,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .update_db_bp_intercept = update_db_bp_intercept,
+ .update_bp_intercept = update_bp_intercept,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -4425,7 +4371,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
- .vm_has_apicv = svm_vm_has_apicv,
+ .get_enable_apicv = svm_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
@@ -4448,11 +4395,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .set_tsc_khz = svm_set_tsc_khz,
.read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
- .adjust_tsc_offset = svm_adjust_tsc_offset,
- .compute_tsc_offset = svm_compute_tsc_offset,
+ .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4eae7c35ddf5..ad9f6a23f139 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio,
);
/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
* Tracepoint for cpuid.
*/
TRACE_EVENT(kvm_cpuid,
@@ -250,7 +268,7 @@ TRACE_EVENT(kvm_inj_virq,
#define kvm_trace_sym_exc \
EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
- EXS(MF), EXS(MC)
+ EXS(MF), EXS(AC), EXS(MC)
/*
* Tracepoint for kvm interrupt injection:
@@ -974,6 +992,302 @@ TRACE_EVENT(kvm_enter_smm,
__entry->smbase)
);
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
+ unsigned int gvec, u64 pi_desc_addr, bool set),
+ TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d msg send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64566ab..e2951b6edbbc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -19,6 +19,7 @@
#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
+#include "lapic.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
@@ -35,6 +36,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
+#include <asm/cpu.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
@@ -45,6 +47,7 @@
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
+#include <asm/irq_remapping.h>
#include "trace.h"
#include "pmu.h"
@@ -105,6 +108,8 @@ static u64 __read_mostly host_xss;
static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -424,6 +429,9 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u16 vpid02;
+ u16 last_vpid;
+
u32 nested_vmx_procbased_ctls_low;
u32 nested_vmx_procbased_ctls_high;
u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +448,33 @@ struct nested_vmx {
u32 nested_vmx_misc_low;
u32 nested_vmx_misc_high;
u32 nested_vmx_ept_caps;
+ u32 nested_vmx_vpid_caps;
};
#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
- u32 control; /* bit 0 of control is outstanding notification bit */
- u32 rsvd[7];
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
} __aligned(64);
static bool pi_test_and_set_on(struct pi_desc *pi_desc)
@@ -467,6 +494,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -532,8 +583,6 @@ struct vcpu_vmx {
s64 vnmi_blocked_time;
u32 exit_reason;
- bool rdtscp_enabled;
-
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -563,6 +612,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
@@ -809,7 +863,6 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -817,7 +870,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
@@ -831,6 +883,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1005,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
{
- return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1042,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
static inline bool cpu_has_vmx_posted_intr(void)
{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}
static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1122,9 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
- return flexpriority_enabled && irqchip_in_kernel(kvm);
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool cpu_has_vmx_vpid(void)
@@ -1113,6 +1173,12 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1157,6 +1223,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
}
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1408,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
__loaded_vmcs_clear, loaded_vmcs, 1);
}
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(int vpid)
{
- if (vmx->vpid == 0)
+ if (vpid == 0)
return;
if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1423,10 @@ static inline void vpid_sync_vcpu_global(void)
__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+static inline void vpid_sync_context(int vpid)
{
if (cpu_has_vmx_invvpid_single())
- vpid_sync_vcpu_single(vmx);
+ vpid_sync_vcpu_single(vpid);
else
vpid_sync_vcpu_global();
}
@@ -1376,7 +1447,51 @@ static inline void ept_sync_context(u64 eptp)
}
}
-static __always_inline unsigned long vmcs_readl(unsigned long field)
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -1387,23 +1502,32 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
static __always_inline u16 vmcs_read16(unsigned long field)
{
- return vmcs_readl(field);
+ vmcs_check16(field);
+ return __vmcs_readl(field);
}
static __always_inline u32 vmcs_read32(unsigned long field)
{
- return vmcs_readl(field);
+ vmcs_check32(field);
+ return __vmcs_readl(field);
}
static __always_inline u64 vmcs_read64(unsigned long field)
{
+ vmcs_check64(field);
#ifdef CONFIG_X86_64
- return vmcs_readl(field);
+ return __vmcs_readl(field);
#else
- return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
#endif
}
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ return __vmcs_readl(field);
+}
+
static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
@@ -1411,7 +1535,7 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value)
dump_stack();
}
-static void vmcs_writel(unsigned long field, unsigned long value)
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{
u8 error;
@@ -1421,33 +1545,46 @@ static void vmcs_writel(unsigned long field, unsigned long value)
vmwrite_error(field, value);
}
-static void vmcs_write16(unsigned long field, u16 value)
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
- vmcs_writel(field, value);
+ vmcs_check16(field);
+ __vmcs_writel(field, value);
}
-static void vmcs_write32(unsigned long field, u32 value)
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
- vmcs_writel(field, value);
+ vmcs_check32(field);
+ __vmcs_writel(field, value);
}
-static void vmcs_write64(unsigned long field, u64 value)
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
- vmcs_writel(field, value);
+ vmcs_check64(field);
+ __vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
- vmcs_writel(field+1, value >> 32);
+ __vmcs_writel(field+1, value >> 32);
#endif
}
-static void vmcs_clear_bits(unsigned long field, u32 mask)
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
- vmcs_writel(field, vmcs_readl(field) & ~mask);
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
}
-static void vmcs_set_bits(unsigned long field, u32 mask)
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
- vmcs_writel(field, vmcs_readl(field) | mask);
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
}
static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
@@ -1567,7 +1704,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR);
+ (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1895,6 +2032,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
preempt_enable();
}
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+ * are two possible cases:
+ * 1. After running 'pre_block', context switch
+ * happened. For this case, 'sn' was set in
+ * vmx_vcpu_put(), so we need to clear it here.
+ * 2. After running 'pre_block', we were blocked,
+ * and woken up by some other guy. For this case,
+ * we don't need to do anything, 'pi_post_block'
+ * will do everything for us. However, we cannot
+ * check whether it is case #1 or case #2 here
+ * (maybe, not needed), so we also clear sn here,
+ * I think it is not a big deal.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+ if (vcpu->cpu != cpu) {
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+ }
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ }
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -1943,12 +2126,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ /* Setup TSC multiplier */
+ if (cpu_has_vmx_tsc_scaling())
+ vmcs_write64(TSC_MULTIPLIER,
+ vcpu->arch.tsc_scaling_ratio);
+
vmx->loaded_vmcs->cpu = cpu;
}
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
+ vmx_vcpu_pi_put(vcpu);
+
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2413,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && vmx->rdtscp_enabled)
+ if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
@@ -2230,15 +2436,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
/*
* reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset -- 21.3
+ * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
+ * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
*/
-static u64 guest_read_tsc(void)
+static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
{
u64 host_tsc, tsc_offset;
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
- return host_tsc + tsc_offset;
+ return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
}
/*
@@ -2255,22 +2462,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-/*
- * Engage any workarounds for mis-matched TSC rates. Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
- if (!scale)
- return;
-
- if (user_tsc_khz > tsc_khz) {
- vcpu->arch.tsc_catchup = 1;
- vcpu->arch.tsc_always_catchup = 1;
- } else
- WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
{
return vmcs_read64(TSC_OFFSET);
@@ -2302,7 +2493,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
@@ -2315,11 +2506,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
offset + adjustment);
}
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
- return target_tsc - rdtsc();
-}
-
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2377,7 +2563,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (kvm_vcpu_apicv_active(&vmx->vcpu))
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_POSTED_INTR;
@@ -2471,10 +2657,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_PCOMMIT;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2681,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ if (enable_vpid)
+ vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+ else
+ vmx->nested.nested_vmx_vpid_caps = 0;
+
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2802,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = vmx->nested.nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps |
+ ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
break;
default:
return 1;
@@ -2642,7 +2837,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- msr_info->data = guest_read_tsc();
+ msr_info->data = guest_read_tsc(vcpu);
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -2673,7 +2868,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Otherwise falls through */
default:
@@ -2779,7 +2974,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!vmx->rdtscp_enabled)
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
@@ -2874,6 +3069,8 @@ static int hardware_enable(void)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
@@ -3015,7 +3212,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_ENABLE_PML;
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_PCOMMIT |
+ SECONDARY_EXEC_TSC_SCALING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3640,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
{
- vpid_sync_context(to_vmx(vcpu));
+ vpid_sync_context(vpid);
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -3451,6 +3650,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
}
}
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3848,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
- /*
- * SMEP/SMAP is disabled if CPU is in non-paging mode
- * in hardware. However KVM always uses paging mode to
- * emulate guest non-paging mode with TDP.
- * To emulate this behavior, SMEP/SMAP needs to be
- * manually disabled when guest switches to non-paging
- * mode.
- */
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
+ /*
+ * SMEP/SMAP is disabled if CPU is in non-paging mode in
+ * hardware. However KVM always uses paging mode without
+ * unrestricted guest.
+ * To emulate this behavior, SMEP/SMAP needs to be manually
+ * disabled when guest switches to non-paging mode.
+ */
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
return 0;
@@ -4046,7 +4251,7 @@ out:
static int init_rmode_identity_map(struct kvm *kvm)
{
int i, idx, r = 0;
- pfn_t identity_map_pfn;
+ kvm_pfn_t identity_map_pfn;
u32 tmp;
if (!enable_ept)
@@ -4146,29 +4351,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
return r;
}
-static void allocate_vpid(struct vcpu_vmx *vmx)
+static int allocate_vpid(void)
{
int vpid;
- vmx->vpid = 0;
if (!enable_vpid)
- return;
+ return 0;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
- if (vpid < VMX_NR_VPIDS) {
- vmx->vpid = vpid;
+ if (vpid < VMX_NR_VPIDS)
__set_bit(vpid, vmx_vpid_bitmap);
- }
+ else
+ vpid = 0;
spin_unlock(&vmx_vpid_lock);
+ return vpid;
}
-static void free_vpid(struct vcpu_vmx *vmx)
+static void free_vpid(int vpid)
{
- if (!enable_vpid)
+ if (!enable_vpid || vpid == 0)
return;
spin_lock(&vmx_vpid_lock);
- if (vmx->vpid != 0)
- __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ __clear_bit(vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
}
@@ -4323,9 +4527,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
msr, MSR_TYPE_W);
}
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static bool vmx_get_enable_apicv(void)
{
- return enable_apicv && irqchip_in_kernel(kvm);
+ return enable_apicv;
}
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4573,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_SMP
if (vcpu->mode == IN_GUEST_MODE) {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Currently, we don't support urgent interrupt,
+ * all interrupts are recognized as non-urgent
+ * interrupt, so we cannot post interrupts when
+ * 'SN' is set.
+ *
+ * If the vcpu is in guest mode, it means it is
+ * running instead of being scheduled out and
+ * waiting in the run queue, and that's the only
+ * case when 'SN' is set currently, warning if
+ * 'SN' is set.
+ */
+ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
POSTED_INTR_VECTOR);
return true;
@@ -4431,11 +4651,6 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
}
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -4505,11 +4720,18 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
return pin_based_exec_ctrl;
}
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+}
+
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4517,7 +4739,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4756,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (vmx->vpid == 0)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4770,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4780,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- /* PML is enabled/disabled in creating/destorying vcpu */
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ if (!enable_pml)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* Currently, we allow L1 guest to directly run pcommit instruction. */
+ exec_control &= ~SECONDARY_EXEC_PCOMMIT;
return exec_control;
}
@@ -4604,12 +4830,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
+ if (cpu_has_secondary_exec_ctrls())
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
- }
- if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+ if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4617,7 +4842,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
- vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
@@ -4709,7 +4934,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_write32(GUEST_CS_BASE, 0xffff0000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
@@ -4745,7 +4970,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
setup_msrs(vmx);
@@ -4753,7 +4978,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vcpu->kvm))
+ if (cpu_need_tpr_shadow(vcpu))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
__pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4986,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx_vm_has_apicv(vcpu->kvm))
+ if (kvm_vcpu_apicv_active(vcpu))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
if (vmx->vpid != 0)
@@ -4771,12 +4996,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr4(vcpu, 0);
- if (!init_event)
- vmx_set_efer(vcpu, 0);
+ vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
update_exception_bitmap(vcpu);
- vpid_sync_context(vmx);
+ vpid_sync_context(vmx->vpid);
}
/*
@@ -5104,6 +5328,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
case DB_VECTOR:
dr6 = vmcs_readl(EXIT_QUALIFICATION);
if (!(vcpu->guest_debug &
@@ -5296,7 +5523,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return 1;
if (cr8_prev <= cr8)
return 1;
@@ -5510,17 +5737,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
++vcpu->stat.irq_window_exits;
-
- /*
- * If the user space waits to inject interrupts, exit as soon as
- * possible
- */
- if (!irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(vcpu)) {
- vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
return 1;
}
@@ -5753,10 +5969,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
skip_emulated_instruction(vcpu);
+ trace_kvm_fast_mmio(gpa);
return 1;
}
- ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ ret = handle_mmio_page_fault(vcpu, gpa, true);
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -5910,6 +6127,25 @@ static void update_ple_window_actual_max(void)
ple_window_grow, INT_MIN);
}
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6028,13 +6264,10 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
- if (enable_apicv)
- kvm_x86_ops->update_cr8_intercept = NULL;
- else {
- kvm_x86_ops->hwapic_irr_update = NULL;
- kvm_x86_ops->hwapic_isr_update = NULL;
- kvm_x86_ops->deliver_posted_interrupt = NULL;
- kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
}
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
@@ -6096,6 +6329,8 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
return alloc_kvm_area();
out8:
@@ -6627,7 +6862,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
- u32 exec_control;
if (vmx->nested.current_vmptr == -1ull)
return;
@@ -6640,9 +6874,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6895,7 @@ static void free_nested(struct vcpu_vmx *vmx)
return;
vmx->nested.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7272,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
- u32 exec_control;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -7070,9 +7303,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
if (enable_shadow_vmcs) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
vmcs_write64(VMCS_LINK_POINTER,
__pa(vmx->nested.current_shadow_vmcs));
vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7410,58 @@ static int handle_invept(struct kvm_vcpu *vcpu)
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
- kvm_queue_exception(vcpu, UD_VECTOR);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ int vpid;
+
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+
+ if (!(types & (1UL << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return 1;
+ }
+
+ /* according to the intel vmx instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
+ sizeof(u32), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ /* Trap single context invalidation invvpid calls */
+ BUG_ON(1);
+ break;
+ }
+
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7207,6 +7490,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+ /* we never catch pcommit instruct for L1 guest. */
+ WARN_ON(1);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7547,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PCOMMIT] = handle_pcommit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7849,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PCOMMIT:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
default:
return true;
}
@@ -7569,10 +7862,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
-static int vmx_enable_pml(struct vcpu_vmx *vmx)
+static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
{
struct page *pml_pg;
- u32 exec_control;
pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pml_pg)
@@ -7583,24 +7875,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-
return 0;
}
-static void vmx_disable_pml(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
- u32 exec_control;
-
- ASSERT(vmx->pml_pg);
- __free_page(vmx->pml_pg);
- vmx->pml_pg = NULL;
-
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
}
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7676,7 +7959,7 @@ static void dump_vmcs(void)
u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
u32 secondary_exec_control = 0;
unsigned long cr4 = vmcs_readl(GUEST_CR4);
- u64 efer = vmcs_readl(GUEST_IA32_EFER);
+ u64 efer = vmcs_read64(GUEST_IA32_EFER);
int i, n;
if (cpu_has_secondary_exec_ctrls())
@@ -7692,10 +7975,10 @@ static void dump_vmcs(void)
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
(cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
{
- pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
- vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
- pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
- vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
}
pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
@@ -7716,16 +7999,16 @@ static void dump_vmcs(void)
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
(vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
- efer, vmcs_readl(GUEST_IA32_PAT));
- pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
- vmcs_readl(GUEST_IA32_DEBUGCTL),
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ efer, vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
- pr_err("PerfGlobCtl = 0x%016lx\n",
- vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
- pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
pr_err("Interruptibility = %08x ActivityState = %08x\n",
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
vmcs_read32(GUEST_ACTIVITY_STATE));
@@ -7754,11 +8037,12 @@ static void dump_vmcs(void)
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
- vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_EFER),
+ vmcs_read64(HOST_IA32_PAT));
if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- pr_err("PerfGlobCtl = 0x%016lx\n",
- vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
pr_err("*** Control State ***\n");
pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -7781,13 +8065,16 @@ static void dump_vmcs(void)
pr_err("IDTVectoring: info=%08x errcode=%08x\n",
vmcs_read32(IDT_VECTORING_INFO_FIELD),
vmcs_read32(IDT_VECTORING_ERROR_CODE));
- pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
- pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
n = vmcs_read32(CR3_TARGET_COUNT);
for (i = 0; i + 1 < n; i += 4)
pr_err("CR3 target%u=%016lx target%u=%016lx\n",
@@ -7814,6 +8101,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
@@ -7924,10 +8213,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
* apicv
*/
if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !vmx_vm_has_apicv(vcpu->kvm))
+ !kvm_vcpu_apicv_active(vcpu))
return;
- if (!vm_need_tpr_shadow(vcpu->kvm))
+ if (!cpu_need_tpr_shadow(vcpu))
return;
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8031,7 +8320,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
- if (!vmx_vm_has_apicv(vcpu->kvm))
+ if (!kvm_vcpu_apicv_active(vcpu))
return;
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8439,7 +8728,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
@@ -8477,8 +8765,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_pml)
- vmx_disable_pml(vmx);
- free_vpid(vmx);
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
free_nested(vmx);
@@ -8497,7 +8785,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx)
return ERR_PTR(-ENOMEM);
- allocate_vpid(vmx);
+ vmx->vpid = allocate_vpid();
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
@@ -8530,7 +8818,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
put_cpu();
if (err)
goto free_vmcs;
- if (vm_need_virtualize_apic_accesses(kvm)) {
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
goto free_vmcs;
@@ -8545,8 +8833,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (nested)
+ if (nested) {
nested_vmx_setup_ctls_msrs(vmx);
+ vmx->nested.vpid02 = allocate_vpid();
+ }
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8849,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
- err = vmx_enable_pml(vmx);
+ err = vmx_create_pml_buffer(vmx);
if (err)
goto free_vmcs;
}
@@ -8567,13 +8857,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
+ free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
free_vcpu:
- free_vpid(vmx);
+ free_vpid(vmx->vpid);
kmem_cache_free(kvm_vcpu_cache, vmx);
return ERR_PTR(err);
}
@@ -8648,49 +8939,68 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic, the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exec_control;
+ u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
- vmx->rdtscp_enabled = false;
if (vmx_rdtscp_supported()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- if (exec_control & SECONDARY_EXEC_RDTSCP) {
- best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
- if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
- vmx->rdtscp_enabled = true;
- else {
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+ if (!rdtscp_enabled)
+ secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
}
- if (nested && !vmx->rdtscp_enabled)
- vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
}
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
- best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
- guest_cpuid_has_pcid(vcpu)) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- } else {
- if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- exec_control);
- }
+ (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+ !guest_cpuid_has_pcid(vcpu))) {
+ secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
if (best)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+ if (guest_cpuid_has_pcommit(vcpu))
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_PCOMMIT;
+ else
+ vmx->nested.nested_vmx_secondary_ctls_high &=
+ ~SECONDARY_EXEC_PCOMMIT;
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9257,7 +9567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
*/
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
- vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write64(POSTED_INTR_DESC_ADDR,
page_to_phys(vmx->nested.pi_desc_page) +
(unsigned long)(vmcs12->posted_intr_desc_addr &
@@ -9298,13 +9608,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmx_secondary_exec_control(vmx);
- if (!vmx->rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_APIC_REGISTER_VIRT);
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_PCOMMIT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9633,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9743,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (enable_vpid) {
/*
- * Trivially support vpid by letting L2s share their parent
- * L1's vpid. TODO: move to a more elaborate solution, giving
- * each L2 its own vpid and exposing the vpid feature to L1.
+ * There is no direct mapping between vpid02 and vpid12, the
+ * vpid02 is per-vCPU for L0 and reused while the value of
+ * vpid12 is changed w/ one invvpid during nested vmentry.
+ * The vpid12 is allocated by L1 for L2, so it will not
+ * influence global bitmap(for vpid01 and vpid02 allocation)
+ * even if spawn a lot of nested vCPUs.
*/
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx_flush_tlb(vcpu);
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+ }
+ } else {
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ vmx_flush_tlb(vcpu);
+ }
+
}
if (nested_cpu_has_ept(vmcs12)) {
@@ -9906,7 +10228,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Additionally, restore L2's PDPTR to vmcs12.
*/
if (enable_ept) {
- vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
@@ -10278,6 +10600,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ /*
+ * We should not block the vCPU if
+ * an interrupt is posted for it.
+ */
+ if (pi_test_on(pi_desc) == 1) {
+ spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+
+ return 1;
+ }
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+ unsigned long flags;
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return;
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if(vcpu->pre_pcpu != -1) {
+ spin_lock_irqsave(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock_irqrestore(
+ &per_cpu(blocked_vcpu_on_cpu_lock,
+ vcpu->pre_pcpu), flags);
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ */
+
+ kvm_set_msi_irq(e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ continue;
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else {
+ /* suppress notification event before unposting */
+ pi_set_sn(vcpu_to_pi_desc(vcpu));
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ pi_clear_sn(vcpu_to_pi_desc(vcpu));
+ }
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -10297,7 +10814,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_db_bp_intercept = update_exception_bitmap,
+ .update_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -10347,7 +10864,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .vm_has_apicv = vmx_vm_has_apicv,
+ .get_enable_apicv = vmx_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10371,11 +10889,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .set_tsc_khz = vmx_set_tsc_khz,
.read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
- .adjust_tsc_offset = vmx_adjust_tsc_offset,
- .compute_tsc_offset = vmx_compute_tsc_offset,
+ .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,
@@ -10394,7 +10910,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
.pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a9a19830321..4244c2baf57d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
+#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@@ -90,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-struct kvm_x86_ops *kvm_x86_ops;
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-static bool ignore_msrs = 0;
+static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
@@ -102,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
-bool kvm_has_tsc_control;
+bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 kvm_max_guest_tsc_khz;
+u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+static u64 __read_mostly kvm_default_tsc_scaling_ratio;
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
-static u32 tsc_tolerance_ppm = 250;
+static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-static bool backwards_tsc_observed = false;
+static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -622,7 +630,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
- if ((cr0 ^ old_cr0) & X86_CR0_CD)
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
return 0;
@@ -663,9 +673,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
- if (!(xcr0 & XSTATE_FP))
+ if (!(xcr0 & XFEATURE_MASK_FP))
return 1;
- if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
return 1;
/*
@@ -673,23 +683,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see fx_init).
*/
- valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
- if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
return 1;
- if (xcr0 & XSTATE_AVX512) {
- if (!(xcr0 & XSTATE_YMM))
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
return 1;
- if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
- if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
kvm_update_cpuid(vcpu);
return 0;
}
@@ -788,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@@ -798,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@@ -940,7 +951,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
};
static unsigned num_msrs_to_save;
@@ -952,6 +963,11 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1153,7 +1169,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
++version;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
/*
* The guest calculates current wall clock time by adding
@@ -1240,14 +1257,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ WARN(1, "user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0)
- return;
+ if (this_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
@@ -1267,7 +1323,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1313,6 +1369,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1324,7 +1422,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1381,7 +1479,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
@@ -1438,6 +1536,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
#ifdef CONFIG_X86_64
static cycle_t read_tsc(void)
@@ -1574,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
@@ -1599,7 +1716,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz;
+ unsigned long flags, this_tsc_khz, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1636,7 +1753,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kernel_ns = get_kernel_ns();
}
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
@@ -1662,7 +1779,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ tgt_tsc_khz = kvm_has_tsc_control ?
+ vcpu->virtual_tsc_khz : this_tsc_khz;
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = this_tsc_khz;
@@ -1897,6 +2016,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ accumulate_steal_time(vcpu);
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -2047,12 +2168,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
- preempt_disable();
- accumulate_steal_time(vcpu);
- preempt_enable();
-
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@@ -2091,6 +2206,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
@@ -2295,6 +2411,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data);
break;
@@ -2434,6 +2551,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2448,6 +2566,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2585,6 +2704,11 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
@@ -2611,7 +2735,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -2627,7 +2751,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
- accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
@@ -2641,7 +2764,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
return 0;
@@ -2656,17 +2781,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+/*
+ * if userspace requested an interrupt window, check that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+}
+
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
- kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -2905,7 +3063,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
* Copy each region from the possibly compacted offset to the
* non-compacted offset.
*/
- valid = xstate_bv & ~XSTATE_FPSSE;
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
@@ -2943,7 +3101,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
* Copy each region from the non-compacted offset to the
* possibly compacted offset.
*/
- valid = xstate_bv & ~XSTATE_FPSSE;
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
@@ -2971,7 +3129,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.guest_fpu.state.fxsave,
sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XSTATE_FPSSE;
+ XFEATURE_MASK_FPSSE;
}
}
@@ -2991,7 +3149,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XSTATE_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -3051,6 +3209,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_HYPERV_SYNIC:
+ return kvm_hv_activate_synic(vcpu);
+ default:
+ return -EINVAL;
+ }
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3175,7 +3347,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vapic_addr va;
r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@@ -3302,9 +3474,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- kvm_set_tsc_khz(vcpu, user_tsc_khz);
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
- r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -3315,6 +3487,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_set_guest_paused(vcpu);
goto out;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -3424,41 +3605,38 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0, start = 0;
+ int start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3468,9 +3646,11 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3555,6 +3735,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (atomic_read(&kvm->online_vcpus))
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_split = true;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -3668,7 +3870,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -3692,7 +3894,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -3845,16 +4047,17 @@ static void kvm_init_msr_list(void)
/*
* Even MSRs that are valid in the host may not be exposed
- * to the guests in some cases. We could work around this
- * in VMX with the generic MSR save/load machinery, but it
- * is not really worthwhile since it will really only
- * happen with nested virtualization.
+ * to the guests in some cases.
*/
switch (msrs_to_save[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_x86_ops->mpx_supported())
continue;
break;
+ case MSR_TSC_AUX:
+ if (!kvm_x86_ops->rdtscp_supported())
+ continue;
+ break;
default:
break;
}
@@ -4059,6 +4262,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+ return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val,
unsigned int bytes,
@@ -4794,6 +5006,7 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
+ .read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -4935,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
int emulation_type)
{
gpa_t gpa = cr2;
- pfn_t pfn;
+ kvm_pfn_t pfn;
if (emulation_type & EMULTYPE_NO_REEXECUTE)
return false;
@@ -5666,7 +5879,7 @@ void kvm_arch_exit(void)
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
+ if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@@ -5701,6 +5914,12 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.apicv_active = false;
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -5765,17 +5984,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
}
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- vcpu->run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5786,13 +5998,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -5805,6 +6013,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!vcpu->arch.apic)
return;
+ if (vcpu->arch.apicv_active)
+ return;
+
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
@@ -6141,20 +6352,30 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+}
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
- u32 tmr[8];
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- memset(eoi_exit_bitmap, 0, 32);
- memset(tmr, 0, 32);
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
- kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+ else {
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ }
+ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
+ vcpu_to_synic(vcpu)->vec_bitmap, 256);
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
- kvm_apic_update_tmr(vcpu, tmr);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6167,7 +6388,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6205,8 +6426,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+
bool req_immediate_exit = false;
if (vcpu->requests) {
@@ -6257,6 +6480,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6267,6 +6501,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ r = 0;
+ goto out;
+ }
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
+ }
+
+ /*
+ * KVM_REQ_EVENT is not set when posted interrupts are set by
+ * VT-d hardware, so we have to update RVI unconditionally.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6285,13 +6553,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (kvm_x86_ops->hwapic_irr_update)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6334,6 +6595,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
+ trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
__kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -6346,8 +6609,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
@@ -6375,8 +6636,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
- rdtsc());
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -6427,10 +6687,15 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu)) {
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
@@ -6467,10 +6732,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
- if (kvm_vcpu_running(vcpu))
+ if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
- else
+ } else {
r = vcpu_block(kvm, vcpu);
+ }
+
if (r <= 0)
break;
@@ -6478,9 +6745,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
- if (dm_request_for_irq_injection(vcpu)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@@ -6607,7 +6875,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@@ -6931,7 +7199,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_db_bp_intercept(vcpu);
+ kvm_x86_ops->update_bp_intercept(vcpu);
r = 0;
@@ -7005,7 +7273,7 @@ static void fx_init(struct kvm_vcpu *vcpu)
/*
* Ensure guest xcr0 is valid for loading
*/
- vcpu->arch.xcr0 = XSTATE_FP;
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
}
@@ -7280,6 +7548,20 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated needed because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
kvm_init_msr_list();
return 0;
}
@@ -7307,7 +7589,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
- return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+ return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7321,6 +7603,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -7376,6 +7659,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
+ vcpu->arch.pending_external_vector = -1;
+
+ kvm_hv_vcpu_init(vcpu);
+
return 0;
fail_free_mce_banks:
@@ -7394,6 +7681,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
@@ -7401,7 +7689,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@@ -7788,6 +8076,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_cpu_has_interrupt(vcpu))
return true;
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
return false;
}
@@ -8028,7 +8319,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (kvm_x86_ops->update_pi_irte) {
+ irqfd->producer = prod;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+ }
+
+ return -EINVAL;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (!kvm_x86_ops->update_pi_irte) {
+ WARN_ON(irqfd->producer != NULL);
+ return;
+ }
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabed or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8043,3 +8386,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2f822cd886c2..f2afa5fe48a6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -180,9 +180,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
-#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
- | XSTATE_BNDREGS | XSTATE_BNDCSR \
- | XSTATE_AVX512)
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
extern u64 host_xcr0;
extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a0d09f6c6533..4ba229ac3f4f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1414,6 +1414,7 @@ __init void lguest_init(void)
pv_info.kernel_rpl = 1;
/* Everyone except Xen runs with this set. */
pv_info.shared_kernel_pmd = 1;
+ pv_info.features = 0;
/*
* We set up all the lguest overrides for sensitive operations. These
@@ -1472,7 +1473,6 @@ __init void lguest_init(void)
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
pv_mmu_ops.pte_update = lguest_pte_update;
- pv_mmu_ops.pte_update_defer = lguest_pte_update;
#ifdef CONFIG_X86_LOCAL_APIC
/* APIC read/write intercepts */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2587888d987..a501fa25da41 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
-lib-y := delay.o misc.o cmdline.o
+lib-y := delay.o misc.o cmdline.o cpu.o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
new file mode 100644
index 000000000000..aa417a97511c
--- /dev/null
+++ b/arch/x86/lib/cpu.c
@@ -0,0 +1,35 @@
+#include <linux/module.h>
+
+unsigned int x86_family(unsigned int sig)
+{
+ unsigned int x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+EXPORT_SYMBOL_GPL(x86_family);
+
+unsigned int x86_model(unsigned int sig)
+{
+ unsigned int fam, model;
+
+ fam = x86_family(sig);
+
+ model = (sig >> 4) & 0xf;
+
+ if (fam >= 0x6)
+ model += ((sig >> 16) & 0xf) << 4;
+
+ return model;
+}
+EXPORT_SYMBOL_GPL(x86_model);
+
+unsigned int x86_stepping(unsigned int sig)
+{
+ return sig & 0xf;
+}
+EXPORT_SYMBOL_GPL(x86_stepping);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 43623739c7cf..004c861b1648 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,6 +1,8 @@
#include <linux/module.h>
#include <linux/preempt.h>
#include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <asm/msr-trace.h>
struct msr *msrs_alloc(void)
{
@@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
+
+#ifdef CONFIG_TRACEPOINTS
+void do_trace_write_msr(unsigned msr, u64 val, int failed)
+{
+ trace_write_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_write_msr);
+EXPORT_TRACEPOINT_SYMBOL(write_msr);
+
+void do_trace_read_msr(unsigned msr, u64 val, int failed)
+{
+ trace_read_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_read_msr);
+EXPORT_TRACEPOINT_SYMBOL(read_msr);
+
+void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+{
+ trace_rdpmc(counter, val, failed);
+}
+EXPORT_SYMBOL(do_trace_rdpmc);
+EXPORT_TRACEPOINT_SYMBOL(rdpmc);
+
+#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 816488c0b97e..d388de72eaca 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -353,8 +353,12 @@ AVXcode: 1
17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
18: Grp16 (1A)
19:
-1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv
-1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv
+# Intel SDM opcode map does not list MPX instructions. For now using Gv for
+# bnd registers and Ev for everything else is OK because the instruction
+# decoder does not use the information except as an indication that there is
+# a ModR/M byte.
+1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev
+1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
1c:
1d:
1e:
@@ -732,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
# 0x0f 0x38 0xc0-0xff
+c8: sha1nexte Vdq,Wdq
+c9: sha1msg1 Vdq,Wdq
+ca: sha1msg2 Vdq,Wdq
+cb: sha256rnds2 Vdq,Wdq
+cc: sha256msg1 Vdq,Wdq
+cd: sha256msg2 Vdq,Wdq
db: VAESIMC Vdq,Wdq (66),(v1)
dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
@@ -790,6 +800,7 @@ AVXcode: 3
61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+cc: sha1rnds4 Vdq,Wdq,Ib
df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
f0: RORX Gy,Ey,Ib (F2),(v)
EndTable
@@ -874,7 +885,7 @@ GrpTable: Grp7
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B)
3: LIDT Ms
4: SMSW Mw/Rv
-5:
+5: rdpkru (110),(11B) | wrpkru (111),(11B)
6: LMSW Ew
7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
EndTable
@@ -888,6 +899,9 @@ EndTable
GrpTable: Grp9
1: CMPXCHG8B/16B Mq/Mdq
+3: xrstors
+4: xsavec
+5: xsaves
6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
EndTable
@@ -932,8 +946,8 @@ GrpTable: Grp15
3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE
5: XRSTOR | lfence (11B)
-6: XSAVEOPT | mfence (11B)
-7: clflush | sfence (11B)
+6: XSAVEOPT | clwb (66) | mfence (11B)
+7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B)
EndTable
GrpTable: Grp16
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index dd76a05729b0..024f6e971174 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -169,6 +169,76 @@ void fxch_i(void)
fpu_tag_word = tag_word;
}
+static void fcmovCC(void)
+{
+ /* fcmovCC st(i) */
+ int i = FPU_rm;
+ FPU_REG *st0_ptr = &st(0);
+ FPU_REG *sti_ptr = &st(i);
+ long tag_word = fpu_tag_word;
+ int regnr = top & 7;
+ int regnri = (top + i) & 7;
+ u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+ if (sti_tag == TAG_Empty) {
+ FPU_stack_underflow();
+ clear_C1();
+ return;
+ }
+ reg_copy(sti_ptr, st0_ptr);
+ tag_word &= ~(3 << (regnr * 2));
+ tag_word |= (sti_tag << (regnr * 2));
+ fpu_tag_word = tag_word;
+}
+
+void fcmovb(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_CF)
+ fcmovCC();
+}
+
+void fcmove(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_ZF)
+ fcmovCC();
+}
+
+void fcmovbe(void)
+{
+ if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))
+ fcmovCC();
+}
+
+void fcmovu(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_PF)
+ fcmovCC();
+}
+
+void fcmovnb(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_CF))
+ fcmovCC();
+}
+
+void fcmovne(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_ZF))
+ fcmovCC();
+}
+
+void fcmovnbe(void)
+{
+ if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)))
+ fcmovCC();
+}
+
+void fcmovnu(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_PF))
+ fcmovCC();
+}
+
void ffree_(void)
{
/* ffree st(i) */
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
index 4dae511c85ad..afbc4d805d66 100644
--- a/arch/x86/math-emu/fpu_emu.h
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -71,7 +71,7 @@
#include "fpu_system.h"
-#include <asm/sigcontext.h> /* for struct _fpstate */
+#include <uapi/asm/sigcontext.h> /* for struct _fpstate */
#include <asm/math_emu.h>
#include <linux/linkage.h>
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 3d8f2e421466..e945fedf1de2 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -40,49 +40,33 @@
#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
-#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */
+/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */
-/* WARNING: These codes are not documented by Intel in their 80486 manual
- and may not work on FPU clones or later Intel FPUs. */
-
-/* Changes to support the un-doc codes provided by Linus Torvalds. */
-
-#define _d9_d8_ fstp_i /* unofficial code (19) */
-#define _dc_d0_ fcom_st /* unofficial code (14) */
-#define _dc_d8_ fcompst /* unofficial code (1c) */
-#define _dd_c8_ fxch_i /* unofficial code (0d) */
-#define _de_d0_ fcompst /* unofficial code (16) */
-#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */
-#define _df_c8_ fxch_i /* unofficial code (0f) */
-#define _df_d0_ fstp_i /* unofficial code (17) */
-#define _df_d8_ fstp_i /* unofficial code (1f) */
+/* WARNING: "u" entries are not documented by Intel in their 80486 manual
+ and may not work on FPU clones or later Intel FPUs.
+ Changes to support them provided by Linus Torvalds. */
static FUNC const st_instr_table[64] = {
- fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
- fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
- fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
- fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
- fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
- fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
- fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
- fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
+/* Opcode: d8 d9 da db */
+/* dc dd de df */
+/* c0..7 */ fadd__, fld_i_, fcmovb, fcmovnb,
+/* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/
+/* c8..f */ fmul__, fxch_i, fcmove, fcmovne,
+/* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/
+/* d0..7 */ fcom_st, fp_nop, fcmovbe, fcmovnbe,
+/* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/
+/* d8..f */ fcompst, fstp_i,/*u*/ fcmovu, fcmovnu,
+/* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/
+/* e0..7 */ fsub__, FPU_etc, __BAD__, finit_,
+/* e0..7 */ fsubri, fucom_, fsubrp, fstsw_,
+/* e8..f */ fsubr_, fconst, fucompp, fucomi_,
+/* e8..f */ fsub_i, fucomp, fsubp_, fucomip,
+/* f0..7 */ fdiv__, FPU_triga, __BAD__, fcomi_,
+/* f0..7 */ fdivri, __BAD__, fdivrp, fcomip,
+/* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__,
+/* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__,
};
-#else /* Support only documented FPU op-codes */
-
-static FUNC const st_instr_table[64] = {
- fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
- fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
- fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
- fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
- fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
- fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
- fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
- fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
-};
-
-#endif /* NO_UNDOC_CODE */
-
#define _NONE_ 0 /* Take no special action */
#define _REG0_ 1 /* Need to check for not empty st(0) */
#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
@@ -94,36 +78,18 @@ static FUNC const st_instr_table[64] = {
#define _REGIc 0 /* Compare st(0) and st(rm) */
#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
-#ifndef NO_UNDOC_CODE
-
-/* Un-documented FPU op-codes supported by default. (see above) */
-
static u_char const type_table[64] = {
- _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
- _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
- _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
- _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
- _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
- _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
- _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
- _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
+/* Opcode: d8 d9 da db dc dd de df */
+/* c0..7 */ _REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_,
+/* c8..f */ _REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_,
+/* d0..7 */ _REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* d8..f */ _REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+/* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc,
+/* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc,
+/* f8..f */ _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
};
-#else /* Support only documented FPU op-codes */
-
-static u_char const type_table[64] = {
- _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
- _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
- _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
- _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
- _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
- _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
- _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
- _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
-};
-
-#endif /* NO_UNDOC_CODE */
-
#ifdef RE_ENTRANT_CHECKING
u_char emulating = 0;
#endif /* RE_ENTRANT_CHECKING */
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 9779df436b7d..caff438b9c1d 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -46,6 +46,14 @@ extern void fstsw_(void);
extern void fp_nop(void);
extern void fld_i_(void);
extern void fxch_i(void);
+extern void fcmovb(void);
+extern void fcmove(void);
+extern void fcmovbe(void);
+extern void fcmovu(void);
+extern void fcmovnb(void);
+extern void fcmovne(void);
+extern void fcmovnbe(void);
+extern void fcmovnu(void);
extern void ffree_(void);
extern void ffreep(void);
extern void fst_i_(void);
@@ -108,6 +116,10 @@ extern void fcompp(void);
extern void fucom_(void);
extern void fucomp(void);
extern void fucompp(void);
+extern void fcomi_(void);
+extern void fcomip(void);
+extern void fucomi_(void);
+extern void fucomip(void);
/* reg_constant.c */
extern void fconst(void);
/* reg_ld_str.c */
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index 2931ff355218..95228ff042c0 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -33,11 +33,12 @@
#define pop_0() { FPU_settag0(TAG_Empty); top++; }
+/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
static u_char const type_table[32] = {
- _PUSH_, _PUSH_, _PUSH_, _PUSH_,
- _null_, _null_, _null_, _null_,
- _REG0_, _REG0_, _REG0_, _REG0_,
- _REG0_, _REG0_, _REG0_, _REG0_,
+ _PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32, db:fild m32, dd:fld f64, df:fild m16 */
+ _null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef, db,dd,df:fisttp m32/64/16 */
+ _REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32, db:fist m32, dd:fst f64, df:fist m16 */
+ _REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */
_NONE_, _null_, _NONE_, _PUSH_,
_NONE_, _PUSH_, _null_, _PUSH_,
_NONE_, _null_, _NONE_, _REG0_,
@@ -45,15 +46,19 @@ static u_char const type_table[32] = {
};
u_char const data_sizes_16[32] = {
- 4, 4, 8, 2, 0, 0, 0, 0,
- 4, 4, 8, 2, 4, 4, 8, 2,
+ 4, 4, 8, 2,
+ 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+ 4, 4, 8, 2,
+ 4, 4, 8, 2,
14, 0, 94, 10, 2, 10, 0, 8,
14, 0, 94, 10, 2, 10, 2, 8
};
static u_char const data_sizes_32[32] = {
- 4, 4, 8, 2, 0, 0, 0, 0,
- 4, 4, 8, 2, 4, 4, 8, 2,
+ 4, 4, 8, 2,
+ 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+ 4, 4, 8, 2,
+ 4, 4, 8, 2,
28, 0, 108, 10, 2, 10, 0, 8,
28, 0, 108, 10, 2, 10, 2, 8
};
@@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
FPU_REG *st0_ptr;
u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
u_char loaded_tag;
+ int sv_cw;
st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
@@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
}
switch (type) {
- case 000: /* fld m32real */
+ /* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
+ case 000: /* fld m32real (d9 /0) */
clear_C1();
loaded_tag =
FPU_load_single((float __user *)data_address, &loaded_data);
@@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
}
FPU_copy_to_reg0(&loaded_data, loaded_tag);
break;
- case 001: /* fild m32int */
+ case 001: /* fild m32int (db /0) */
clear_C1();
loaded_tag =
FPU_load_int32((long __user *)data_address, &loaded_data);
FPU_copy_to_reg0(&loaded_data, loaded_tag);
break;
- case 002: /* fld m64real */
+ case 002: /* fld m64real (dd /0) */
clear_C1();
loaded_tag =
FPU_load_double((double __user *)data_address,
@@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
}
FPU_copy_to_reg0(&loaded_data, loaded_tag);
break;
- case 003: /* fild m16int */
+ case 003: /* fild m16int (df /0) */
clear_C1();
loaded_tag =
FPU_load_int16((short __user *)data_address, &loaded_data);
FPU_copy_to_reg0(&loaded_data, loaded_tag);
break;
+ /* case 004: undefined (d9 /1) */
+ /* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */
+ case 005: /* fisttp m32int (db /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int32
+ (st0_ptr, st0_tag, (long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
+ case 006: /* fisttp m64int (dd /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int64
+ (st0_ptr, st0_tag, (long long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
+ case 007: /* fisttp m16int (df /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int16
+ (st0_ptr, st0_tag, (short __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
case 010: /* fst m32real */
clear_C1();
FPU_store_single(st0_ptr, st0_tag,
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
index ecce55fc2e2e..b77360fdbf4a 100644
--- a/arch/x86/math-emu/reg_compare.c
+++ b/arch/x86/math-emu/reg_compare.c
@@ -249,6 +249,54 @@ static int compare_st_st(int nr)
return 0;
}
+static int compare_i_st_st(int nr)
+{
+ int f, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ partial_status &= ~SW_C0;
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ }
+
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = X86_EFLAGS_CF;
+ break;
+ case COMP_A_eq_B:
+ f = X86_EFLAGS_ZF;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+ break;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x122);
+ f = 0;
+ break;
+#endif /* PARANOID */
+ }
+ FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
static int compare_u_st_st(int nr)
{
int f = 0, c;
@@ -299,6 +347,58 @@ static int compare_u_st_st(int nr)
return 0;
}
+static int compare_ui_st_st(int nr)
+{
+ int f = 0, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ partial_status &= ~SW_C0;
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ if (c & COMP_SNaN) { /* This is the only difference between
+ un-ordered and ordinary comparisons */
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ }
+ return 0;
+ }
+
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = X86_EFLAGS_CF;
+ break;
+ case COMP_A_eq_B:
+ f = X86_EFLAGS_ZF;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+ break;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x123);
+ f = 0;
+ break;
+#endif /* PARANOID */
+ }
+ FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
/*---------------------------------------------------------------------------*/
void fcom_st(void)
@@ -348,3 +448,31 @@ void fucompp(void)
} else
FPU_illegal();
}
+
+/* P6+ compare-to-EFLAGS ops */
+
+void fcomi_(void)
+{
+ /* fcomi st(i) */
+ compare_i_st_st(FPU_rm);
+}
+
+void fcomip(void)
+{
+ /* fcomip st(i) */
+ if (!compare_i_st_st(FPU_rm))
+ FPU_pop();
+}
+
+void fucomi_(void)
+{
+ /* fucomi st(i) */
+ compare_ui_st_st(FPU_rm);
+}
+
+void fucomip(void)
+{
+ /* fucomip st(i) */
+ if (!compare_ui_st_st(FPU_rm))
+ FPU_pop();
+}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a482d105172b..f9d38a48e3c8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -14,7 +14,8 @@ obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
new file mode 100644
index 000000000000..bfcffdf6c577
--- /dev/null
+++ b/arch/x86/mm/debug_pagetables.c
@@ -0,0 +1,46 @@
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/pgtable.h>
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ ptdump_walk_pgd_level(m, NULL);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *pe;
+
+static int __init pt_dump_debug_init(void)
+{
+ pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
+ &ptdump_fops);
+ if (!pe)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit pt_dump_debug_exit(void)
+{
+ debugfs_remove_recursive(pe);
+}
+
+module_init(pt_dump_debug_init);
+module_exit(pt_dump_debug_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index f0cedf3395af..4a6f1d9b5106 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -32,6 +32,8 @@ struct pg_state {
const struct addr_marker *marker;
unsigned long lines;
bool to_dmesg;
+ bool check_wx;
+ unsigned long wx_pages;
};
struct addr_marker {
@@ -87,7 +89,7 @@ static struct addr_marker address_markers[] = {
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+ { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
@@ -155,7 +157,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
pt_dump_cont_printf(m, dmsg, " ");
if ((level == 4 && pr & _PAGE_PAT) ||
((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
- pt_dump_cont_printf(m, dmsg, "pat ");
+ pt_dump_cont_printf(m, dmsg, "PAT ");
else
pt_dump_cont_printf(m, dmsg, " ");
if (pr & _PAGE_GLOBAL)
@@ -198,8 +200,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
- cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
+ prot = pgprot_val(new_prot);
+ cur = pgprot_val(st->current_prot);
if (!st->level) {
/* First entry */
@@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
+ pgprotval_t pr = pgprot_val(st->current_prot);
+
+ if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
+ WARN_ONCE(1,
+ "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
+ (void *)st->start_address,
+ (void *)st->start_address);
+ st->wx_pages += (st->current_address -
+ st->start_address) / PAGE_SIZE;
+ }
/*
* Now print the actual finished series
@@ -269,13 +281,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
{
int i;
pte_t *start;
+ pgprotval_t prot;
start = (pte_t *) pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
- pgprot_t prot = pte_pgprot(*start);
-
+ prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- note_page(m, st, prot, 4);
+ note_page(m, st, __pgprot(prot), 4);
start++;
}
}
@@ -287,18 +299,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
{
int i;
pmd_t *start;
+ pgprotval_t prot;
start = (pmd_t *) pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
- pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
-
- if (pmd_large(*start) || !pmd_present(*start))
+ if (pmd_large(*start) || !pmd_present(*start)) {
+ prot = pmd_flags(*start);
note_page(m, st, __pgprot(prot), 3);
- else
+ } else {
walk_pte_level(m, st, *start,
P + i * PMD_LEVEL_MULT);
+ }
} else
note_page(m, st, __pgprot(0), 3);
start++;
@@ -318,19 +331,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
{
int i;
pud_t *start;
+ pgprotval_t prot;
start = (pud_t *) pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!pud_none(*start)) {
- pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
-
- if (pud_large(*start) || !pud_present(*start))
+ if (pud_large(*start) || !pud_present(*start)) {
+ prot = pud_flags(*start);
note_page(m, st, __pgprot(prot), 2);
- else
+ } else {
walk_pmd_level(m, st, *start,
P + i * PUD_LEVEL_MULT);
+ }
} else
note_page(m, st, __pgprot(0), 2);
@@ -344,13 +358,30 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
-void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+#ifdef CONFIG_X86_64
+static inline bool is_hypervisor_range(int idx)
+{
+ /*
+ * ffff800000000000 - ffff87ffffffffff is reserved for
+ * the hypervisor.
+ */
+ return paravirt_enabled() &&
+ (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+ (idx < pgd_index(__PAGE_OFFSET));
+}
+#else
+static inline bool is_hypervisor_range(int idx) { return false; }
+#endif
+
+static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+ bool checkwx)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
+ pgprotval_t prot;
int i;
struct pg_state st = {};
@@ -359,16 +390,20 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
st.to_dmesg = true;
}
+ st.check_wx = checkwx;
+ if (checkwx)
+ st.wx_pages = 0;
+
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
- if (!pgd_none(*start)) {
- pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
-
- if (pgd_large(*start) || !pgd_present(*start))
+ if (!pgd_none(*start) && !is_hypervisor_range(i)) {
+ if (pgd_large(*start) || !pgd_present(*start)) {
+ prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
- else
+ } else {
walk_pud_level(m, &st, *start,
i * PGD_LEVEL_MULT);
+ }
} else
note_page(m, &st, __pgprot(0), 1);
@@ -378,30 +413,28 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
/* Flush out the last page */
st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
note_page(m, &st, __pgprot(0), 0);
+ if (!checkwx)
+ return;
+ if (st.wx_pages)
+ pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
+ st.wx_pages);
+ else
+ pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}
-static int ptdump_show(struct seq_file *m, void *v)
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
- ptdump_walk_pgd_level(m, NULL);
- return 0;
+ ptdump_walk_pgd_level_core(m, pgd, false);
}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
-static int ptdump_open(struct inode *inode, struct file *filp)
+void ptdump_walk_pgd_level_checkwx(void)
{
- return single_open(filp, ptdump_show, NULL);
+ ptdump_walk_pgd_level_core(NULL, NULL, true);
}
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int pt_dump_init(void)
+static int __init pt_dump_init(void)
{
- struct dentry *pe;
-
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
@@ -412,11 +445,6 @@ static int pt_dump_init(void)
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
- pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
- &ptdump_fops);
- if (!pe)
- return -ENOMEM;
-
return 0;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 81bf3d2af3eb..6d5eb5900372 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -9,6 +9,7 @@
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
+#include <linux/memremap.h>
#include <asm/pgtable.h>
@@ -63,6 +64,16 @@ retry:
#endif
}
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ retry:
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
unsigned long mask;
+ int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 0;
}
- if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ page = pte_page(pte);
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ pte_unmap(ptep);
+ return 0;
+ }
+ } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
get_page(page);
+ put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -114,31 +135,58 @@ static inline void get_head_page_multiple(struct page *page, int nr)
SetPageReferenced(page);
}
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ unsigned long pfn = pmd_pfn(pmd);
+ struct dev_pagemap *pgmap = NULL;
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
- pte_t pte = *(pte_t *)&pmd;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_flags(pte) & mask) != mask)
+ if ((pmd_flags(pmd) & mask) != mask)
return 0;
+
+ VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+ if (pmd_devmap(pmd))
+ return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
+
/* hugepages are never "special" */
- VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
refs = 0;
- head = pte_page(pte);
+ head = pmd_page(pmd);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -159,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
@@ -195,27 +232,24 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
- pte_t pte = *(pte_t *)&pud;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
- if ((pte_flags(pte) & mask) != mask)
+ if ((pud_flags(pud) & mask) != mask)
return 0;
/* hugepages are never "special" */
- VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
refs = 0;
- head = pte_page(pte);
+ head = pud_page(pud);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index eecb207a2037..a6d739258137 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,20 +104,6 @@ void __kunmap_atomic(void *kvaddr)
}
EXPORT_SYMBOL(__kunmap_atomic);
-struct page *kmap_atomic_to_page(void *ptr)
-{
- unsigned long idx, vaddr = (unsigned long)ptr;
- pte_t *pte;
-
- if (vaddr < FIXADDR_START)
- return virt_to_page(ptr);
-
- idx = virt_to_fix(vaddr);
- pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
- return pte_page(*pte);
-}
-EXPORT_SYMBOL(kmap_atomic_to_page);
-
void __init set_highmem_pages_init(void)
{
struct zone *zone;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 42982b26e32b..740d7ac03a55 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt)
}
__setup("hugepagesz=", setup_hugepagesz);
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static __init int gigantic_pages_init(void)
{
- /* With CMA we can allocate gigantic pages at runtime */
+ /* With compaction or CMA we can allocate gigantic pages at runtime */
if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1d8a83df153a..493f54172b4a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -354,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
}
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+ pr_debug(" [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
page_size_string(&mr[i]));
@@ -401,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long ret = 0;
int nr_range, i;
- pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
start, end - 1);
memset(mr, 0, sizeof(mr));
@@ -693,14 +693,12 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
-#ifdef CONFIG_MICROCODE_EARLY
/*
* Remember, initrd memory may contain microcode or other useful things.
* Before we lose initrd mem, we need to find a place to hold them
* now that normal virtual memory is enabled.
*/
save_microcode_in_initrd();
-#endif
/*
* end could be not aligned, and We can not align that,
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 7562f42914b4..cb4ef3de61f9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -957,6 +957,8 @@ void mark_rodata_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
+ if (__supported_pte_mask & _PAGE_NX)
+ debug_checkwx();
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df48430c279b..5488d21123bd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order)
{
unsigned long magic;
unsigned int nr_pages = 1 << order;
+ struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
+
+ if (altmap) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
/* bootmem page has reserved flag */
if (PageReserved(page)) {
@@ -814,8 +821,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (IS_ALIGNED(addr, PAGE_SIZE) &&
- IS_ALIGNED(next, PAGE_SIZE)) {
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simplely not in use.
@@ -1018,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct page *page = pfn_to_page(start_pfn);
+ struct vmem_altmap *altmap;
struct zone *zone;
int ret;
- zone = page_zone(pfn_to_page(start_pfn));
- kernel_physical_mapping_remove(start, start + size);
+ /* With altmap the first mapped page is offset from @start */
+ altmap = to_vmem_altmap((unsigned long) page);
+ if (altmap)
+ page += vmem_altmap_offset(altmap);
+ zone = page_zone(page);
ret = __remove_pages(zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
+ kernel_physical_mapping_remove(start, start + size);
return ret;
}
@@ -1150,6 +1162,8 @@ void mark_rodata_ro(void)
free_init_pages("unused kernel",
(unsigned long) __va(__pa_symbol(rodata_end)),
(unsigned long) __va(__pa_symbol(_sdata)));
+
+ debug_checkwx();
}
#endif
@@ -1234,7 +1248,7 @@ static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
static int __meminit vmemmap_populate_hugepages(unsigned long start,
- unsigned long end, int node)
+ unsigned long end, int node, struct vmem_altmap *altmap)
{
unsigned long addr;
unsigned long next;
@@ -1257,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (pmd_none(*pmd)) {
void *p;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
pte_t entry;
@@ -1268,7 +1282,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
/* check to see if we have contiguous blocks */
if (p_end != p || node_start != node) {
if (p_start)
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
addr_start = addr;
node_start = node;
@@ -1278,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
continue;
- }
+ } else if (altmap)
+ return -ENOMEM; /* no fallback */
} else if (pmd_large(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
@@ -1292,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
+ struct vmem_altmap *altmap = to_vmem_altmap(start);
int err;
if (cpu_has_pse)
- err = vmemmap_populate_hugepages(start, end, node);
- else
+ err = vmemmap_populate_hugepages(start, end, node, altmap);
+ else if (altmap) {
+ pr_err_once("%s: no cpu support for altmap allocations\n",
+ __func__);
+ err = -ENOMEM;
+ } else
err = vmemmap_populate_basepages(start, end, node);
if (!err)
sync_global_pgds(start, end - 1, 0);
@@ -1366,7 +1386,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
void __meminit vmemmap_populate_print_last(void)
{
if (p_start) {
- printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
p_start = NULL;
p_end = NULL;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index b9c78f3bcd67..0d8d53d1f5cc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
- WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
- KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+ if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size))
+ pr_warn("caller %pS mapping multiple BARs\n", caller);
return ret_addr;
err_free_area:
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ce5da27b136..d470cf219a2d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -126,5 +126,5 @@ void __init kasan_init(void)
__flush_tlb_all();
init_task.kasan_depth = 0;
- pr_info("Kernel address sanitizer initialized\n");
+ pr_info("KernelAddressSanitizer initialized\n");
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 844b06d67df4..96bd1e2bffaf 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- /*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
if (mmap_is_ia32())
- rnd = (unsigned long)get_random_int() % (1<<8);
+#ifdef CONFIG_COMPAT
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+#else
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+#endif
else
- rnd = (unsigned long)get_random_int() % (1<<28);
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 134948b0926f..b2fd67da1701 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
switch (type) {
case REG_TYPE_RM:
regno = X86_MODRM_RM(insn->modrm.value);
- if (X86_REX_B(insn->rex_prefix.value) == 1)
+ if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_INDEX:
regno = X86_SIB_INDEX(insn->sib.value);
- if (X86_REX_X(insn->rex_prefix.value) == 1)
+ if (X86_REX_X(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_BASE:
regno = X86_SIB_BASE(insn->sib.value);
- if (X86_REX_B(insn->rex_prefix.value) == 1)
+ if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
@@ -237,7 +237,8 @@ bad_opcode:
*/
siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
{
- const struct bndreg *bndregs, *bndreg;
+ const struct mpx_bndreg_state *bndregs;
+ const struct mpx_bndreg *bndreg;
siginfo_t *info = NULL;
struct insn insn;
uint8_t bndregno;
@@ -258,13 +259,13 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
goto err_out;
}
/* get bndregs field from current task's xsave area */
- bndregs = get_xsave_field_ptr(XSTATE_BNDREGS);
+ bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
if (!bndregs) {
err = -EINVAL;
goto err_out;
}
/* now go select the individual register in the set of 4 */
- bndreg = &bndregs[bndregno];
+ bndreg = &bndregs->bndreg[bndregno];
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
@@ -306,7 +307,7 @@ err_out:
static __user void *mpx_get_bounds_dir(void)
{
- const struct bndcsr *bndcsr;
+ const struct mpx_bndcsr *bndcsr;
if (!cpu_feature_enabled(X86_FEATURE_MPX))
return MPX_INVALID_BOUNDS_DIR;
@@ -315,7 +316,7 @@ static __user void *mpx_get_bounds_dir(void)
* The bounds directory pointer is stored in a register
* only accessible if we first do an xsave.
*/
- bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
if (!bndcsr)
return MPX_INVALID_BOUNDS_DIR;
@@ -489,10 +490,10 @@ out_unmap:
static int do_mpx_bt_fault(void)
{
unsigned long bd_entry, bd_base;
- const struct bndcsr *bndcsr;
+ const struct mpx_bndcsr *bndcsr;
struct mm_struct *mm = current->mm;
- bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
if (!bndcsr)
return -EINVAL;
/*
@@ -585,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
}
/*
+ * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
+ * we might run off the end of the bounds table if we are on
+ * a 64-bit kernel and try to get 8 bytes.
+ */
+int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
+ long __user *bd_entry_ptr)
+{
+ u32 bd_entry_32;
+ int ret;
+
+ if (is_64bit_mm(mm))
+ return get_user(*bd_entry_ret, bd_entry_ptr);
+
+ /*
+ * Note that get_user() uses the type of the *pointer* to
+ * establish the size of the get, not the destination.
+ */
+ ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
+ *bd_entry_ret = bd_entry_32;
+ return ret;
+}
+
+/*
* Get the base of bounds tables pointed by specific bounds
* directory entry.
*/
@@ -604,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm,
int need_write = 0;
pagefault_disable();
- ret = get_user(bd_entry, bd_entry_ptr);
+ ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
pagefault_enable();
if (!ret)
break;
@@ -699,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
*/
static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
{
- unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
- if (is_64bit_mm(mm))
- return virt_space / MPX_BD_NR_ENTRIES_64;
- else
- return virt_space / MPX_BD_NR_ENTRIES_32;
+ unsigned long long virt_space;
+ unsigned long long GB = (1ULL << 30);
+
+ /*
+ * This covers 32-bit emulation as well as 32-bit kernels
+ * running on 64-bit harware.
+ */
+ if (!is_64bit_mm(mm))
+ return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
+
+ /*
+ * 'x86_virt_bits' returns what the hardware is capable
+ * of, and returns the full >32-bit adddress space when
+ * running 32-bit kernels on 64-bit hardware.
+ */
+ virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
+ return virt_space / MPX_BD_NR_ENTRIES_64;
}
/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index c3b3f653ed0c..d04f8094bc23 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -469,7 +469,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
{
int i, nid;
nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
- unsigned long start, end;
+ phys_addr_t start, end;
struct memblock_region *r;
/*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 2c44c0792301..2440814b0069 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -33,7 +33,7 @@ struct cpa_data {
pgd_t *pgd;
pgprot_t mask_set;
pgprot_t mask_clr;
- int numpages;
+ unsigned long numpages;
int flags;
unsigned long pfn;
unsigned force_split : 1;
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
static void split_page_count(int level)
{
+ if (direct_pages_count[level] == 0)
+ return;
+
direct_pages_count[level]--;
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
@@ -129,14 +132,16 @@ within(unsigned long addr, unsigned long start, unsigned long end)
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
- unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
+ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
void *vend = vaddr + size;
- void *p;
+
+ if (p >= vend)
+ return;
mb();
- for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
- p < vend; p += boot_cpu_data.x86_clflush_size)
+ for (; p < vend; p += clflush_size)
clflushopt(p);
mb();
@@ -414,18 +419,28 @@ pmd_t *lookup_pmd_address(unsigned long address)
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
unsigned long virt_addr = (unsigned long)__virt_addr;
- phys_addr_t phys_addr;
- unsigned long offset;
+ unsigned long phys_addr, offset;
enum pg_level level;
- unsigned long pmask;
pte_t *pte;
pte = lookup_address(virt_addr, &level);
BUG_ON(!pte);
- pmask = page_level_mask(level);
- offset = virt_addr & ~pmask;
- phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
- return (phys_addr | offset);
+
+ switch (level) {
+ case PG_LEVEL_1G:
+ phys_addr = pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PUD_PAGE_MASK;
+ break;
+ case PG_LEVEL_2M:
+ phys_addr = pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PMD_PAGE_MASK;
+ break;
+ default:
+ phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PAGE_MASK;
+ }
+
+ return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
@@ -458,7 +473,7 @@ static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
+ unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
@@ -478,17 +493,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
switch (level) {
case PG_LEVEL_2M:
-#ifdef CONFIG_X86_64
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ old_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
case PG_LEVEL_1G:
-#endif
- psize = page_level_size(level);
- pmask = page_level_mask(level);
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ old_pfn = pud_pfn(*(pud_t *)kpte);
break;
default:
do_split = -EINVAL;
goto out_unlock;
}
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
/*
* Calculate the number of pages, which fit into this large
* page starting at address:
@@ -504,7 +523,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* up accordingly.
*/
old_pte = *kpte;
- old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
+ req_prot = pgprot_large_2_4k(old_prot);
pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
@@ -530,10 +549,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
req_prot = canon_pgprot(req_prot);
/*
- * old_pte points to the large page base address. So we need
+ * old_pfn points to the large page base pfn. So we need
* to add the offset of the virtual address:
*/
- pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
new_prot = static_protections(req_prot, address, pfn);
@@ -544,7 +563,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* the pages in the range we try to preserve:
*/
addr = address & pmask;
- pfn = pte_pfn(old_pte);
+ pfn = old_pfn;
for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
@@ -574,7 +593,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* The address is aligned and the number of pages
* covers the full page.
*/
- new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
+ new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
@@ -591,7 +610,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
pte_t *pbase = (pte_t *)page_address(base);
- unsigned long pfn, pfninc = 1;
+ unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
pte_t *tmp;
pgprot_t ref_prot;
@@ -608,26 +627,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
}
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
- ref_prot = pte_pgprot(pte_clrhuge(*kpte));
- /* promote PAT bit to correct position */
- if (level == PG_LEVEL_2M)
+ switch (level) {
+ case PG_LEVEL_2M:
+ ref_prot = pmd_pgprot(*(pmd_t *)kpte);
+ /* clear PSE and promote PAT bit to correct position */
ref_prot = pgprot_large_2_4k(ref_prot);
+ ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
-#ifdef CONFIG_X86_64
- if (level == PG_LEVEL_1G) {
+ case PG_LEVEL_1G:
+ ref_prot = pud_pgprot(*(pud_t *)kpte);
+ ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+
/*
- * Set the PSE flags only if the PRESENT flag is set
+ * Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true
* even on a non present pmd.
*/
- if (pgprot_val(ref_prot) & _PAGE_PRESENT)
- pgprot_val(ref_prot) |= _PAGE_PSE;
- else
+ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE;
+ break;
+
+ default:
+ spin_unlock(&pgd_lock);
+ return 1;
}
-#endif
/*
* Set the GLOBAL flags only if the PRESENT flag is set
@@ -643,13 +669,16 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
/*
* Get the target pfn from the original entry:
*/
- pfn = pte_pfn(*kpte);
+ pfn = ref_pfn;
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
- if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
- PFN_DOWN(__pa(address)) + 1))
- split_page_count(level);
+ if (virt_addr_valid(address)) {
+ unsigned long pfn = PFN_DOWN(__pa(address));
+
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ split_page_count(level);
+ }
/*
* Install the new, split up pagetable.
@@ -1321,7 +1350,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* CPA operation. Either a large page has been
* preserved or a single page update happened.
*/
- BUG_ON(cpa->numpages > numpages);
+ BUG_ON(cpa->numpages > numpages || !cpa->numpages);
numpages -= cpa->numpages;
if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
cpa->curpage++;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 188e3e07eeeb..f4ae536b0914 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,7 @@
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -586,7 +587,7 @@ int free_memtype(u64 start, u64 end)
entry = rbt_memtype_erase(start, end);
spin_unlock(&memtype_lock);
- if (!entry) {
+ if (IS_ERR(entry)) {
pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid, start, end - 1);
return -EINVAL;
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
}
int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn)
+ pfn_t pfn)
{
enum page_cache_mode pcm;
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
return 0;
/* Set prot based on lookup */
- pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+ pcm = lookup_memtype(pfn_t_to_phys(pfn));
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
cachemode2protval(pcm));
@@ -992,6 +993,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
vma->vm_flags &= ~VM_PAT;
}
+/*
+ * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
+ * with the old vma after its pfnmap page table has been removed. The new
+ * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ */
+void untrack_pfn_moved(struct vm_area_struct *vma)
+{
+ vma->vm_flags &= ~VM_PAT;
+}
+
pgprot_t pgprot_writecombine(pgprot_t prot)
{
return __pgprot(pgprot_val(prot) |
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 63931080366a..2f7702253ccf 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
return last_lower; /* Returns NULL if there is no overlap */
}
-static struct memtype *memtype_rb_exact_match(struct rb_root *root,
- u64 start, u64 end)
+enum {
+ MEMTYPE_EXACT_MATCH = 0,
+ MEMTYPE_END_MATCH = 1
+};
+
+static struct memtype *memtype_rb_match(struct rb_root *root,
+ u64 start, u64 end, int match_type)
{
struct memtype *match;
@@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
while (match != NULL && match->start < end) {
struct rb_node *node;
- if (match->start == start && match->end == end)
+ if ((match_type == MEMTYPE_EXACT_MATCH) &&
+ (match->start == start) && (match->end == end))
+ return match;
+
+ if ((match_type == MEMTYPE_END_MATCH) &&
+ (match->start < start) && (match->end == end))
return match;
node = rb_next(&match->rb);
@@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root,
match = NULL;
}
- return NULL; /* Returns NULL if there is no exact match */
+ return NULL; /* Returns NULL if there is no match */
}
static int memtype_rb_check_conflict(struct rb_root *root,
@@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)
{
struct memtype *data;
- data = memtype_rb_exact_match(&memtype_rbroot, start, end);
- if (!data)
- goto out;
+ /*
+ * Since the memtype_rbroot tree allows overlapping ranges,
+ * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free
+ * a whole node for the munmap case. If no such entry is found,
+ * it then checks with END_MATCH, i.e. shrink the size of a node
+ * from the end for the mremap case.
+ */
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_EXACT_MATCH);
+ if (!data) {
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_END_MATCH);
+ if (!data)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (data->start == start) {
+ /* munmap: erase this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ } else {
+ /* mremap: update the end value of this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ data->end = start;
+ data->subtree_max_end = data->end;
+ memtype_rb_insert(&memtype_rbroot, data);
+ return NULL;
+ }
- rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
-out:
return data;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..4eb287e25043 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -414,7 +414,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
- pte_update_defer(vma->vm_mm, address, ptep);
+ pte_update(vma->vm_mm, address, ptep);
}
return changed;
@@ -431,7 +431,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*pmdp = entry;
- pmd_update_defer(vma->vm_mm, address, pmdp);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -469,9 +468,6 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *)pmdp);
- if (ret)
- pmd_update(vma->vm_mm, addr, pmdp);
-
return ret;
}
#endif
@@ -509,20 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
return young;
}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- int set;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)pmdp);
- if (set) {
- pmd_update(vma->vm_mm, address, pmdp);
- /* need tlb flush only to serialize against gup-fast */
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
-}
#endif
/**
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 90555bf60aa4..92e2eacb3321 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -31,7 +31,7 @@ early_param("noexec", noexec_setup);
void x86_configure_nx(void)
{
- if (cpu_has_nx && !disable_nx)
+ if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
else
__supported_pte_mask &= ~_PAGE_NX;
@@ -39,7 +39,7 @@ void x86_configure_nx(void)
void __init x86_report_nx(void)
{
- if (!cpu_has_nx) {
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"missing in CPU!\n");
} else {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index c2aea63bee20..b5f821881465 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
(unsigned long long)start, (unsigned long long)end - 1);
+ max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
+
return 0;
out_err_bad_srat:
bad_srat();
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
preempt_disable();
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+ /* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
+
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
- if (current->active_mm != mm)
+ if (current->active_mm != mm) {
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
+ }
if (!current->mm) {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
}
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+ /*
+ * Both branches below are implicit full barriers (MOV to CR or
+ * INVLPG) that synchronize with switch_mm.
+ */
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
preempt_disable();
if (current->active_mm == mm) {
- if (current->mm)
+ if (current->mm) {
+ /*
+ * Implicit full barrier (INVLPG) that synchronizes
+ * with switch_mm.
+ */
__flush_tlb_one(start);
- else
+ } else {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+ }
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 70efcd0940f9..4286f3618bd0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -193,7 +193,7 @@ struct jit_context {
32 /* space for rbx, r13, r14, r15 */ + \
8 /* space for skb_copy_bits() buffer */)
-#define PROLOGUE_SIZE 51
+#define PROLOGUE_SIZE 48
/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
@@ -229,11 +229,15 @@ static void emit_prologue(u8 **pprog)
/* mov qword ptr [rbp-X],r15 */
EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
- /* clear A and X registers */
- EMIT2(0x31, 0xc0); /* xor eax, eax */
- EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+ /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
+ * we need to reset the counter to 0. It's done in two instructions,
+ * resetting rax register to 0 (xor on eax gets 0 extended), and
+ * moving it to the counter location.
+ */
- /* clear tail_cnt: mov qword ptr [rbp-X], rax */
+ /* xor eax, eax */
+ EMIT2(0x31, 0xc0);
+ /* mov qword ptr [rbp-X], rax */
EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
@@ -455,6 +459,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
case BPF_ALU | BPF_MOV | BPF_K:
+ /* optimization: if imm32 is zero, use 'xor <dst>,<dst>'
+ * to save 3 bytes.
+ */
+ if (imm32 == 0) {
+ if (is_ereg(dst_reg))
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+ break;
+ }
+
/* mov %eax, imm32 */
if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
@@ -469,6 +485,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
return -EINVAL;
}
+ /* optimization: if imm64 is zero, use 'xor <dst>,<dst>'
+ * to save 7 bytes.
+ */
+ if (insn[0].imm == 0 && insn[1].imm == 0) {
+ b1 = add_2mod(0x48, dst_reg, dst_reg);
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg));
+
+ insn++;
+ i++;
+ break;
+ }
+
/* movabsq %rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
EMIT(insn[0].imm, 4);
@@ -1109,7 +1139,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
prog->bpf_func = (void *)image;
- prog->jited = true;
+ prog->jited = 1;
}
out:
kfree(addrs);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 5c6fc3577a49..97062a635b77 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -23,6 +23,8 @@ obj-y += bus_numa.o
obj-$(CONFIG_AMD_NB) += amd_bus.o
obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
+obj-$(CONFIG_VMD) += vmd.o
+
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index ff9911707160..3cd69832d7f4 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,16 +4,15 @@
#include <linux/irq.h>
#include <linux/dmi.h>
#include <linux/slab.h>
+#include <linux/pci-acpi.h>
#include <asm/numa.h>
#include <asm/pci_x86.h>
struct pci_root_info {
- struct acpi_device *bridge;
- char name[16];
+ struct acpi_pci_root_info common;
struct pci_sysdata sd;
#ifdef CONFIG_PCI_MMCONFIG
bool mcfg_added;
- u16 segment;
u8 start_bus;
u8 end_bus;
#endif
@@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr)
return 0;
}
-static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
- u8 end, phys_addr_t addr)
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
{
- int result;
- struct device *dev = &info->bridge->dev;
+ int result, seg;
+ struct pci_root_info *info;
+ struct acpi_pci_root *root = ci->root;
+ struct device *dev = &ci->bridge->dev;
- info->start_bus = start;
- info->end_bus = end;
+ info = container_of(ci, struct pci_root_info, common);
+ info->start_bus = (u8)root->secondary.start;
+ info->end_bus = (u8)root->secondary.end;
info->mcfg_added = false;
+ seg = info->sd.domain;
/* return success if MMCFG is not in use */
if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
@@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
if (!(pci_probe & PCI_PROBE_MMCONF))
return check_segment(seg, dev, "MMCONFIG is disabled,");
- result = pci_mmconfig_insert(dev, seg, start, end, addr);
+ result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus,
+ root->mcfg_addr);
if (result == 0) {
/* enable MMCFG if it hasn't been enabled yet */
if (raw_pci_ext_ops == NULL)
@@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start,
return 0;
}
-static void teardown_mcfg_map(struct pci_root_info *info)
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
{
+ struct pci_root_info *info;
+
+ info = container_of(ci, struct pci_root_info, common);
if (info->mcfg_added) {
- pci_mmconfig_delete(info->segment, info->start_bus,
- info->end_bus);
+ pci_mmconfig_delete(info->sd.domain,
+ info->start_bus, info->end_bus);
info->mcfg_added = false;
}
}
#else
-static int setup_mcfg_map(struct pci_root_info *info,
- u16 seg, u8 start, u8 end,
- phys_addr_t addr)
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
{
return 0;
}
-static void teardown_mcfg_map(struct pci_root_info *info)
+
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
{
}
#endif
-static void validate_resources(struct device *dev, struct list_head *crs_res,
- unsigned long type)
+static int pci_acpi_root_get_node(struct acpi_pci_root *root)
{
- LIST_HEAD(list);
- struct resource *res1, *res2, *root = NULL;
- struct resource_entry *tmp, *entry, *entry2;
-
- BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0);
- root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource;
-
- list_splice_init(crs_res, &list);
- resource_list_for_each_entry_safe(entry, tmp, &list) {
- bool free = false;
- resource_size_t end;
-
- res1 = entry->res;
- if (!(res1->flags & type))
- goto next;
-
- /* Exclude non-addressable range or non-addressable portion */
- end = min(res1->end, root->end);
- if (end <= res1->start) {
- dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n",
- res1);
- free = true;
- goto next;
- } else if (res1->end != end) {
- dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n",
- res1, (unsigned long long)end + 1,
- (unsigned long long)res1->end);
- res1->end = end;
- }
-
- resource_list_for_each_entry(entry2, crs_res) {
- res2 = entry2->res;
- if (!(res2->flags & type))
- continue;
-
- /*
- * I don't like throwing away windows because then
- * our resources no longer match the ACPI _CRS, but
- * the kernel resource tree doesn't allow overlaps.
- */
- if (resource_overlaps(res1, res2)) {
- res2->start = min(res1->start, res2->start);
- res2->end = max(res1->end, res2->end);
- dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n",
- res2, res1);
- free = true;
- goto next;
- }
- }
+ int busnum = root->secondary.start;
+ struct acpi_device *device = root->device;
+ int node = acpi_get_node(device->handle);
-next:
- resource_list_del(entry);
- if (free)
- resource_list_free_entry(entry);
- else
- resource_list_add_tail(entry, crs_res);
+ if (node == NUMA_NO_NODE) {
+ node = x86_pci_root_bus_node(busnum);
+ if (node != 0 && node != NUMA_NO_NODE)
+ dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
+ node);
}
+ if (node != NUMA_NO_NODE && !node_online(node))
+ node = NUMA_NO_NODE;
+
+ return node;
}
-static void add_resources(struct pci_root_info *info,
- struct list_head *resources,
- struct list_head *crs_res)
+static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci)
{
- struct resource_entry *entry, *tmp;
- struct resource *res, *conflict, *root = NULL;
-
- validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM);
- validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO);
-
- resource_list_for_each_entry_safe(entry, tmp, crs_res) {
- res = entry->res;
- if (res->flags & IORESOURCE_MEM)
- root = &iomem_resource;
- else if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
- else
- BUG_ON(res);
-
- conflict = insert_resource_conflict(root, res);
- if (conflict) {
- dev_info(&info->bridge->dev,
- "ignoring host bridge window %pR (conflicts with %s %pR)\n",
- res, conflict->name, conflict);
- resource_list_destroy_entry(entry);
- }
- }
-
- list_splice_tail(crs_res, resources);
+ return setup_mcfg_map(ci);
}
-static void release_pci_root_info(struct pci_host_bridge *bridge)
+static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
{
- struct resource *res;
- struct resource_entry *entry;
- struct pci_root_info *info = bridge->release_data;
-
- resource_list_for_each_entry(entry, &bridge->windows) {
- res = entry->res;
- if (res->parent &&
- (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
- release_resource(res);
- }
-
- teardown_mcfg_map(info);
- kfree(info);
+ teardown_mcfg_map(ci);
+ kfree(container_of(ci, struct pci_root_info, common));
}
/*
@@ -358,50 +282,47 @@ static bool resource_is_pcicfg_ioport(struct resource *res)
res->start == 0xCF8 && res->end == 0xCFF;
}
-static void probe_pci_root_info(struct pci_root_info *info,
- struct acpi_device *device,
- int busnum, int domain,
- struct list_head *list)
+static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
{
- int ret;
+ struct acpi_device *device = ci->bridge;
+ int busnum = ci->root->secondary.start;
struct resource_entry *entry, *tmp;
+ int status;
- sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
- info->bridge = device;
- ret = acpi_dev_get_resources(device, list,
- acpi_dev_filter_resource_type_cb,
- (void *)(IORESOURCE_IO | IORESOURCE_MEM));
- if (ret < 0)
- dev_warn(&device->dev,
- "failed to parse _CRS method, error code %d\n", ret);
- else if (ret == 0)
- dev_dbg(&device->dev,
- "no IO and memory resources present in _CRS\n");
- else
- resource_list_for_each_entry_safe(entry, tmp, list) {
- if ((entry->res->flags & IORESOURCE_DISABLED) ||
- resource_is_pcicfg_ioport(entry->res))
+ status = acpi_pci_probe_root_resources(ci);
+ if (pci_use_crs) {
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
+ if (resource_is_pcicfg_ioport(entry->res))
resource_list_destroy_entry(entry);
- else
- entry->res->name = info->name;
- }
+ return status;
+ }
+
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
+ dev_printk(KERN_DEBUG, &device->dev,
+ "host bridge window %pR (ignored)\n", entry->res);
+ resource_list_destroy_entry(entry);
+ }
+ x86_pci_root_bus_resources(busnum, &ci->resources);
+
+ return 0;
}
+static struct acpi_pci_root_ops acpi_pci_root_ops = {
+ .pci_ops = &pci_root_ops,
+ .init_info = pci_acpi_root_init_info,
+ .release_info = pci_acpi_root_release_info,
+ .prepare_resources = pci_acpi_root_prepare_resources,
+};
+
struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
{
- struct acpi_device *device = root->device;
- struct pci_root_info *info;
int domain = root->segment;
int busnum = root->secondary.start;
- struct resource_entry *res_entry;
- LIST_HEAD(crs_res);
- LIST_HEAD(resources);
+ int node = pci_acpi_root_get_node(root);
struct pci_bus *bus;
- struct pci_sysdata *sd;
- int node;
if (pci_ignore_seg)
- domain = 0;
+ root->segment = domain = 0;
if (domain && !pci_domains_supported) {
printk(KERN_WARNING "pci_bus %04x:%02x: "
@@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
return NULL;
}
- node = acpi_get_node(device->handle);
- if (node == NUMA_NO_NODE) {
- node = x86_pci_root_bus_node(busnum);
- if (node != 0 && node != NUMA_NO_NODE)
- dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
- node);
- }
-
- if (node != NUMA_NO_NODE && !node_online(node))
- node = NUMA_NO_NODE;
-
- info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
- if (!info) {
- printk(KERN_WARNING "pci_bus %04x:%02x: "
- "ignored (out of memory)\n", domain, busnum);
- return NULL;
- }
-
- sd = &info->sd;
- sd->domain = domain;
- sd->node = node;
- sd->companion = device;
-
bus = pci_find_bus(domain, busnum);
if (bus) {
/*
* If the desired bus has been scanned already, replace
* its bus->sysdata.
*/
- memcpy(bus->sysdata, sd, sizeof(*sd));
- kfree(info);
- } else {
- /* insert busn res at first */
- pci_add_resource(&resources, &root->secondary);
+ struct pci_sysdata sd = {
+ .domain = domain,
+ .node = node,
+ .companion = root->device
+ };
- /*
- * _CRS with no apertures is normal, so only fall back to
- * defaults or native bridge info if we're ignoring _CRS.
- */
- probe_pci_root_info(info, device, busnum, domain, &crs_res);
- if (pci_use_crs) {
- add_resources(info, &resources, &crs_res);
- } else {
- resource_list_for_each_entry(res_entry, &crs_res)
- dev_printk(KERN_DEBUG, &device->dev,
- "host bridge window %pR (ignored)\n",
- res_entry->res);
- resource_list_free(&crs_res);
- x86_pci_root_bus_resources(busnum, &resources);
- }
-
- if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
- (u8)root->secondary.end, root->mcfg_addr))
- bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
- sd, &resources);
-
- if (bus) {
- pci_scan_child_bus(bus);
- pci_set_host_bridge_release(
- to_pci_host_bridge(bus->bridge),
- release_pci_root_info, info);
- } else {
- resource_list_free(&resources);
- teardown_mcfg_map(info);
- kfree(info);
+ memcpy(bus->sysdata, &sd, sizeof(sd));
+ } else {
+ struct pci_root_info *info;
+
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
+ if (!info)
+ dev_err(&root->device->dev,
+ "pci_bus %04x:%02x: ignored (out of memory)\n",
+ domain, busnum);
+ else {
+ info->sd.domain = domain;
+ info->sd.node = node;
+ info->sd.companion = root->device;
+ bus = acpi_pci_root_create(root, &acpi_pci_root_ops,
+ &info->common, &info->sd);
}
}
@@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
pcie_bus_configure_settings(child);
}
- if (bus && node != NUMA_NO_NODE)
- dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node);
-
return bus;
}
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 7bcf06a7cd12..6eb3c8af96e2 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
if (!found)
pci_add_resource(resources, &info->busn);
- list_for_each_entry(root_res, &info->resources, list) {
- struct resource *res;
- struct resource *root;
+ list_for_each_entry(root_res, &info->resources, list)
+ pci_add_resource(resources, &root_res->res);
- res = &root_res->res;
- pci_add_resource(resources, res);
- if (res->flags & IORESOURCE_IO)
- root = &ioport_resource;
- else
- root = &iomem_resource;
- insert_resource(root, res);
- }
return;
default_resources:
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dc78a4a9a466..2879efc73a96 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -641,6 +641,43 @@ unsigned int pcibios_assign_all_busses(void)
return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
}
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+static LIST_HEAD(dma_domain_list);
+static DEFINE_SPINLOCK(dma_domain_list_lock);
+
+void add_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_add(&domain->node, &dma_domain_list);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(add_dma_domain);
+
+void del_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_del(&domain->node);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(del_dma_domain);
+
+static void set_dma_domain_ops(struct pci_dev *pdev)
+{
+ struct dma_domain *domain;
+
+ spin_lock(&dma_domain_list_lock);
+ list_for_each_entry(domain, &dma_domain_list, node) {
+ if (pci_domain_nr(pdev->bus) == domain->domain_nr) {
+ pdev->dev.archdata.dma_ops = domain->dma_ops;
+ break;
+ }
+ }
+ spin_unlock(&dma_domain_list_lock);
+}
+#else
+static void set_dma_domain_ops(struct pci_dev *pdev) {}
+#endif
+
int pcibios_add_device(struct pci_dev *dev)
{
struct setup_data *data;
@@ -670,11 +707,20 @@ int pcibios_add_device(struct pci_dev *dev)
pa_data = data->next;
iounmap(data);
}
+ set_dma_domain_ops(dev);
return 0;
}
int pcibios_alloc_irq(struct pci_dev *dev)
{
+ /*
+ * If the PCI device was already claimed by core code and has
+ * MSI enabled, probing of the pcibios IRQ will overwrite
+ * dev->irq. So bail out if MSI is already enabled.
+ */
+ if (pci_dev_msi_enabled(dev))
+ return -EBUSY;
+
return pcibios_enable_irq(dev);
}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 5b662c0faf8c..ea6f3802c17b 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn)
}
EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
-int __init pci_subsys_init(void)
+static int __init pci_subsys_init(void)
{
/*
* The init function returns an non zero value when
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 9b83b9051ae7..9770e55e768f 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -180,6 +180,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
unsigned long result = 0;
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0, mask = 0;
WARN_ON(seg);
if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
@@ -189,53 +190,35 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
switch (len) {
case 1:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_BYTE),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
- /*
- * Zero-extend the result beyond 8 bits, do not trust the
- * BIOS having done it:
- */
- *value &= 0xff;
+ number = PCIBIOS_READ_CONFIG_BYTE;
+ mask = 0xff;
break;
case 2:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_WORD),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
- /*
- * Zero-extend the result beyond 16 bits, do not trust the
- * BIOS having done it:
- */
- *value &= 0xffff;
+ number = PCIBIOS_READ_CONFIG_WORD;
+ mask = 0xffff;
break;
case 4:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_DWORD),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_READ_CONFIG_DWORD;
break;
}
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=c" (*value),
+ "=a" (result)
+ : "1" (number),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+ /*
+ * Zero-extend the result beyond 8 or 16 bits, do not trust the
+ * BIOS having done it:
+ */
+ if (mask)
+ *value &= mask;
+
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
return (int)((result & 0xff00) >> 8);
@@ -247,6 +230,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
unsigned long result = 0;
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0;
WARN_ON(seg);
if ((bus > 255) || (devfn > 255) || (reg > 255))
@@ -256,43 +240,27 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
switch (len) {
case 1:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_BYTE),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_BYTE;
break;
case 2:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_WORD),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_WORD;
break;
case 4:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_DWORD),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_DWORD;
break;
}
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (result)
+ : "0" (number),
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
return (int)((result & 0xff00) >> 8);
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
new file mode 100644
index 000000000000..d57e48016f15
--- /dev/null
+++ b/arch/x86/pci/vmd.c
@@ -0,0 +1,723 @@
+/*
+ * Volume Management Device driver
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+#include <asm/irqdomain.h>
+#include <asm/device.h>
+#include <asm/msi.h>
+#include <asm/msidef.h>
+
+#define VMD_CFGBAR 0
+#define VMD_MEMBAR1 2
+#define VMD_MEMBAR2 4
+
+/*
+ * Lock for manipulating VMD IRQ lists.
+ */
+static DEFINE_RAW_SPINLOCK(list_lock);
+
+/**
+ * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
+ * @node: list item for parent traversal.
+ * @rcu: RCU callback item for freeing.
+ * @irq: back pointer to parent.
+ * @virq: the virtual IRQ value provided to the requesting driver.
+ *
+ * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
+ * a VMD IRQ using this structure.
+ */
+struct vmd_irq {
+ struct list_head node;
+ struct rcu_head rcu;
+ struct vmd_irq_list *irq;
+ unsigned int virq;
+};
+
+/**
+ * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
+ * @irq_list: the list of irq's the VMD one demuxes to.
+ * @vmd_vector: the h/w IRQ assigned to the VMD.
+ * @index: index into the VMD MSI-X table; used for message routing.
+ * @count: number of child IRQs assigned to this vector; used to track
+ * sharing.
+ */
+struct vmd_irq_list {
+ struct list_head irq_list;
+ struct vmd_dev *vmd;
+ unsigned int vmd_vector;
+ unsigned int index;
+ unsigned int count;
+};
+
+struct vmd_dev {
+ struct pci_dev *dev;
+
+ spinlock_t cfg_lock;
+ char __iomem *cfgbar;
+
+ int msix_count;
+ struct msix_entry *msix_entries;
+ struct vmd_irq_list *irqs;
+
+ struct pci_sysdata sysdata;
+ struct resource resources[3];
+ struct irq_domain *irq_domain;
+ struct pci_bus *bus;
+
+#ifdef CONFIG_X86_DEV_DMA_OPS
+ struct dma_map_ops dma_ops;
+ struct dma_domain dma_domain;
+#endif
+};
+
+static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
+{
+ return container_of(bus->sysdata, struct vmd_dev, sysdata);
+}
+
+/*
+ * Drivers managing a device in a VMD domain allocate their own IRQs as before,
+ * but the MSI entry for the hardware it's driving will be programmed with a
+ * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its
+ * domain into one of its own, and the VMD driver de-muxes these for the
+ * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations
+ * and irq_chip to set this up.
+ */
+static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+ struct vmd_irq_list *irq = vmdirq->irq;
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index);
+ msg->data = 0;
+}
+
+/*
+ * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
+ */
+static void vmd_irq_enable(struct irq_data *data)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+
+ raw_spin_lock(&list_lock);
+ list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
+ raw_spin_unlock(&list_lock);
+
+ data->chip->irq_unmask(data);
+}
+
+static void vmd_irq_disable(struct irq_data *data)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+
+ data->chip->irq_mask(data);
+
+ raw_spin_lock(&list_lock);
+ list_del_rcu(&vmdirq->node);
+ raw_spin_unlock(&list_lock);
+}
+
+/*
+ * XXX: Stubbed until we develop acceptable way to not create conflicts with
+ * other devices sharing the same vector.
+ */
+static int vmd_irq_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ return -EINVAL;
+}
+
+static struct irq_chip vmd_msi_controller = {
+ .name = "VMD-MSI",
+ .irq_enable = vmd_irq_enable,
+ .irq_disable = vmd_irq_disable,
+ .irq_compose_msi_msg = vmd_compose_msi_msg,
+ .irq_set_affinity = vmd_irq_set_affinity,
+};
+
+static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return 0;
+}
+
+/*
+ * XXX: We can be even smarter selecting the best IRQ once we solve the
+ * affinity problem.
+ */
+static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd)
+{
+ int i, best = 0;
+
+ raw_spin_lock(&list_lock);
+ for (i = 1; i < vmd->msix_count; i++)
+ if (vmd->irqs[i].count < vmd->irqs[best].count)
+ best = i;
+ vmd->irqs[best].count++;
+ raw_spin_unlock(&list_lock);
+
+ return &vmd->irqs[best];
+}
+
+static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus);
+ struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
+
+ if (!vmdirq)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&vmdirq->node);
+ vmdirq->irq = vmd_next_irq(vmd);
+ vmdirq->virq = virq;
+
+ irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
+ vmdirq, handle_simple_irq, vmd, NULL);
+ return 0;
+}
+
+static void vmd_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ struct vmd_irq *vmdirq = irq_get_chip_data(virq);
+
+ /* XXX: Potential optimization to rebalance */
+ raw_spin_lock(&list_lock);
+ vmdirq->irq->count--;
+ raw_spin_unlock(&list_lock);
+
+ kfree_rcu(vmdirq, rcu);
+}
+
+static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
+
+ if (nvec > vmd->msix_count)
+ return vmd->msix_count;
+
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+
+static struct msi_domain_ops vmd_msi_domain_ops = {
+ .get_hwirq = vmd_get_hwirq,
+ .msi_init = vmd_msi_init,
+ .msi_free = vmd_msi_free,
+ .msi_prepare = vmd_msi_prepare,
+ .set_desc = vmd_set_desc,
+};
+
+static struct msi_domain_info vmd_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &vmd_msi_domain_ops,
+ .chip = &vmd_msi_controller,
+};
+
+#ifdef CONFIG_X86_DEV_DMA_OPS
+/*
+ * VMD replaces the requester ID with its own. DMA mappings for devices in a
+ * VMD domain need to be mapped for the VMD, not the device requiring
+ * the mapping.
+ */
+static struct device *to_vmd_dev(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
+
+ return &vmd->dev->dev;
+}
+
+static struct dma_map_ops *vmd_dma_ops(struct device *dev)
+{
+ return to_vmd_dev(dev)->archdata.dma_ops;
+}
+
+static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
+ gfp_t flag, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag,
+ attrs);
+}
+
+static void vmd_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t addr, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr,
+ attrs);
+}
+
+static int vmd_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t addr, size_t size,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr,
+ size, attrs);
+}
+
+static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t addr, size_t size,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr,
+ addr, size, attrs);
+}
+
+static dma_addr_t vmd_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size,
+ dir, attrs);
+}
+
+static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs);
+}
+
+static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
+}
+
+static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
+}
+
+static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir);
+}
+
+static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size,
+ dir);
+}
+
+static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir);
+}
+
+static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir);
+}
+
+static int vmd_mapping_error(struct device *dev, dma_addr_t addr)
+{
+ return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr);
+}
+
+static int vmd_dma_supported(struct device *dev, u64 mask)
+{
+ return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask);
+}
+
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+static u64 vmd_get_required_mask(struct device *dev)
+{
+ return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev));
+}
+#endif
+
+static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
+{
+ struct dma_domain *domain = &vmd->dma_domain;
+
+ if (vmd->dev->dev.archdata.dma_ops)
+ del_dma_domain(domain);
+}
+
+#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \
+ do { \
+ if (source->fn) \
+ dest->fn = vmd_##fn; \
+ } while (0)
+
+static void vmd_setup_dma_ops(struct vmd_dev *vmd)
+{
+ const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops;
+ struct dma_map_ops *dest = &vmd->dma_ops;
+ struct dma_domain *domain = &vmd->dma_domain;
+
+ domain->domain_nr = vmd->sysdata.domain;
+ domain->dma_ops = dest;
+
+ if (!source)
+ return;
+ ASSIGN_VMD_DMA_OPS(source, dest, alloc);
+ ASSIGN_VMD_DMA_OPS(source, dest, free);
+ ASSIGN_VMD_DMA_OPS(source, dest, mmap);
+ ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable);
+ ASSIGN_VMD_DMA_OPS(source, dest, map_page);
+ ASSIGN_VMD_DMA_OPS(source, dest, unmap_page);
+ ASSIGN_VMD_DMA_OPS(source, dest, map_sg);
+ ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device);
+ ASSIGN_VMD_DMA_OPS(source, dest, mapping_error);
+ ASSIGN_VMD_DMA_OPS(source, dest, dma_supported);
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+ ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask);
+#endif
+ add_dma_domain(domain);
+}
+#undef ASSIGN_VMD_DMA_OPS
+#else
+static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {}
+static void vmd_setup_dma_ops(struct vmd_dev *vmd) {}
+#endif
+
+static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
+ unsigned int devfn, int reg, int len)
+{
+ char __iomem *addr = vmd->cfgbar +
+ (bus->number << 20) + (devfn << 12) + reg;
+
+ if ((addr - vmd->cfgbar) + len >=
+ resource_size(&vmd->dev->resource[VMD_CFGBAR]))
+ return NULL;
+
+ return addr;
+}
+
+/*
+ * CPU may deadlock if config space is not serialized on some versions of this
+ * hardware, so all config space access is done under a spinlock.
+ */
+static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
+ int len, u32 *value)
+{
+ struct vmd_dev *vmd = vmd_from_bus(bus);
+ char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
+ unsigned long flags;
+ int ret = 0;
+
+ if (!addr)
+ return -EFAULT;
+
+ spin_lock_irqsave(&vmd->cfg_lock, flags);
+ switch (len) {
+ case 1:
+ *value = readb(addr);
+ break;
+ case 2:
+ *value = readw(addr);
+ break;
+ case 4:
+ *value = readl(addr);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ return ret;
+}
+
+/*
+ * VMD h/w converts non-posted config writes to posted memory writes. The
+ * read-back in this function forces the completion so it returns only after
+ * the config space was written, as expected.
+ */
+static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
+ int len, u32 value)
+{
+ struct vmd_dev *vmd = vmd_from_bus(bus);
+ char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
+ unsigned long flags;
+ int ret = 0;
+
+ if (!addr)
+ return -EFAULT;
+
+ spin_lock_irqsave(&vmd->cfg_lock, flags);
+ switch (len) {
+ case 1:
+ writeb(value, addr);
+ readb(addr);
+ break;
+ case 2:
+ writew(value, addr);
+ readw(addr);
+ break;
+ case 4:
+ writel(value, addr);
+ readl(addr);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ return ret;
+}
+
+static struct pci_ops vmd_ops = {
+ .read = vmd_pci_read,
+ .write = vmd_pci_write,
+};
+
+/*
+ * VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
+ */
+static int vmd_find_free_domain(void)
+{
+ int domain = 0xffff;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus)) != NULL)
+ domain = max_t(int, domain, pci_domain_nr(bus));
+ return domain + 1;
+}
+
+static int vmd_enable_domain(struct vmd_dev *vmd)
+{
+ struct pci_sysdata *sd = &vmd->sysdata;
+ struct resource *res;
+ u32 upper_bits;
+ unsigned long flags;
+ LIST_HEAD(resources);
+
+ res = &vmd->dev->resource[VMD_CFGBAR];
+ vmd->resources[0] = (struct resource) {
+ .name = "VMD CFGBAR",
+ .start = res->start,
+ .end = (resource_size(res) >> 20) - 1,
+ .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
+ };
+
+ res = &vmd->dev->resource[VMD_MEMBAR1];
+ upper_bits = upper_32_bits(res->end);
+ flags = res->flags & ~IORESOURCE_SIZEALIGN;
+ if (!upper_bits)
+ flags &= ~IORESOURCE_MEM_64;
+ vmd->resources[1] = (struct resource) {
+ .name = "VMD MEMBAR1",
+ .start = res->start,
+ .end = res->end,
+ .flags = flags,
+ };
+
+ res = &vmd->dev->resource[VMD_MEMBAR2];
+ upper_bits = upper_32_bits(res->end);
+ flags = res->flags & ~IORESOURCE_SIZEALIGN;
+ if (!upper_bits)
+ flags &= ~IORESOURCE_MEM_64;
+ vmd->resources[2] = (struct resource) {
+ .name = "VMD MEMBAR2",
+ .start = res->start + 0x2000,
+ .end = res->end,
+ .flags = flags,
+ };
+
+ sd->domain = vmd_find_free_domain();
+ if (sd->domain < 0)
+ return sd->domain;
+
+ sd->node = pcibus_to_node(vmd->dev->bus);
+
+ vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
+ NULL);
+ if (!vmd->irq_domain)
+ return -ENODEV;
+
+ pci_add_resource(&resources, &vmd->resources[0]);
+ pci_add_resource(&resources, &vmd->resources[1]);
+ pci_add_resource(&resources, &vmd->resources[2]);
+ vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd,
+ &resources);
+ if (!vmd->bus) {
+ pci_free_resource_list(&resources);
+ irq_domain_remove(vmd->irq_domain);
+ return -ENODEV;
+ }
+
+ vmd_setup_dma_ops(vmd);
+ dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
+ pci_rescan_bus(vmd->bus);
+
+ WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
+ "domain"), "Can't create symlink to domain\n");
+ return 0;
+}
+
+static irqreturn_t vmd_irq(int irq, void *data)
+{
+ struct vmd_irq_list *irqs = data;
+ struct vmd_irq *vmdirq;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
+ generic_handle_irq(vmdirq->virq);
+ rcu_read_unlock();
+
+ return IRQ_HANDLED;
+}
+
+static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ struct vmd_dev *vmd;
+ int i, err;
+
+ if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
+ return -ENOMEM;
+
+ vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
+ if (!vmd)
+ return -ENOMEM;
+
+ vmd->dev = dev;
+ err = pcim_enable_device(dev);
+ if (err < 0)
+ return err;
+
+ vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
+ if (!vmd->cfgbar)
+ return -ENOMEM;
+
+ pci_set_master(dev);
+ if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)))
+ return -ENODEV;
+
+ vmd->msix_count = pci_msix_vec_count(dev);
+ if (vmd->msix_count < 0)
+ return -ENODEV;
+
+ vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
+ GFP_KERNEL);
+ if (!vmd->irqs)
+ return -ENOMEM;
+
+ vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count,
+ sizeof(*vmd->msix_entries),
+ GFP_KERNEL);
+ if (!vmd->msix_entries)
+ return -ENOMEM;
+ for (i = 0; i < vmd->msix_count; i++)
+ vmd->msix_entries[i].entry = i;
+
+ vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1,
+ vmd->msix_count);
+ if (vmd->msix_count < 0)
+ return vmd->msix_count;
+
+ for (i = 0; i < vmd->msix_count; i++) {
+ INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
+ vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector;
+ vmd->irqs[i].index = i;
+
+ err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector,
+ vmd_irq, 0, "vmd", &vmd->irqs[i]);
+ if (err)
+ return err;
+ }
+
+ spin_lock_init(&vmd->cfg_lock);
+ pci_set_drvdata(dev, vmd);
+ err = vmd_enable_domain(vmd);
+ if (err)
+ return err;
+
+ dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
+ vmd->sysdata.domain);
+ return 0;
+}
+
+static void vmd_remove(struct pci_dev *dev)
+{
+ struct vmd_dev *vmd = pci_get_drvdata(dev);
+
+ pci_set_drvdata(dev, NULL);
+ sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
+ pci_stop_root_bus(vmd->bus);
+ pci_remove_root_bus(vmd->bus);
+ vmd_teardown_dma_ops(vmd);
+ irq_domain_remove(vmd->irq_domain);
+}
+
+#ifdef CONFIG_PM
+static int vmd_suspend(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ pci_save_state(pdev);
+ return 0;
+}
+
+static int vmd_resume(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ pci_restore_state(pdev);
+ return 0;
+}
+#endif
+static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);
+
+static const struct pci_device_id vmd_ids[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),},
+ {0,}
+};
+MODULE_DEVICE_TABLE(pci, vmd_ids);
+
+static struct pci_driver vmd_drv = {
+ .name = "vmd",
+ .id_table = vmd_ids,
+ .probe = vmd_probe,
+ .remove = vmd_remove,
+ .driver = {
+ .pm = &vmd_dev_pm_ops,
+ },
+};
+module_pci_driver(vmd_drv);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION("0.6");
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 5ca8ead91579..81c769e80614 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -25,8 +25,6 @@
#include <asm/cpu_device_id.h>
#include <asm/iosf_mbi.h>
-/* Side band Interface port */
-#define PUNIT_PORT 0x04
/* Power gate status reg */
#define PWRGT_STATUS 0x61
/* Subsystem config/status Video processor */
@@ -85,9 +83,8 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
while (punit_devp->name) {
- status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
- punit_devp->reg,
- &punit_pwr_status);
+ status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ,
+ punit_devp->reg, &punit_pwr_status);
if (status) {
seq_printf(seq_file, "%9s : Read Failed\n",
punit_devp->name);
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d7f997f7c26d..ea48449b2e63 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -50,11 +50,16 @@ void __init efi_bgrt_init(void)
bgrt_tab->version);
return;
}
- if (bgrt_tab->status != 1) {
- pr_err("Ignoring BGRT: invalid status %u (expected 1)\n",
+ if (bgrt_tab->status & 0xfe) {
+ pr_err("Ignoring BGRT: reserved status bits are non-zero %u\n",
bgrt_tab->status);
return;
}
+ if (bgrt_tab->status != 1) {
+ pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n",
+ bgrt_tab->status);
+ return;
+ }
if (bgrt_tab->image_type != 0) {
pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n",
bgrt_tab->image_type);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 6a28ded74211..ad285404ea7f 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -194,7 +194,7 @@ static void __init do_add_efi_memmap(void)
int __init efi_memblock_x86_reserve_range(void)
{
struct efi_info *e = &boot_params.efi_info;
- unsigned long pmap;
+ phys_addr_t pmap;
if (efi_enabled(EFI_PARAVIRT))
return 0;
@@ -209,7 +209,7 @@ int __init efi_memblock_x86_reserve_range(void)
#else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif
- memmap.phys_map = (void *)pmap;
+ memmap.phys_map = pmap;
memmap.nr_map = e->efi_memmap_size /
e->efi_memdesc_size;
memmap.desc_size = e->efi_memdesc_size;
@@ -222,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void)
return 0;
}
-static void __init print_efi_memmap(void)
+void __init efi_print_memmap(void)
{
#ifdef EFI_DEBUG
efi_memory_desc_t *md;
@@ -524,7 +524,7 @@ void __init efi_init(void)
return;
if (efi_enabled(EFI_DBG))
- print_efi_memmap();
+ efi_print_memmap();
efi_esrt_init();
}
@@ -1017,24 +1017,6 @@ u32 efi_mem_type(unsigned long phys_addr)
return 0;
}
-u64 efi_mem_attributes(unsigned long phys_addr)
-{
- efi_memory_desc_t *md;
- void *p;
-
- if (!efi_enabled(EFI_MEMMAP))
- return 0;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if ((md->phys_addr <= phys_addr) &&
- (phys_addr < (md->phys_addr +
- (md->num_pages << EFI_PAGE_SHIFT))))
- return md->attribute;
- }
- return 0;
-}
-
static int __init arch_parse_efi_cmdline(char *str)
{
if (!str) {
@@ -1044,8 +1026,6 @@ static int __init arch_parse_efi_cmdline(char *str)
if (parse_option_str(str, "old_map"))
set_bit(EFI_OLD_MEMMAP, &efi.flags);
- if (parse_option_str(str, "debug"))
- set_bit(EFI_DBG, &efi.flags);
return 0;
}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 1c7380da65ff..2d66db8f80f9 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -8,6 +8,7 @@
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/acpi.h>
+#include <linux/dmi.h>
#include <asm/efi.h>
#include <asm/uv/uv.h>
@@ -248,6 +249,16 @@ out:
return ret;
}
+static const struct dmi_system_id sgi_uv1_dmi[] = {
+ { NULL, "SGI UV1",
+ { DMI_MATCH(DMI_PRODUCT_NAME, "Stoutland Platform"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "1.0"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "SGI.COM"),
+ }
+ },
+ { } /* NULL entry stops DMI scanning */
+};
+
void __init efi_apply_memmap_quirks(void)
{
/*
@@ -260,10 +271,8 @@ void __init efi_apply_memmap_quirks(void)
efi_unmap_memmap();
}
- /*
- * UV doesn't support the new EFI pagetable mapping yet.
- */
- if (is_uv_system())
+ /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
+ if (dmi_check_system(sgi_uv1_dmi))
set_bit(EFI_OLD_MEMMAP, &efi.flags);
}
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 01d54ea766c1..90bb997ed0a2 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -61,7 +61,7 @@
enum intel_mid_timer_options intel_mid_timer_options;
/* intel_mid_ops to store sub arch ops */
-struct intel_mid_ops *intel_mid_ops;
+static struct intel_mid_ops *intel_mid_ops;
/* getter function for sub arch ops*/
static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT;
enum intel_mid_cpu_type __intel_mid_cpu_chip;
@@ -138,7 +138,7 @@ static void intel_mid_arch_setup(void)
intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
else {
intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
- pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");
+ pr_info("ARCH: Unknown SoC, assuming Penwell!\n");
}
out:
@@ -214,12 +214,10 @@ static inline int __init setup_x86_intel_mid_timer(char *arg)
else if (strcmp("lapic_and_apbt", arg) == 0)
intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT;
else {
- pr_warn("X86 INTEL_MID timer option %s not recognised"
- " use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
- arg);
+ pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
+ arg);
return -EINVAL;
}
return 0;
}
__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer);
-
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index ce992e8cc065..5ee360a951ce 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -197,10 +197,9 @@ static int __init sfi_parse_gpio(struct sfi_table_header *table)
num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
pentry = (struct sfi_gpio_table_entry *)sb->pentry;
- gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL);
+ gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
if (!gpio_table)
return -1;
- memcpy(gpio_table, pentry, num * sizeof(*pentry));
gpio_num_entry = num;
pr_debug("GPIO pin info:\n");
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 0ee619f9fcb7..c61b6c332e97 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -111,23 +111,19 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
int ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_lo);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_hi);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->rmask);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask);
if (ret)
return ret;
- return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->wmask);
+ return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask);
}
/**
@@ -151,31 +147,27 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
local_irq_save(flags);
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++,
- imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->addr_hi);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->rmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->wmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask);
if (ret)
goto failed;
/* Lock bit must be set separately to addr_lo address bits. */
if (lock) {
imr->addr_lo |= IMR_LOCK;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg - IMR_NUM_REGS, imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
+ reg - IMR_NUM_REGS, imr->addr_lo);
if (ret)
goto failed;
}
@@ -228,11 +220,12 @@ static int imr_dbgfs_state_show(struct seq_file *s, void *unused)
if (imr_is_enabled(&imr)) {
base = imr_to_phys(imr.addr_lo);
end = imr_to_phys(imr.addr_hi) + IMR_MASK;
+ size = end - base + 1;
} else {
base = 0;
end = 0;
+ size = 0;
}
- size = end - base;
seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx "
"rmask=0x%08x, wmask=0x%08x, %s, %s\n", i,
&base, &end, size, imr.rmask, imr.wmask,
@@ -587,6 +580,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
{
phys_addr_t base = virt_to_phys(&_text);
size_t size = virt_to_phys(&__end_rodata) - base;
+ unsigned long start, end;
int i;
int ret;
@@ -594,18 +588,24 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
for (i = 0; i < idev->max_imr; i++)
imr_clear(i);
+ start = (unsigned long)_text;
+ end = (unsigned long)__end_rodata - 1;
+
/*
* Setup a locked IMR around the physical extent of the kernel
* from the beginning of the .text secton to the end of the
* .rodata section as one physically contiguous block.
+ *
+ * We don't round up @size since it is already PAGE_SIZE aligned.
+ * See vmlinux.lds.S for details.
*/
ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true);
if (ret < 0) {
- pr_err("unable to setup IMR for kernel: (%p - %p)\n",
- &_text, &__end_rodata);
+ pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n",
+ size / 1024, start, end);
} else {
- pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n",
- size / 1024, &_text, &__end_rodata);
+ pr_info("protecting kernel .text - .rodata: %zu KiB (%lx - %lx)\n",
+ size / 1024, start, end);
}
}
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 5c9f63fa6abf..8dd80050d705 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -28,6 +28,7 @@
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/clocksource.h>
#include <asm/apic.h>
#include <asm/current.h>
@@ -376,38 +377,42 @@ static void uv_nmi_wait(int master)
atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus());
}
+/* Dump Instruction Pointer header */
static void uv_nmi_dump_cpu_ip_hdr(void)
{
- printk(KERN_DEFAULT
- "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n",
+ pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n",
"CPU", "PID", "COMMAND", "IP");
}
+/* Dump Instruction Pointer info */
static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs)
{
- printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ",
- cpu, current->pid, current->comm);
-
+ pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm);
printk_address(regs->ip);
}
-/* Dump this cpu's state */
+/*
+ * Dump this CPU's state. If action was set to "kdump" and the crash_kexec
+ * failed, then we provide "dump" as an alternate action. Action "dump" now
+ * also includes the show "ips" (instruction pointers) action whereas the
+ * action "ips" only displays instruction pointers for the non-idle CPU's.
+ * This is an abbreviated form of the "ps" command.
+ */
static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
{
const char *dots = " ................................. ";
- if (uv_nmi_action_is("ips")) {
- if (cpu == 0)
- uv_nmi_dump_cpu_ip_hdr();
+ if (cpu == 0)
+ uv_nmi_dump_cpu_ip_hdr();
- if (current->pid != 0)
- uv_nmi_dump_cpu_ip(cpu, regs);
+ if (current->pid != 0 || !uv_nmi_action_is("ips"))
+ uv_nmi_dump_cpu_ip(cpu, regs);
- } else if (uv_nmi_action_is("dump")) {
- printk(KERN_DEFAULT
- "UV:%sNMI process trace for CPU %d\n", dots, cpu);
+ if (uv_nmi_action_is("dump")) {
+ pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu);
show_regs(regs);
}
+
this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
}
@@ -469,8 +474,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
uv_nmi_trigger_dump(tcpu);
}
if (ignored)
- printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n",
- ignored);
+ pr_alert("UV: %d CPUs ignored NMI\n", ignored);
console_loglevel = saved_console_loglevel;
pr_alert("UV: process trace complete\n");
@@ -492,8 +496,9 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
+
+#if defined(CONFIG_KEXEC_CORE)
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
/* Call crash to dump system state */
@@ -502,10 +507,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
crash_kexec(regs);
pr_emerg("UV: crash_kexec unexpectedly returned, ");
+ atomic_set(&uv_nmi_kexec_failed, 1);
if (!kexec_crash_image) {
pr_cont("crash kernel not loaded\n");
- atomic_set(&uv_nmi_kexec_failed, 1);
- uv_nmi_sync_exit(1);
return;
}
pr_cont("kexec busy, stalling cpus while waiting\n");
@@ -514,9 +518,6 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
/* If crash exec fails the slaves should return, otherwise stall */
while (atomic_read(&uv_nmi_kexec_failed) == 0)
mdelay(10);
-
- /* Crash kernel most likely not loaded, return in an orderly fashion */
- uv_nmi_sync_exit(0);
}
#else /* !CONFIG_KEXEC_CORE */
@@ -524,6 +525,7 @@ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
+ atomic_set(&uv_nmi_kexec_failed, 1);
}
#endif /* !CONFIG_KEXEC_CORE */
@@ -613,9 +615,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
master = (atomic_read(&uv_nmi_cpu) == cpu);
/* If NMI action is "kdump", then attempt to do it */
- if (uv_nmi_action_is("kdump"))
+ if (uv_nmi_action_is("kdump")) {
uv_nmi_kdump(cpu, master, regs);
+ /* Unexpected return, revert action to "dump" */
+ if (master)
+ strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action));
+ }
+
/* Pause as all cpus enter the NMI handler */
uv_nmi_wait(master);
@@ -640,6 +647,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
atomic_set(&uv_nmi_cpus_in_nmi, -1);
atomic_set(&uv_nmi_cpu, -1);
atomic_set(&uv_in_nmi, 0);
+ atomic_set(&uv_nmi_kexec_failed, 0);
}
uv_nmi_touch_watchdogs();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9ab52791fed5..d5f64996394a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
#include <asm/debugreg.h>
#include <asm/cpu.h>
#include <asm/mmu_context.h>
+#include <linux/dmi.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -32,6 +33,29 @@ __visible unsigned long saved_context_eflags;
#endif
struct saved_context saved_context;
+static void msr_save_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ msr++;
+ }
+}
+
+static void msr_restore_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ if (msr->valid)
+ wrmsrl(msr->info.msr_no, msr->info.reg.q);
+ msr++;
+ }
+}
+
/**
* __save_processor_state - save CPU registers before creating a
* hibernation image and before restoring the memory state from it
@@ -111,6 +135,7 @@ static void __save_processor_state(struct saved_context *ctxt)
#endif
ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
&ctxt->misc_enable);
+ msr_save_context(ctxt);
}
/* Needed by apm.c */
@@ -229,6 +254,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
+ msr_restore_context(ctxt);
}
/* Needed by apm.c */
@@ -320,3 +346,69 @@ static int __init bsp_pm_check_init(void)
}
core_initcall(bsp_pm_check_init);
+
+static int msr_init_context(const u32 *msr_id, const int total_num)
+{
+ int i = 0;
+ struct saved_msr *msr_array;
+
+ if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) {
+ pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n");
+ return -EINVAL;
+ }
+
+ msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL);
+ if (!msr_array) {
+ pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < total_num; i++) {
+ msr_array[i].info.msr_no = msr_id[i];
+ msr_array[i].valid = false;
+ msr_array[i].info.reg.q = 0;
+ }
+ saved_context.saved_msrs.num = total_num;
+ saved_context.saved_msrs.array = msr_array;
+
+ return 0;
+}
+
+/*
+ * The following section is a quirk framework for problematic BIOSen:
+ * Sometimes MSRs are modified by the BIOSen after suspended to
+ * RAM, this might cause unexpected behavior after wakeup.
+ * Thus we save/restore these specified MSRs across suspend/resume
+ * in order to work around it.
+ *
+ * For any further problematic BIOSen/platforms,
+ * please add your own function similar to msr_initialize_bdw.
+ */
+static int msr_initialize_bdw(const struct dmi_system_id *d)
+{
+ /* Add any extra MSR ids into this array. */
+ u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL };
+
+ pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident);
+ return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+}
+
+static struct dmi_system_id msr_save_dmi_table[] = {
+ {
+ .callback = msr_initialize_bdw,
+ .ident = "BROADWELL BDX_EP",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"),
+ },
+ },
+ {}
+};
+
+static int pm_check_save_msr(void)
+{
+ dmi_check_system(msr_save_dmi_table);
+ return 0;
+}
+
+device_initcall(pm_check_save_msr);
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 10fea5fc821e..df280da34825 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -1,11 +1,9 @@
config AMD_MCE_INJ
tristate "Simple MCE injection interface for AMD processors"
- depends on RAS && EDAC_DECODE_MCE && DEBUG_FS
+ depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB
default n
help
This is a simple debugfs interface to inject MCEs and test different
aspects of the MCE handling code.
WARNING: Do not even assume this interface is staying stable!
-
-
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 17e35b5bf779..55d38cfa46c2 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -17,7 +17,11 @@
#include <linux/cpu.h>
#include <linux/string.h>
#include <linux/uaccess.h>
+#include <linux/pci.h>
+
#include <asm/mce.h>
+#include <asm/amd_nb.h>
+#include <asm/irq_vectors.h>
#include "../kernel/cpu/mcheck/mce-internal.h"
@@ -30,16 +34,21 @@ static struct dentry *dfs_inj;
static u8 n_banks;
#define MAX_FLAG_OPT_SIZE 3
+#define NBCFG 0x44
enum injection_type {
SW_INJ = 0, /* SW injection, simply decode the error */
HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
N_INJ_TYPES,
};
static const char * const flags_options[] = {
[SW_INJ] = "sw",
[HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
NULL
};
@@ -129,12 +138,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
{
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- size_t ret;
if (cnt > MAX_FLAG_OPT_SIZE)
- cnt = MAX_FLAG_OPT_SIZE;
-
- ret = cnt;
+ return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
@@ -150,9 +156,9 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
return err;
}
- *ppos += ret;
+ *ppos += cnt;
- return ret;
+ return cnt;
}
static const struct file_operations flags_fops = {
@@ -185,6 +191,55 @@ static void trigger_mce(void *info)
asm volatile("int $18");
}
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 cores_per_node;
+
+ cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket();
+
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct pci_dev *F3 = node_to_amd_nb(nid)->misc;
+ u32 val;
+ int err;
+
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
static void do_inject(void)
{
u64 mcg_status = 0;
@@ -205,6 +260,26 @@ static void do_inject(void)
if (!(i_mce.status & MCI_STATUS_PCC))
mcg_status |= MCG_STATUS_RIPV;
+ /*
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+ */
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ }
+
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) {
+ toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+ cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ }
+
get_online_cpus();
if (!cpu_online(cpu))
goto err;
@@ -225,7 +300,16 @@ static void do_inject(void)
toggle_hw_mce_inject(cpu, false);
- smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
err:
put_online_cpus();
@@ -290,6 +374,11 @@ static const char readme_msg[] =
"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
"\n";
static ssize_t
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 2730d775ef9a..3e75fcf6b836 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
-I$(srctree)/arch/x86/boot
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE := n
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index a8fecc226946..3ee2bb6b440b 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \
ifeq ($(CONFIG_X86_32),y)
obj-y += checksum_32.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 755481f14d90..174781a404ff 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,13 +36,6 @@
#endif /* CONFIG_X86_PPRO_FENCE */
#define dma_wmb() barrier()
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
-
-#define read_barrier_depends() do { } while (0)
-#define smp_read_barrier_depends() do { } while (0)
+#include <asm-generic/barrier.h>
#endif
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 9fe77b7b5a0e..11ab90dc5f14 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -1,8 +1,13 @@
#ifndef __UM_ASM_SYSCALL_H
#define __UM_ASM_SYSCALL_H
+#include <asm/syscall-generic.h>
#include <uapi/linux/audit.h>
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
static inline int syscall_get_arch(void)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index a29756f2d940..47c78d5e5c32 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -68,6 +68,7 @@ static const int reg_offsets[] = {
[EFL] = HOST_EFLAGS,
[UESP] = HOST_SP,
[SS] = HOST_SS,
+ [ORIG_EAX] = HOST_ORIG_AX,
};
int putreg(struct task_struct *child, int regno, unsigned long value)
@@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
case EAX:
case EIP:
case UESP:
+ case ORIG_EAX:
break;
case FS:
if (value && (value & 3) != 3)
@@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
value &= FLAG_MASK;
child->thread.regs.regs.gp[HOST_EFLAGS] |= value;
return 0;
- case ORIG_EAX:
- child->thread.regs.regs.syscall = value;
- return 0;
default :
panic("Bad register in putreg() : %d\n", regno);
}
@@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno)
regno >>= 2;
switch (regno) {
- case ORIG_EAX:
- return child->thread.regs.regs.syscall;
case FS:
case GS:
case DS:
@@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno)
case EDI:
case EBP:
case EFL:
+ case ORIG_EAX:
break;
default:
panic("Bad register in getreg() : %d\n", regno);
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 06934a8a4872..14fcd01ed992 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
if (err)
return 1;
- err = convert_fxsr_from_user(&fpx, sc.fpstate);
+ err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate);
if (err)
return 1;
@@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
{
struct user_i387_struct fp;
- err = copy_from_user(&fp, sc.fpstate,
+ err = copy_from_user(&fp, (void *)sc.fpstate,
sizeof(struct user_i387_struct));
if (err)
return 1;
@@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
#endif
#undef PUTREG
sc.oldmask = mask;
- sc.fpstate = to_fp;
+ sc.fpstate = (unsigned long)to_fp;
err = copy_to_user(to, &sc, sizeof(struct sigcontext));
if (err)
@@ -468,12 +468,10 @@ long sys_sigreturn(void)
struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
sigset_t set;
struct sigcontext __user *sc = &frame->sc;
- unsigned long __user *oldmask = &sc->oldmask;
- unsigned long __user *extramask = frame->extramask;
int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
- if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) ||
- copy_from_user(&set.sig[1], extramask, sig_size))
+ if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) ||
+ copy_from_user(&set.sig[1], frame->extramask, sig_size))
goto segfault;
set_current_blocked(&set);
@@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
{
struct rt_sigframe __user *frame;
int err = 0, sig = ksig->sig;
+ unsigned long fp_to;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs));
err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs,
set->sig[0]);
- err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate);
+
+ fp_to = (unsigned long)&frame->fpstate;
+
+ err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate);
if (sizeof(*set) == 16) {
err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
index b972649d3a18..98816804e131 100644
--- a/arch/x86/um/stub_32.S
+++ b/arch/x86/um/stub_32.S
@@ -1,6 +1,5 @@
#include <as-layout.h>
- .globl syscall_stub
.section .__syscall_stub, "ax"
.globl batch_syscall_stub
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
index 7160b20172d0..ba914b3b8cc4 100644
--- a/arch/x86/um/stub_64.S
+++ b/arch/x86/um/stub_64.S
@@ -1,25 +1,9 @@
#include <as-layout.h>
- .globl syscall_stub
.section .__syscall_stub, "ax"
-syscall_stub:
- syscall
- /* We don't have 64-bit constants, so this constructs the address
- * we need.
- */
- movq $(STUB_DATA >> 32), %rbx
- salq $32, %rbx
- movq $(STUB_DATA & 0xffffffff), %rcx
- or %rcx, %rbx
- movq %rax, (%rbx)
- int3
-
.globl batch_syscall_stub
batch_syscall_stub:
- mov $(STUB_DATA >> 32), %rbx
- sal $32, %rbx
- mov $(STUB_DATA & 0xffffffff), %rax
- or %rax, %rbx
+ mov $(STUB_DATA), %rbx
/* load pointer to first operation */
mov %rbx, %rsp
add $0x10, %rsp
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index bd16d6c370ec..439c0994b696 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -7,6 +7,7 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <generated/user_constants.h>
+#include <asm/syscall.h>
#define __NO_STUBS
@@ -24,15 +25,13 @@
#define old_mmap sys_old_mmap
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index a75d8700472a..b74ea6c2c0e7 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -7,6 +7,7 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <generated/user_constants.h>
+#include <asm/syscall.h>
#define __NO_STUBS
@@ -37,15 +38,13 @@
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
-typedef void (*sys_call_ptr_t)(void);
-
-extern void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index acda713ab5be..abf4901c917b 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg)
if (reg != APIC_ID)
return 0;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
return 0;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 993b7a71386d..d09e4c9d7cc5 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/pat.h>
+#include <asm/cpu.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -414,7 +415,7 @@ static bool __init xen_check_mwait(void)
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
- if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ if ((HYPERVISOR_platform_op(&op) == 0) &&
(buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
@@ -1191,7 +1192,7 @@ static const struct pv_info xen_info __initconst = {
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
-
+ .features = 0,
.name = "Xen",
};
@@ -1228,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.iret = xen_iret,
#ifdef CONFIG_X86_64
- .usergs_sysret32 = xen_sysret32,
.usergs_sysret64 = xen_sysret64,
-#else
- .irq_enable_sysexit = xen_sysexit,
#endif
.load_tr_desc = paravirt_nop,
@@ -1264,12 +1262,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initconst = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .startup_ipi_hook = paravirt_nop,
-#endif
-};
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1373,7 +1365,7 @@ static void __init xen_boot_params_init_edd(void)
info->params.length = sizeof(info->params);
set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
&info->params);
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
@@ -1391,7 +1383,7 @@ static void __init xen_boot_params_init_edd(void)
op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
op.u.firmware_info.index = nr;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
@@ -1534,8 +1526,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
+ if (xen_initial_domain())
+ pv_info.features |= PV_SUPPORTED_RTC;
pv_init_ops = xen_init_ops;
- pv_apic_ops = xen_apic_ops;
if (!xen_pvh_domain()) {
pv_cpu_ops = xen_cpu_ops;
@@ -1697,7 +1690,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
- if (HYPERVISOR_dom0_op(&op) == 0)
+ if (HYPERVISOR_platform_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
/* Make sure ACS will be enabled */
@@ -1885,8 +1878,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
- if (xen_pv_domain())
+ if (xen_pv_domain()) {
clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ set_cpu_cap(c, X86_FEATURE_XENPV);
+ }
}
const struct hypervisor_x86 x86_hyper_xen = {
@@ -1899,3 +1894,17 @@ const struct hypervisor_x86 x86_hyper_xen = {
.set_cpu_features = xen_set_cpu_features,
};
EXPORT_SYMBOL(x86_hyper_xen);
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num)
+{
+ arch_register_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_register_cpu);
+
+void xen_arch_unregister_cpu(int num)
+{
+ arch_unregister_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_unregister_cpu);
+#endif
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 1580e7a5a4cf..e079500b17f3 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void)
kfree(pages);
return -ENOMEM;
}
- rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+ rc = alloc_xenballooned_pages(nr_grant_frames, pages);
if (rc) {
pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 9c479fe40459..c913ca4f6958 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2436,7 +2436,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_others = xen_flush_tlb_others,
.pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
@@ -2495,14 +2494,9 @@ void __init xen_init_mmu_ops(void)
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- /* Optimization - we can use the HVM one but it has no idea which
- * VCPUs are descheduled - which means that it will needlessly IPI
- * them. Xen knows so let it do the job.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- }
+
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
@@ -2888,6 +2882,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
addr += range;
if (err_ptr)
err_ptr += batch;
+ cond_resched();
}
out:
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 660b3cfef234..cab9f766bb06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
* the new pages are installed with cmpxchg; if we lose the race then
* simply free the page we allocated and use the one that's there.
*/
-static bool alloc_p2m(unsigned long pfn)
+int xen_alloc_p2m_entry(unsigned long pfn)
{
unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
@@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn)
/* PMD level is missing, allocate a new one */
ptep = alloc_p2m_pmd(addr, pte_pg);
if (!ptep)
- return false;
+ return -ENOMEM;
}
if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
@@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn)
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
- return false;
+ return -ENOMEM;
p2m_mid_mfn_init(mid_mfn, p2m_missing);
@@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn)
p2m = alloc_p2m_page();
if (!p2m)
- return false;
+ return -ENOMEM;
if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
p2m_init(p2m);
@@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn)
HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
}
- return true;
+ return 0;
}
+EXPORT_SYMBOL(xen_alloc_p2m_entry);
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
@@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
+ int ret;
+
+ ret = xen_alloc_p2m_entry(pfn);
+ if (ret < 0)
return false;
return __set_phys_to_machine(pfn, mfn);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1c30e4ab1022..7ab29518a3b9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after this */
- if (e_pfn < *min_pfn)
+ if (e_pfn <= *min_pfn)
continue;
s_pfn = PFN_UP(entry->addr);
@@ -829,6 +829,8 @@ char * __init xen_memory_setup(void)
addr = xen_e820_map[0].addr;
size = xen_e820_map[0].size;
while (i < xen_e820_map_entries) {
+ bool discard = false;
+
chunk_size = size;
type = xen_e820_map[i].type;
@@ -843,10 +845,11 @@ char * __init xen_memory_setup(void)
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else
- type = E820_UNUSABLE;
+ discard = true;
}
- xen_align_and_add_e820_region(addr, chunk_size, type);
+ if (!discard)
+ xen_align_and_add_e820_region(addr, chunk_size, type);
addr += chunk_size;
size -= chunk_size;
@@ -965,17 +968,8 @@ char * __init xen_auto_xlated_memory_setup(void)
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
- /*
- * This could be called before selected_vdso32 is initialized, so
- * just fiddle with both possible images. vdso_image_32_syscall
- * can't be selected, since it only exists on 64-bit systems.
- */
- u32 *mask;
- mask = vdso_image_32_int80.data +
- vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
- *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
- mask = vdso_image_32_sysenter.data +
- vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
+ u32 *mask = vdso_image_32.data +
+ vdso_image_32.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index feddabdab448..7f664c416faf 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,6 +1,7 @@
#include <linux/types.h>
#include <linux/tick.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
@@ -33,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
- xen_hvm_init_shared_info();
+ if (!suspend_cancelled)
+ xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
@@ -68,26 +70,16 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- xen_pmu_finish(cpu);
-
if (xen_pv_domain())
xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- int cpu;
-
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
-
- for_each_online_cpu(cpu)
- xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
@@ -106,10 +98,20 @@ static void xen_vcpu_notify_suspend(void *data)
void xen_arch_resume(void)
{
+ int cpu;
+
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
void xen_arch_suspend(void)
{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index f1ba6a092854..a0a4e554c6f1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
+#include <linux/timekeeper_internal.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
@@ -32,86 +33,12 @@
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
-/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
-
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
-/* return an consistent snapshot of 64-bit time/counter value */
-static u64 get64(const u64 *p)
-{
- u64 ret;
-
- if (BITS_PER_LONG < 64) {
- u32 *p32 = (u32 *)p;
- u32 h, l;
-
- /*
- * Read high then low, and then make sure high is
- * still the same; this will only loop if low wraps
- * and carries into high.
- * XXX some clean way to make this endian-proof?
- */
- do {
- h = p32[1];
- barrier();
- l = p32[0];
- barrier();
- } while (p32[1] != h);
-
- ret = (((u64)h) << 32) | l;
- } else
- ret = *p;
-
- return ret;
-}
-
-/*
- * Runstate accounting
- */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
- u64 state_time;
- struct vcpu_runstate_info *state;
-
- BUG_ON(preemptible());
-
- state = this_cpu_ptr(&xen_runstate);
-
- /*
- * The runstate info is always updated by the hypervisor on
- * the current CPU, so there's no need to use anything
- * stronger than a compiler barrier when fetching it.
- */
- do {
- state_time = get64(&state->state_entry_time);
- barrier();
- *res = *state;
- barrier();
- } while (get64(&state->state_entry_time) != state_time);
-}
-
-/* return true when a vcpu could run but has no real cpu to run on */
-bool xen_vcpu_stolen(int vcpu)
-{
- return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
-}
-
-void xen_setup_runstate_info(int cpu)
-{
- struct vcpu_register_runstate_memory_area area;
-
- area.addr.v = &per_cpu(xen_runstate, cpu);
-
- if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
- cpu, &area))
- BUG();
-}
-
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
@@ -119,7 +46,7 @@ static void do_stolen_accounting(void)
s64 runnable, offline, stolen;
cputime_t ticks;
- get_runstate_snapshot(&state);
+ xen_get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
@@ -194,26 +121,46 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb,
unsigned long was_set, void *priv)
{
/* Protected by the calling core code serialization */
- static struct timespec next_sync;
+ static struct timespec64 next_sync;
struct xen_platform_op op;
- struct timespec now;
+ struct timespec64 now;
+ struct timekeeper *tk = priv;
+ static bool settime64_supported = true;
+ int ret;
- now = __current_kernel_time();
+ now.tv_sec = tk->xtime_sec;
+ now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
/*
* We only take the expensive HV call when the clock was set
* or when the 11 minutes RTC synchronization time elapsed.
*/
- if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ if (!was_set && timespec64_compare(&now, &next_sync) < 0)
return NOTIFY_OK;
- op.cmd = XENPF_settime;
- op.u.settime.secs = now.tv_sec;
- op.u.settime.nsecs = now.tv_nsec;
- op.u.settime.system_time = xen_clocksource_read();
+again:
+ if (settime64_supported) {
+ op.cmd = XENPF_settime64;
+ op.u.settime64.mbz = 0;
+ op.u.settime64.secs = now.tv_sec;
+ op.u.settime64.nsecs = now.tv_nsec;
+ op.u.settime64.system_time = xen_clocksource_read();
+ } else {
+ op.cmd = XENPF_settime32;
+ op.u.settime32.secs = now.tv_sec;
+ op.u.settime32.nsecs = now.tv_nsec;
+ op.u.settime32.system_time = xen_clocksource_read();
+ }
+
+ ret = HYPERVISOR_platform_op(&op);
- (void)HYPERVISOR_dom0_op(&op);
+ if (ret == -ENOSYS && settime64_supported) {
+ settime64_supported = false;
+ goto again;
+ }
+ if (ret < 0)
+ return NOTIFY_BAD;
/*
* Move the next drift compensation time 11 minutes
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index fd92a64d748e..feb6d40a0860 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -35,20 +35,6 @@ check_events:
ret
/*
- * We can't use sysexit directly, because we're not running in ring0.
- * But we can easily fake it up using iret. Assuming xen_sysexit is
- * jumped to with a standard stack frame, we can just strip it back to
- * a standard iret frame and use iret.
- */
-ENTRY(xen_sysexit)
- movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
- orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
- lea PT_EIP(%esp), %esp
-
- jmp xen_iret
-ENDPROC(xen_sysexit)
-
-/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index f22667abf7b9..cc8acc410ddb 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,25 +68,6 @@ ENTRY(xen_sysret64)
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
-ENTRY(xen_sysret32)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER32_DS
- pushq PER_CPU_VAR(rsp_scratch)
- pushq %r11
- pushq $__USER32_CS
- pushq %rcx
-
- pushq $0
-1: jmp hypercall_iret
-ENDPATCH(xen_sysret32)
-RELOC(xen_sysret32, 1b+1)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 1399423f3418..4140b070f2e9 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-#ifdef CONFIG_X86_32
-__visible void xen_sysexit(void);
-#endif
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
__visible void xen_adjust_exception_frame(void);
OpenPOWER on IntegriCloud