From 5bfce5ef55cbe78ee2ee6e97f2e26a8a582008f3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:15 -0700 Subject: x86, kaslr: Provide randomness functions Adds potential sources of randomness: RDRAND, RDTSC, or the i8254. This moves the pre-alternatives inline rdrand function into the header so both pieces of code can use it. Availability of RDRAND is then controlled by CONFIG_ARCH_RANDOM, if someone wants to disable it even for kASLR. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-4-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/archrandom.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 0d9ec770f2f8..e6a92455740e 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -39,6 +39,20 @@ #ifdef CONFIG_ARCH_RANDOM +/* Instead of arch_get_random_long() when alternatives haven't run. */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + #define GET_RANDOM(name, type, rdrand, nop) \ static inline int name(type *v) \ { \ @@ -68,6 +82,13 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); #endif /* CONFIG_X86_64 */ +#else + +static inline int rdrand_long(unsigned long *v) +{ + return 0; +} + #endif /* CONFIG_ARCH_RANDOM */ extern void x86_init_rdrand(struct cpuinfo_x86 *c); -- cgit v1.2.1 From 6145cfe394a7f138f6b64491c5663f97dba12450 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Oct 2013 17:18:18 -0700 Subject: x86, kaslr: Raise the maximum virtual address to -1 GiB on x86_64 On 64-bit, this raises the maximum location to -1 GiB (from -1.5 GiB), the upper limit currently, since the kernel fixmap page mappings need to be moved to use the other 1 GiB (which would be the theoretical limit when building with -mcmodel=kernel). Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/1381450698-28710-7-git-send-email-keescook@chromium.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_64_types.h | 15 ++++++++++++--- arch/x86/include/asm/pgtable_64_types.h | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 43dcd804ebd5..8de6d9cf3b95 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -39,9 +39,18 @@ #define __VIRTUAL_MASK_SHIFT 47 /* - * Kernel image size is limited to 512 MB (see level2_kernel_pgt in - * arch/x86/kernel/head_64.S), and it is mapped here: + * Kernel image size is limited to 1GiB due to the fixmap living in the + * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use + * 512MiB by default, leaving 1.5GiB for modules once the page tables + * are fully set up. If kernel ASLR is configured, it can extend the + * kernel page table mapping, reducing the size of the modules area. */ -#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) +#if defined(CONFIG_RANDOMIZE_BASE) && \ + CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#else +#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 2d883440cb9a..c883bf726398 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t; #define VMALLOC_START _AC(0xffffc90000000000, UL) #define VMALLOC_END _AC(0xffffe8ffffffffff, UL) #define VMEMMAP_START _AC(0xffffea0000000000, UL) -#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) -- cgit v1.2.1 From d2f7cbe7b26a74dbbbf8f325b2a6fd01bc34032c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:08 +0100 Subject: x86/efi: Runtime services virtual mapping We map the EFI regions needed for runtime services non-contiguously, with preserved alignment on virtual addresses starting from -4G down for a total max space of 64G. This way, we provide for stable runtime services addresses across kernels so that a kexec'd kernel can still use them. Thus, they're mapped in a separate pagetable so that we don't pollute the kernel namespace. Add an efi= kernel command line parameter for passing miscellaneous options and chicken bits from the command line. While at it, add a chicken bit called "efi=old_map" which can be used as a fallback to the old runtime services mapping method in case there's some b0rkage with a particular EFI implementation (haha, it is hard to hold up the sarcasm here...). Also, add the UEFI RT VA space to Documentation/x86/x86_64/mm.txt. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 64 ++++++++++++++++++++++++++---------- arch/x86/include/asm/pgtable_types.h | 3 +- 2 files changed, 49 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 65c6e6e3a552..89a05b0507b9 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,24 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +/* + * We map the EFI regions needed for runtime services non-contiguously, + * with preserved alignment on virtual addresses starting from -4G down + * for a total max space of 64G. This way, we provide for stable runtime + * services addresses across kernels so that a kexec'd kernel can still + * use them. + * + * This is the main reason why we're doing stable VA mappings for RT + * services. + * + * This flag is used in conjuction with a chicken bit called + * "efi=old_map" which can be used as a fallback to the old runtime + * services mapping method in case there's some b0rkage with a + * particular EFI implementation (haha, it is hard to hold up the + * sarcasm here...). + */ +#define EFI_OLD_MEMMAP EFI_ARCH_1 + #ifdef CONFIG_X86_32 #define EFI_LOADER_SIGNATURE "EL32" @@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5), (u64)(a6)) +#define _efi_call_virtX(x, f, ...) \ +({ \ + efi_status_t __s; \ + \ + efi_sync_low_kernel_mappings(); \ + preempt_disable(); \ + __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + preempt_enable(); \ + __s; \ +}) + #define efi_call_virt0(f) \ - efi_call0((efi.systab->runtime->f)) -#define efi_call_virt1(f, a1) \ - efi_call1((efi.systab->runtime->f), (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) + _efi_call_virtX(0, f) +#define efi_call_virt1(f, a1) \ + _efi_call_virtX(1, f, (u64)(a1)) +#define efi_call_virt2(f, a1, a2) \ + _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) +#define efi_call_virt3(f, a1, a2, a3) \ + _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern unsigned long x86_efi_facility; +extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_map_region(efi_memory_desc_t *md); +extern void efi_sync_low_kernel_mappings(void); +extern void efi_setup_page_tables(void); +extern void __init old_map_region(efi_memory_desc_t *md); #ifdef CONFIG_EFI diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..028e28b6fc2c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -379,7 +379,8 @@ static inline void update_page_count(int level, unsigned long pages) { } */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern phys_addr_t slow_virt_to_phys(void *__address); - +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ -- cgit v1.2.1 From 5305ca10e7f44333e46c1ce57fb87306cb437891 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Nov 2013 14:14:00 -0800 Subject: x86/mm: Unify pte_to_pgoff() and pgoff_to_pte() helpers Use unified pte_bitop() helper to manipulate bits in pte/pgoff bitfields and convert pte_to_pgoff()/pgoff_to_pte() to inlines. Signed-off-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level.h | 100 ++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 41 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 3bf2dd0cf61f..0d193e234647 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -55,6 +55,13 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +/* Bit manipulation helper on pte/pgoff entry */ +static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, + unsigned long mask, unsigned int leftshift) +{ + return ((value >> rightshift) & mask) << leftshift; +} + #ifdef CONFIG_MEM_SOFT_DIRTY /* @@ -71,31 +78,34 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ - & ((1U << PTE_FILE_BITS1) - 1))) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << (PTE_FILE_BITS1)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << PTE_FILE_SHIFT3) \ - + ((((off) >> \ - (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ - << PTE_FILE_SHIFT4) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) +#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) +#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) + +static __always_inline pgoff_t pte_to_pgoff(pte_t pte) +{ + return (pgoff_t) + (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4)); +} + +static __always_inline pte_t pgoff_to_pte(pgoff_t off) +{ + return (pte_t){ + .pte_low = + pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + + pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + + pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + + pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) + + _PAGE_FILE, + }; +} #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -115,22 +125,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> PTE_FILE_SHIFT1) \ - & ((1U << PTE_FILE_BITS1) - 1)) \ - + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ - & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ - + (((pte).pte_low >> PTE_FILE_SHIFT3) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - << PTE_FILE_SHIFT3) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) + +static __always_inline pgoff_t pte_to_pgoff(pte_t pte) +{ + return (pgoff_t) + (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + + pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); +} + +static __always_inline pte_t pgoff_to_pte(pgoff_t off) +{ + return (pte_t){ + .pte_low = + pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + + pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + + pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + + _PAGE_FILE, + }; +} #endif /* CONFIG_MEM_SOFT_DIRTY */ -- cgit v1.2.1 From fd8526ad14c182605e42b64646344b95befd9f94 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 19 Nov 2013 15:17:50 +0200 Subject: x86/mm: Implement ASLR for hugetlb mappings Matthew noticed that hugetlb mappings don't participate in ASLR on x86-64: % for i in `seq 3`; do > tools/testing/selftests/vm/map_hugetlb | grep address > done Returned address is 0x2aaaaac00000 Returned address is 0x2aaaaac00000 Returned address is 0x2aaaaac00000 /proc/PID/maps entries for the mapping are always the same (except inode number): 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 8200 /anon_hugepage (deleted) 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 256 /anon_hugepage (deleted) 2aaaaac00000-2aaabac00000 rw-p 00000000 00:0c 7180 /anon_hugepage (deleted) The reason is the generic hugetlb_get_unmapped_area() function which is used on x86-64. It doesn't support randomization and use bottom-up unmapped area lookup, instead of usual top-down on x86-64. x86 has arch-specific hugetlb_get_unmapped_area(), but it's used only on x86-32. Let's use arch-specific hugetlb_get_unmapped_area() on x86-64 too. That adds ASLR and switches hugetlb mappings to use top-down unmapped area lookup: % for i in `seq 3`; do > tools/testing/selftests/vm/map_hugetlb | grep address > done Returned address is 0x7f4f08a00000 Returned address is 0x7fdda4200000 Returned address is 0x7febe0000000 /proc/PID/maps entries: 7f4f08a00000-7f4f18a00000 rw-p 00000000 00:0c 1168 /anon_hugepage (deleted) 7fdda4200000-7fddb4200000 rw-p 00000000 00:0c 7092 /anon_hugepage (deleted) 7febe0000000-7febf0000000 rw-p 00000000 00:0c 7183 /anon_hugepage (deleted) Unmapped area lookup policy for hugetlb mappings is consistent with normal mappings now -- the only difference is alignment requirements for huge pages. libhugetlbfs test-suite didn't detect any regressions with the patch applied (although it shows few failures on my machine regardless the patch). Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Cc: Matthew Wilcox Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Naoya Horiguchi Cc: Linus Torvalds Cc: Andrea Arcangeli Cc: Peter Zijlstra Cc: Mel Gorman Link: http://lkml.kernel.org/r/20131119131750.EA45CE0090@blue.fi.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page.h | 1 + arch/x86/include/asm/page_32.h | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index c87892442e53..775873d3be55 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -71,6 +71,7 @@ extern bool __virt_addr_valid(unsigned long kaddr); #include #define __HAVE_ARCH_GATE_AREA 1 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */ diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 4d550d04b609..904f528cc8e8 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -5,10 +5,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#endif - #define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); -- cgit v1.2.1 From 16824255394f55adf31b9a96a9965d8c15bdac4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Dec 2013 15:08:36 +0100 Subject: x86, acpi, idle: Restructure the mwait idle routines People seem to delight in writing wrong and broken mwait idle routines; collapse the lot. This leaves mwait_play_dead() the sole remaining user of __mwait() and new __mwait() users are probably doing it wrong. Also remove __sti_mwait() as its unused. Cc: Arjan van de Ven Cc: Jacob Jun Pan Cc: Mike Galbraith Cc: Len Brown Cc: Rui Zhang Acked-by: Rafael Wysocki Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131212141654.616820819@infradead.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mwait.h | 40 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 23 ----------------------- 2 files changed, 40 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0ac6b4..361b02ef128c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 @@ -13,4 +15,42 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + } + __current_clr_polling(); +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4057f9..24821f5768bc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -700,29 +700,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); -- cgit v1.2.1 From 7e98b71920464b8d15fa95c74366416cd3c88861 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 11:58:16 -0800 Subject: x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers Use static_cpu_has() to conditionalize the CLFLUSH workaround, and add memory barriers around it since the documentation is explicit that CLFLUSH is only ordered with respect to MFENCE. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/include/asm/mwait.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 361b02ef128c..19b71c439256 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,8 +43,11 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + mb(); clflush((void *)¤t_thread_info()->flags); + mb(); + } __monitor((void *)¤t_thread_info()->flags, 0, 0); if (!need_resched()) -- cgit v1.2.1 From 3b2664964bc886ae9d5127c8d3708b1acc0626d2 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:14 +0800 Subject: x86/efi: Add a wrapper function efi_map_region_fixed() Kexec kernel will use saved runtime virtual mapping, so add a new function efi_map_region_fixed() for directly mapping a md to md->virt. The md is passed in from 1st kernel, the virtual addr is saved in md->virt_addr. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 89a05b0507b9..9fbaeb239bde 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -128,6 +128,7 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); +extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); -- cgit v1.2.1 From c5fe5d80680e2949ffe102180f5fc6cefc0d145f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 Dec 2013 15:30:58 -0800 Subject: x86: Replace assembly access_ok() with a C variant It turns out that the assembly variant doesn't actually produce that good code, presumably partly because it creates a long dependency chain with no scheduling, and partly because we cannot get a flags result out of gcc (which could be fixed with asm goto, but it turns out not to be worth it.) The C code allows gcc to schedule and generate multiple (easily predictable) branches, and as a side benefit we can really optimize the case where the size is constant. Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8ec57c07b125..84ecf1df2ac6 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -40,22 +40,28 @@ /* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. - * - * This is equivalent to the following test: - * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) - * - * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ +static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) +{ + /* + * If we have used "sizeof()" for the size, + * we know it won't overflow the limit (but + * it might overflow the 'addr', so it's + * important to subtract the size from the + * limit, not add it to the address). + */ + if (__builtin_constant_p(size)) + return addr > limit - size; + + /* Arbitrary sizes? Be careful about overflow */ + addr += size; + return (addr < size) || (addr > limit); +} #define __range_not_ok(addr, size, limit) \ ({ \ - unsigned long flag, roksum; \ __chk_user_ptr(addr); \ - asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ - : "=&r" (flag), "=r" (roksum) \ - : "1" (addr), "g" ((long)(size)), \ - "rm" (limit)); \ - flag; \ + __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) /** -- cgit v1.2.1 From a740576a4abf933de8f50787f24f24456cebd761 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 27 Dec 2013 16:52:47 -0800 Subject: x86: Slightly tweak the access_ok() C variant for better code gcc can under very specific circumstances realize that the code sequence: foo += bar; if (foo < bar) ... ... is equivalent to a carry out from the addition. Tweak the implementation of access_ok() (specifically __chk_range_not_ok()) to make it more likely that gcc will make that connection. It isn't fool-proof (sometimes gcc seems to think it can make better code with lea, and ends up with a second comparison), still, but it seems to be able to connect the two more frequently this way. Cc: Linus Torvalds Link: http://lkml.kernel.org/r/CA%2B55aFzPBdbfKovMT8Edr4SmE2_=%2BOKJFac9XW2awegogTkVTA@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uaccess.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 84ecf1df2ac6..6f1bb74d547b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -41,7 +41,7 @@ * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ -static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) +static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) { /* * If we have used "sizeof()" for the size, @@ -55,7 +55,9 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns /* Arbitrary sizes? Be careful about overflow */ addr += size; - return (addr < size) || (addr > limit); + if (addr < size) + return true; + return addr > limit; } #define __range_not_ok(addr, size, limit) \ @@ -84,7 +86,7 @@ static inline int __chk_range_not_ok(unsigned long addr, unsigned long size, uns * this function, memory access functions may still return -EFAULT. */ #define access_ok(type, addr, size) \ - (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) + likely(!__range_not_ok(addr, size, user_addr_max())) /* * The exception table consists of pairs of addresses relative to the -- cgit v1.2.1 From 1fec0533693cd74f2d1a46edd29449cfee429df0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:19 +0800 Subject: x86/efi: Pass necessary EFI data for kexec via setup_data Add a new setup_data type SETUP_EFI for kexec use. Passing the saved fw_vendor, runtime, config tables and EFI runtime mappings. When entering virtual mode, directly mapping the EFI runtime regions which we passed in previously. And skip the step to call SetVirtualAddressMap(). Specially for HP z420 workstation we need save the smbios physical address. The kernel boot sequence proceeds in the following order. Step 2 requires efi.smbios to be the physical address. However, I found that on HP z420 EFI system table has a virtual address of SMBIOS in step 1. Hence, we need set it back to the physical address with the smbios in efi_setup_data. (When it is still the physical address, it simply sets the same value.) 1. efi_init() - Set efi.smbios from EFI system table 2. dmi_scan_machine() - Temporary map efi.smbios to access SMBIOS table 3. efi_enter_virtual_mode() - Map EFI ranges Tested on ovmf+qemu, lenovo thinkpad, a dell laptop and an HP z420 workstation. Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 12 ++++++++++++ arch/x86/include/uapi/asm/bootparam.h | 1 + 2 files changed, 13 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9fbaeb239bde..4d1ba80b6ff1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -133,6 +133,18 @@ extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); +struct efi_setup_data { + u64 fw_vendor; + u64 runtime; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +extern u64 efi_setup; +extern u32 efi_data_len; +extern void parse_efi_setup(u64 phys_addr, u32 data_len); + #ifdef CONFIG_EFI static inline bool efi_is_native(void) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9c3733c5f8f7..64fe421aab65 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,6 +6,7 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 +#define SETUP_EFI 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF -- cgit v1.2.1 From 456a29ddada79198c5965300e04103c40c481f62 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:20 +0800 Subject: x86: Add xloadflags bit for EFI runtime support on kexec Old kexec-tools can not load new kernels. The reason is kexec-tools does not fill efi_info in x86 setup header previously, thus EFI failed to initialize. In new kexec-tools it will by default to fill efi_info and pass other EFI required infomation to 2nd kernel so kexec kernel EFI initialization can succeed finally. To prevent from breaking userspace, add a new xloadflags bit so kexec-tools can check the flag and switch to old logic. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/uapi/asm/bootparam.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 64fe421aab65..225b0988043a 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -24,6 +24,7 @@ #define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) +#define XLF_EFI_KEXEC (1<<4) #ifndef __ASSEMBLY__ -- cgit v1.2.1 From 5c12af0c41e3417e1939095325920463b5f8e726 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 3 Jan 2014 11:56:49 +0800 Subject: x86/efi: parse_efi_setup() build fix In case without CONFIG_EFI, there will be below build error: arch/x86/built-in.o: In function `setup_arch': (.init.text+0x9dc): undefined reference to `parse_efi_setup' Thus fix it by adding blank inline function in asm/efi.h Also remove an unused declaration for variable efi_data_len. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4d1ba80b6ff1..3b978c472d08 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -142,8 +142,6 @@ struct efi_setup_data { }; extern u64 efi_setup; -extern u32 efi_data_len; -extern void parse_efi_setup(u64 phys_addr, u32 data_len); #ifdef CONFIG_EFI @@ -153,7 +151,7 @@ static inline bool efi_is_native(void) } extern struct console early_efi_console; - +extern void parse_efi_setup(u64 phys_addr, u32 data_len); #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -165,6 +163,7 @@ extern struct console early_efi_console; #define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ -- cgit v1.2.1 From dd360393f4d948eb518372316e52101cf3b44212 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 23 Dec 2013 14:16:58 +0200 Subject: x86, cpu: Detect more TLB configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel Software Developer’s Manual covers few more TLB configurations exposed as CPUID 2 descriptors: 61H Instruction TLB: 4 KByte pages, fully associative, 48 entries 63H Data TLB: 1 GByte pages, 4-way set associative, 4 entries 76H Instruction TLB: 2M/4M pages, fully associative, 8 entries B5H Instruction TLB: 4KByte pages, 8-way set associative, 64 entries B6H Instruction TLB: 4KByte pages, 8-way set associative, 128 entries C1H Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries C2H DTLB DTLB: 2 MByte/$MByte pages, 4-way associative, 16 entries Let's detect them as well. Signed-off-by: Kirill A. Shutemov Link: http://lkml.kernel.org/r/1387801018-14499-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4057f9..1dd6260ed940 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -72,6 +72,7 @@ extern u16 __read_mostly tlb_lli_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_4k[NR_INFO]; extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; +extern u16 __read_mostly tlb_lld_1g[NR_INFO]; extern s8 __read_mostly tlb_flushall_shift; /* -- cgit v1.2.1 From 663b55b9b39fa9c848cca273ca4e12bf29b32c71 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 6 Jan 2014 19:20:26 -0500 Subject: x86: Delete non-required instances of include None of these files are actually using any __init type directives and hence don't need to include . Most are just a left over from __devinit and __cpuinit removal, or simply due to code getting copied from one driver to the next. [ hpa: undid incorrect removal from arch/x86/kernel/head_32.S ] Signed-off-by: Paul Gortmaker Link: http://lkml.kernel.org/r/1389054026-12947-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 - arch/x86/include/asm/mpspec.h | 1 - arch/x86/include/asm/processor.h | 1 - arch/x86/include/asm/ptrace.h | 1 - arch/x86/include/asm/smp.h | 1 - arch/x86/include/asm/timer.h | 1 - 6 files changed, 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c696a8687567..6e4ce2df87cf 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -118,7 +118,6 @@ extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); #include -#include #include extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 3142a94c7b4b..3e6b4920ef5d 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H -#include #include #include diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4057f9..8ade61721ffb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -27,7 +27,6 @@ struct mm_struct; #include #include #include -#include #include #include diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 942a08623a1a..14fd6fd75a19 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -60,7 +60,6 @@ struct pt_regs { #endif /* !__i386__ */ -#include #ifdef CONFIG_PARAVIRT #include #endif diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4137890e88e3..8cd27e08e23c 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -2,7 +2,6 @@ #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ #include -#include #include /* diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0eb5d0c..a6f3e776d2e4 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -1,6 +1,5 @@ #ifndef _ASM_X86_TIMER_H #define _ASM_X86_TIMER_H -#include #include #include #include -- cgit v1.2.1 From 46184415368a6095d5da33991c5e011f1084353d Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Wed, 8 Jan 2014 13:27:51 -0800 Subject: arch: x86: New MailBox support driver for Intel SOC's Current Intel SOC cores use a MailBox Interface (MBI) to provide access to configuration registers on devices (called units) connected to the system fabric. This is a support driver that implements access to this interface on those platforms that can enumerate the device using PCI. Initial support is for BayTrail, for which port definitons are provided. This is a requirement for implementing platform specific features (e.g. RAPL driver requires this to perform platform specific power management using the registers in PUNIT). Dependant modules should select IOSF_MBI in their respective Kconfig configuraiton. Serialized access is handled by all exported routines with spinlocks. The API includes 3 functions for access to unit registers: int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) port: indicating the unit being accessed opcode: the read or write port specific opcode offset: the register offset within the port mdr: the register data to be read, written, or modified mask: bit locations in mdr to change Returns nonzero on error Note: GPU code handles access to the GFX unit. Therefore access to that unit with this driver is disallowed to avoid conflicts. Signed-off-by: David E. Box Link: http://lkml.kernel.org/r/1389216471-734-1-git-send-email-david.e.box@linux.intel.com Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Matthew Garrett --- arch/x86/include/asm/iosf_mbi.h | 90 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 arch/x86/include/asm/iosf_mbi.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h new file mode 100644 index 000000000000..8e71c7941767 --- /dev/null +++ b/arch/x86/include/asm/iosf_mbi.h @@ -0,0 +1,90 @@ +/* + * iosf_mbi.h: Intel OnChip System Fabric MailBox access support + */ + +#ifndef IOSF_MBI_SYMS_H +#define IOSF_MBI_SYMS_H + +#define MBI_MCR_OFFSET 0xD0 +#define MBI_MDR_OFFSET 0xD4 +#define MBI_MCRX_OFFSET 0xD8 + +#define MBI_RD_MASK 0xFEFFFFFF +#define MBI_WR_MASK 0X01000000 + +#define MBI_MASK_HI 0xFFFFFF00 +#define MBI_MASK_LO 0x000000FF +#define MBI_ENABLE 0xF0 + +/* Baytrail available units */ +#define BT_MBI_UNIT_AUNIT 0x00 +#define BT_MBI_UNIT_SMC 0x01 +#define BT_MBI_UNIT_CPU 0x02 +#define BT_MBI_UNIT_BUNIT 0x03 +#define BT_MBI_UNIT_PMC 0x04 +#define BT_MBI_UNIT_GFX 0x06 +#define BT_MBI_UNIT_SMI 0x0C +#define BT_MBI_UNIT_USB 0x43 +#define BT_MBI_UNIT_SATA 0xA3 +#define BT_MBI_UNIT_PCIE 0xA6 + +/* Baytrail read/write opcodes */ +#define BT_MBI_AUNIT_READ 0x10 +#define BT_MBI_AUNIT_WRITE 0x11 +#define BT_MBI_SMC_READ 0x10 +#define BT_MBI_SMC_WRITE 0x11 +#define BT_MBI_CPU_READ 0x10 +#define BT_MBI_CPU_WRITE 0x11 +#define BT_MBI_BUNIT_READ 0x10 +#define BT_MBI_BUNIT_WRITE 0x11 +#define BT_MBI_PMC_READ 0x06 +#define BT_MBI_PMC_WRITE 0x07 +#define BT_MBI_GFX_READ 0x00 +#define BT_MBI_GFX_WRITE 0x01 +#define BT_MBI_SMIO_READ 0x06 +#define BT_MBI_SMIO_WRITE 0x07 +#define BT_MBI_USB_READ 0x06 +#define BT_MBI_USB_WRITE 0x07 +#define BT_MBI_SATA_READ 0x00 +#define BT_MBI_SATA_WRITE 0x01 +#define BT_MBI_PCIE_READ 0x00 +#define BT_MBI_PCIE_WRITE 0x01 + +/** + * iosf_mbi_read() - MailBox Interface read command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data to be read + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr); + +/** + * iosf_mbi_write() - MailBox unmasked write command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data to be written + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr); + +/** + * iosf_mbi_modify() - MailBox masked write command + * @port: port indicating subunit being accessed + * @opcode: port specific read or write opcode + * @offset: register address offset + * @mdr: register data being modified + * @mask: mask indicating bits in mdr to be modified + * + * Locking is handled by spinlock - cannot sleep. + * Return: Nonzero on error + */ +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); + +#endif /* IOSF_MBI_SYMS_H */ -- cgit v1.2.1 From 26bef1318adc1b3a530ecc807ef99346db2aa8b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 11 Jan 2014 19:15:52 -0800 Subject: x86, fpu, amd: Clear exceptions in AMD FXSAVE workaround Before we do an EMMS in the AMD FXSAVE information leak workaround we need to clear any pending exceptions, otherwise we trap with a floating-point exception inside this code. Reported-by: halfdog Tested-by: Borislav Petkov Link: http://lkml.kernel.org/r/CA%2B55aFxQnY_PCG_n4=0w-VG=YLXL-yr7oMxyy0WU2gCBAf3ydg@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index c49a613c6452..cea1c76d49bf 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -293,12 +293,13 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.fpu.has_fpu)); + if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (tsk->thread.fpu.has_fpu)); + } return fpu_restore_checking(&tsk->thread.fpu); } -- cgit v1.2.1 From 47933ad41a86a4a9b50bed7c9b9bd2ba242aac63 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2013 14:57:36 +0100 Subject: arch: Introduce smp_load_acquire(), smp_store_release() A number of situations currently require the heavyweight smp_mb(), even though there is no need to order prior stores against later loads. Many architectures have much cheaper ways to handle these situations, but the Linux kernel currently has no portable way to make use of them. This commit therefore supplies smp_load_acquire() and smp_store_release() to remedy this situation. The new smp_load_acquire() primitive orders the specified load against any subsequent reads or writes, while the new smp_store_release() primitive orders the specifed store against any prior reads or writes. These primitives allow array-based circular FIFOs to be implemented without an smp_mb(), and also allow a theoretical hole in rcu_assign_pointer() to be closed at no additional expense on most architectures. In addition, the RCU experience transitioning from explicit smp_read_barrier_depends() and smp_wmb() to rcu_dereference() and rcu_assign_pointer(), respectively resulted in substantial improvements in readability. It therefore seems likely that replacing other explicit barriers with smp_load_acquire() and smp_store_release() will provide similar benefits. It appears that roughly half of the explicit barriers in core kernel code might be so replaced. [Changelog by PaulMck] Reviewed-by: "Paul E. McKenney" Signed-off-by: Peter Zijlstra Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Cc: Michael Ellerman Cc: Michael Neuling Cc: Russell King Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Victor Kaplansky Cc: Tony Luck Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20131213150640.908486364@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 43 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index c6cd358a1eec..04a48903b2eb 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -92,12 +92,53 @@ #endif #define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else +#else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() #define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif /* SMP */ + +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + +/* + * For either of these options x86 doesn't have a strong TSO memory + * model and we should fall back to full barriers. + */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + smp_mb(); \ + ___p1; \ +}) + +#else /* regular x86 TSO memory ordering */ + +#define smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ACCESS_ONCE(*p) = (v); \ +} while (0) + +#define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + barrier(); \ + ___p1; \ +}) + #endif /* -- cgit v1.2.1 From 9345005f4eed805308193658d12e4e7e9c261e74 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Sun, 5 Jan 2014 11:10:52 -0500 Subject: x86/irq: Fix do_IRQ() interrupt warning for cpu hotplug retriggered irqs During heavy CPU-hotplug operations the following spurious kernel warnings can trigger: do_IRQ: No ... irq handler for vector (irq -1) [ See: https://bugzilla.kernel.org/show_bug.cgi?id=64831 ] When downing a cpu it is possible that there are unhandled irqs left in the APIC IRR register. The following code path shows how the problem can occur: 1. CPU 5 is to go down. 2. cpu_disable() on CPU 5 executes with interrupt flag cleared by local_irq_save() via stop_machine(). 3. IRQ 12 asserts on CPU 5, setting IRR but not ISR because interrupt flag is cleared (CPU unabled to handle the irq) 4. IRQs are migrated off of CPU 5, and the vectors' irqs are set to -1. 5. stop_machine() finishes cpu_disable() 6. cpu_die() for CPU 5 executes in normal context. 7. CPU 5 attempts to handle IRQ 12 because the IRR is set for IRQ 12. The code attempts to find the vector's IRQ and cannot because it has been set to -1. 8. do_IRQ() warning displays warning about CPU 5 IRQ 12. I added a debug printk to output which CPU & vector was retriggered and discovered that that we are getting bogus events. I see a 100% correlation between this debug printk in fixup_irqs() and the do_IRQ() warning. This patchset resolves this by adding definitions for VECTOR_UNDEFINED(-1) and VECTOR_RETRIGGERED(-2) and modifying the code to use them. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=64831 Signed-off-by: Prarit Bhargava Reviewed-by: Rui Wang Cc: Michel Lespinasse Cc: Seiji Aguchi Cc: Yang Zhang Cc: Paul Gortmaker Cc: janet.morgan@Intel.com Cc: tony.luck@Intel.com Cc: ruiv.wang@gmail.com Link: http://lkml.kernel.org/r/1388938252-16627-1-git-send-email-prarit@redhat.com [ Cleaned up the code a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cba45d99ac1a..67d69b8e2d20 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -191,6 +191,9 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); #define trace_interrupt interrupt #endif +#define VECTOR_UNDEFINED -1 +#define VECTOR_RETRIGGERED -2 + typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); -- cgit v1.2.1 From 5dd12c2152743747ca9f50ef80281e54cc416dc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 18:04:39 +0100 Subject: sched/clock, x86: Use mul_u64_u32_shr() for native_sched_clock() Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul. Before: 0000000000000560 : 560: 44 8b 1d 00 00 00 00 mov 0x0(%rip),%r11d # 567 567: 55 push %rbp 568: 48 89 e5 mov %rsp,%rbp 56b: 45 85 db test %r11d,%r11d 56e: 75 4f jne 5bf 570: 0f 31 rdtsc 572: 89 c0 mov %eax,%eax 574: 48 c1 e2 20 shl $0x20,%rdx 578: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 57f: 48 09 c2 or %rax,%rdx 582: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 589: 65 8b 04 25 00 00 00 mov %gs:0x0,%eax 590: 00 591: 48 98 cltq 593: 48 8b 34 c5 00 00 00 mov 0x0(,%rax,8),%rsi 59a: 00 59b: 48 89 d0 mov %rdx,%rax 59e: 81 e2 ff 03 00 00 and $0x3ff,%edx 5a4: 48 c1 e8 0a shr $0xa,%rax 5a8: 48 0f af 14 0e imul (%rsi,%rcx,1),%rdx 5ad: 48 0f af 04 0e imul (%rsi,%rcx,1),%rax 5b2: 5d pop %rbp 5b3: 48 03 04 3e add (%rsi,%rdi,1),%rax 5b7: 48 c1 ea 0a shr $0xa,%rdx 5bb: 48 01 d0 add %rdx,%rax 5be: c3 retq After: 0000000000000550 : 550: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # 556 556: 55 push %rbp 557: 48 89 e5 mov %rsp,%rbp 55a: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 55e: 85 ff test %edi,%edi 560: 75 2c jne 58e 562: 0f 31 rdtsc 564: 89 c0 mov %eax,%eax 566: 48 c1 e2 20 shl $0x20,%rdx 56a: 48 09 c2 or %rax,%rdx 56d: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax 574: 00 00 576: 89 c0 mov %eax,%eax 578: 48 f7 e2 mul %rdx 57b: 65 48 8b 0c 25 00 00 mov %gs:0x0,%rcx 582: 00 00 584: c9 leaveq 585: 48 0f ac d0 0a shrd $0xa,%rdx,%rax 58a: 48 01 c8 add %rcx,%rax 58d: c3 retq MAINLINE POST sched_clock_stable: 1 1 (cold) sched_clock: 329841 331312 (cold) local_clock: 301773 310296 (warm) sched_clock: 38375 38247 (warm) local_clock: 100371 102713 (warm) rdtsc: 27340 27289 sched_clock_stable: 0 0 (cold) sched_clock: 382634 372706 (cold) local_clock: 396890 399275 (warm) sched_clock: 38194 38124 (warm) local_clock: 143452 148698 (warm) rdtsc: 27345 27365 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0eb5d0c..10a78c037910 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include #include #include +#include #define TICK_SIZE (tick_nsec / 1000) @@ -57,10 +58,8 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); return ns; } -- cgit v1.2.1 From 57c67da274f3fab38e08d2c9edf08b89e1d9c71d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:39:25 +0100 Subject: sched/clock, x86: Move some cyc2ns() code around There are no __cycles_2_ns() users outside of arch/x86/kernel/tsc.c, so move it there. There are no cycles_2_ns() users. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-01lslnavfgo3kmbo4532zlcj@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 59 -------------------------------------------- 1 file changed, 59 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 10a78c037910..b4c667693a21 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,66 +13,7 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com - */ - DECLARE_PER_CPU(unsigned long, cyc2ns); DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} - #endif /* _ASM_X86_TIMER_H */ -- cgit v1.2.1 From 20d1c86a57762f0a33a78988e3fc8818316badd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:40:29 +0100 Subject: sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs Use a ring-buffer like multi-version object structure which allows always having a coherent object; we use this to avoid having to disable IRQs while reading sched_clock() and avoids a problem when getting an NMI while changing the cyc2ns data. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 331312 257223 (cold) local_clock: 301773 310296 309889 (warm) sched_clock: 38375 38247 25280 (warm) local_clock: 100371 102713 85268 (warm) rdtsc: 27340 27289 24247 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 372706 301224 (cold) local_clock: 396890 399275 399870 (warm) sched_clock: 38194 38124 25630 (warm) local_clock: 143452 148698 129629 (warm) rdtsc: 27345 27365 24307 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-s567in1e5ekq2nlyhn8f987r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index b4c667693a21..3de54ef0aea5 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,7 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. + * + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. + * + * Without an offset (a) the above would not be possible. + * + * See the comment near cycles_2_ns() for details on how we compute (b). + */ +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ -- cgit v1.2.1 From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 12:22:37 +0100 Subject: sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding With various drivers wanting to inject idle time; we get people calling idle routines outside of the idle loop proper. Therefore we need to be extra careful about not missing TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations. While looking at this, I also realized there's a small window in the existing idle loop where we can miss TIF_NEED_RESCHED; when it hits right after the tif_need_resched() test at the end of the loop but right before the need_resched() test at the start of the loop. So move preempt_fold_need_resched() out of the loop where we're guaranteed to have TIF_NEED_RESCHED set. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 19b71c439256..1da25a5f96f9 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -53,7 +53,7 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) if (!need_resched()) __mwait(eax, ecx); } - __current_clr_polling(); + current_clr_polling(); } #endif /* _ASM_X86_MWAIT_H */ -- cgit v1.2.1 From 5aa3d718f259007121b9366d36315fb8a2983d3d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Dec 2013 20:50:42 +0100 Subject: x86, ramdisk: Export relocated ramdisk VA The ramdisk can possibly get relocated if the whole image is not mapped. And since we're going over it in the microcode loader and fishing out the relevant microcode patches, we want access it at its new location. Thus, export it. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/setup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 59bcf4e22418..d62c9f809bc5 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -3,7 +3,6 @@ #include - #define COMMAND_LINE_SIZE 2048 #include @@ -29,6 +28,8 @@ #include #include +extern u64 relocated_ramdisk; + /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); -- cgit v1.2.1 From e1b43e3f13f7157249fb962ccf88b84eb0421fb4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Dec 2013 12:31:31 +0100 Subject: x86, microcode: Share native MSR accessing variants We want to use those in AMD's early loading path too. Also, add a native_wrmsrl variant. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/microcode.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index f98bd6625318..b59827e76529 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,21 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#define native_rdmsr(msr, val1, val2) \ +do { \ + u64 __val = native_read_msr((msr)); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ +} while (0) + +#define native_wrmsr(msr, low, high) \ + native_write_msr(msr, low, high) + +#define native_wrmsrl(msr, val) \ + native_write_msr((msr), \ + (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)) + struct cpu_signature { unsigned int sig; unsigned int pf; -- cgit v1.2.1 From 5335ba5cf475369f88db8e6835764efdcad8ab96 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 29 Nov 2013 14:58:44 +0100 Subject: x86, microcode, AMD: Fix early ucode loading The original idea to use the microcode cache for the APs doesn't pan out because we do memory allocation there very early and with IRQs disabled and we don't want to involve GFP_ATOMIC allocations. Not if it can be helped. Thus, extend the caching of the BSP patch approach to the APs and iterate over the ucode in the initrd instead of using the cache. We still save the relevant patches to it but later, right before we jettison the initrd. While at it, fix early ucode loading on 32-bit too. Signed-off-by: Borislav Petkov Tested-by: Aravind Gopalakrishnan --- arch/x86/include/asm/microcode_amd.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 4c019179a57d..b7b10b82d3e5 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -61,11 +61,10 @@ extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +#define PATCH_MAX_SIZE PAGE_SIZE +extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; + #ifdef CONFIG_MICROCODE_AMD_EARLY -#ifdef CONFIG_X86_32 -#define MPB_MAX_SIZE PAGE_SIZE -extern u8 amd_bsp_mpb[MPB_MAX_SIZE]; -#endif extern void __init load_ucode_amd_bsp(void); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); -- cgit v1.2.1 From 3b56496865f9f7d9bcb2f93b44c63f274f08e3b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Jan 2014 00:07:11 +0100 Subject: x86, cpu, amd: Add workaround for family 16h, erratum 793 This adds the workaround for erratum 793 as a precaution in case not every BIOS implements it. This addresses CVE-2013-6885. Erratum text: [Revision Guide for AMD Family 16h Models 00h-0Fh Processors, document 51810 Rev. 3.04 November 2013] 793 Specific Combination of Writes to Write Combined Memory Types and Locked Instructions May Cause Core Hang Description Under a highly specific and detailed set of internal timing conditions, a locked instruction may trigger a timing sequence whereby the write to a write combined memory type is not flushed, causing the locked instruction to stall indefinitely. Potential Effect on System Processor core hang. Suggested Workaround BIOS should set MSR C001_1020[15] = 1b. Fix Planned No fix planned [ hpa: updated description, fixed typo in MSR name ] Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20140114230711.GS29865@pd.tnic Tested-by: Aravind Gopalakrishnan Signed-off-by: H. Peter Anvin --- arch/x86/include/uapi/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 37813b5ddc37..59cea185ad1d 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -184,6 +184,7 @@ #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 -- cgit v1.2.1 From 85611e3febe78955a519f5f9eb47b941525c8c76 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 16 Dec 2013 12:07:37 -0800 Subject: x86, intel-mid: Add Clovertrail platform support This patch adds Clovertrail support on intel-mid and makes it more flexible to support other SoCs. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387224459-25746-3-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Fei Yang Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/intel-mid.h | 46 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 459769d39263..f8a831431fe0 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -51,10 +51,39 @@ struct devs_id { enum intel_mid_cpu_type { /* 1 was Moorestown */ INTEL_MID_CPU_CHIP_PENWELL = 2, + INTEL_MID_CPU_CHIP_CLOVERVIEW, }; extern enum intel_mid_cpu_type __intel_mid_cpu_chip; +/** + * struct intel_mid_ops - Interface between intel-mid & sub archs + * @arch_setup: arch_setup function to re-initialize platform + * structures (x86_init, x86_platform_init) + * + * This structure can be extended if any new interface is required + * between intel-mid & its sub arch files. + */ +struct intel_mid_ops { + void (*arch_setup)(void); +}; + +/* Helper API's for INTEL_MID_OPS_INIT */ +#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \ + [cpuid] = get_##cpuname##_ops + +/* Maximum number of CPU ops */ +#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *)) + +/* + * For every new cpu addition, a weak get__ops() function needs be + * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h. + */ +#define INTEL_MID_OPS_INIT {\ + DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ + DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ +}; + #ifdef CONFIG_X86_INTEL_MID static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void) @@ -86,8 +115,21 @@ extern enum intel_mid_timer_options intel_mid_timer_options; * Penwell uses spread spectrum clock, so the freq number is not exactly * the same as reported by MSR based on SDM. */ -#define PENWELL_FSB_FREQ_83SKU 83200 -#define PENWELL_FSB_FREQ_100SKU 99840 +#define FSB_FREQ_83SKU 83200 +#define FSB_FREQ_100SKU 99840 +#define FSB_FREQ_133SKU 133000 + +#define FSB_FREQ_167SKU 167000 +#define FSB_FREQ_200SKU 200000 +#define FSB_FREQ_267SKU 267000 +#define FSB_FREQ_333SKU 333000 +#define FSB_FREQ_400SKU 400000 + +/* Bus Select SoC Fuse value */ +#define BSEL_SOC_FUSE_MASK 0x7 +#define BSEL_SOC_FUSE_001 0x1 /* FSB 133MHz */ +#define BSEL_SOC_FUSE_101 0x5 /* FSB 100MHz */ +#define BSEL_SOC_FUSE_111 0x7 /* FSB 83MHz */ #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -- cgit v1.2.1 From bc20aa48bbb3068224a1c91f8332971fdb689fad Mon Sep 17 00:00:00 2001 From: David Cohen Date: Mon, 16 Dec 2013 12:07:38 -0800 Subject: x86, intel-mid: Add Merrifield platform support This code was partially based on Mark Brown's previous work. Signed-off-by: David Cohen Link: http://lkml.kernel.org/r/1387224459-25746-4-git-send-email-david.a.cohen@linux.intel.com Signed-off-by: Fei Yang Cc: Mark F. Brown Cc: Kuppuswamy Sathyanarayanan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/intel-mid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index f8a831431fe0..e34e097b6f9d 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -52,6 +52,7 @@ enum intel_mid_cpu_type { /* 1 was Moorestown */ INTEL_MID_CPU_CHIP_PENWELL = 2, INTEL_MID_CPU_CHIP_CLOVERVIEW, + INTEL_MID_CPU_CHIP_TANGIER, }; extern enum intel_mid_cpu_type __intel_mid_cpu_chip; @@ -82,6 +83,7 @@ struct intel_mid_ops { #define INTEL_MID_OPS_INIT {\ DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \ DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \ + DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \ }; #ifdef CONFIG_X86_INTEL_MID -- cgit v1.2.1 From da6139e49c7cb0f4251265cb5243b8d220adb48d Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 13 Jan 2014 06:51:01 -0500 Subject: x86: Add check for number of available vectors before CPU down Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=64791 When a cpu is downed on a system, the irqs on the cpu are assigned to other cpus. It is possible, however, that when a cpu is downed there aren't enough free vectors on the remaining cpus to account for the vectors from the cpu that is being downed. This results in an interesting "overflow" condition where irqs are "assigned" to a CPU but are not handled. For example, when downing cpus on a 1-64 logical processor system: [ 232.021745] smpboot: CPU 61 is now offline [ 238.480275] smpboot: CPU 62 is now offline [ 245.991080] ------------[ cut here ]------------ [ 245.996270] WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:264 dev_watchdog+0x246/0x250() [ 246.005688] NETDEV WATCHDOG: p786p1 (ixgbe): transmit queue 0 timed out [ 246.013070] Modules linked in: lockd sunrpc iTCO_wdt iTCO_vendor_support sb_edac ixgbe microcode e1000e pcspkr joydev edac_core lpc_ich ioatdma ptp mdio mfd_core i2c_i801 dca pps_core i2c_core wmi acpi_cpufreq isci libsas scsi_transport_sas [ 246.037633] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.12.0+ #14 [ 246.044451] Hardware name: Intel Corporation S4600LH ........../SVRBD-ROW_T, BIOS SE5C600.86B.01.08.0003.022620131521 02/26/2013 [ 246.057371] 0000000000000009 ffff88081fa03d40 ffffffff8164fbf6 ffff88081fa0ee48 [ 246.065728] ffff88081fa03d90 ffff88081fa03d80 ffffffff81054ecc ffff88081fa13040 [ 246.074073] 0000000000000000 ffff88200cce0000 0000000000000040 0000000000000000 [ 246.082430] Call Trace: [ 246.085174] [] dump_stack+0x46/0x58 [ 246.091633] [] warn_slowpath_common+0x8c/0xc0 [ 246.098352] [] warn_slowpath_fmt+0x46/0x50 [ 246.104786] [] dev_watchdog+0x246/0x250 [ 246.110923] [] ? dev_deactivate_queue.constprop.31+0x80/0x80 [ 246.119097] [] call_timer_fn+0x3a/0x110 [ 246.125224] [] ? update_process_times+0x6f/0x80 [ 246.132137] [] ? dev_deactivate_queue.constprop.31+0x80/0x80 [ 246.140308] [] run_timer_softirq+0x1f0/0x2a0 [ 246.146933] [] __do_softirq+0xe0/0x220 [ 246.152976] [] call_softirq+0x1c/0x30 [ 246.158920] [] do_softirq+0x55/0x90 [ 246.164670] [] irq_exit+0xa5/0xb0 [ 246.170227] [] smp_apic_timer_interrupt+0x4a/0x60 [ 246.177324] [] apic_timer_interrupt+0x6a/0x70 [ 246.184041] [] ? cpuidle_enter_state+0x5b/0xe0 [ 246.191559] [] ? cpuidle_enter_state+0x57/0xe0 [ 246.198374] [] cpuidle_idle_call+0xbd/0x200 [ 246.204900] [] arch_cpu_idle+0xe/0x30 [ 246.210846] [] cpu_startup_entry+0xd0/0x250 [ 246.217371] [] rest_init+0x77/0x80 [ 246.223028] [] start_kernel+0x3ee/0x3fb [ 246.229165] [] ? repair_env_string+0x5e/0x5e [ 246.235787] [] x86_64_start_reservations+0x2a/0x2c [ 246.242990] [] x86_64_start_kernel+0xf8/0xfc [ 246.249610] ---[ end trace fb74fdef54d79039 ]--- [ 246.254807] ixgbe 0000:c2:00.0 p786p1: initiating reset due to tx timeout [ 246.262489] ixgbe 0000:c2:00.0 p786p1: Reset adapter Last login: Mon Nov 11 08:35:14 from 10.18.17.119 [root@(none) ~]# [ 246.792676] ixgbe 0000:c2:00.0 p786p1: detected SFP+: 5 [ 249.231598] ixgbe 0000:c2:00.0 p786p1: NIC Link is Up 10 Gbps, Flow Control: RX/TX [ 246.792676] ixgbe 0000:c2:00.0 p786p1: detected SFP+: 5 [ 249.231598] ixgbe 0000:c2:00.0 p786p1: NIC Link is Up 10 Gbps, Flow Control: RX/TX (last lines keep repeating. ixgbe driver is dead until module reload.) If the downed cpu has more vectors than are free on the remaining cpus on the system, it is possible that some vectors are "orphaned" even though they are assigned to a cpu. In this case, since the ixgbe driver had a watchdog, the watchdog fired and notified that something was wrong. This patch adds a function, check_vectors(), to compare the number of vectors on the CPU going down and compares it to the number of vectors available on the system. If there aren't enough vectors for the CPU to go down, an error is returned and propogated back to userspace. v2: Do not need to look at percpu irqs v3: Need to check affinity to prevent counting of MSIs in IOAPIC Lowest Priority Mode v4: Additional changes suggested by Gong Chen. v5/v6/v7/v8: Updated comment text Signed-off-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1389613861-3853-1-git-send-email-prarit@redhat.com Reviewed-by: Gong Chen Cc: Andi Kleen Cc: Michel Lespinasse Cc: Seiji Aguchi Cc: Yang Zhang Cc: Paul Gortmaker Cc: Janet Morgan Cc: Tony Luck Cc: Ruiv Wang Cc: Gong Chen Signed-off-by: H. Peter Anvin Cc: --- arch/x86/include/asm/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 0ea10f27d613..cb6cfcd034cf 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -25,6 +25,7 @@ extern void irq_ctx_init(int cpu); #ifdef CONFIG_HOTPLUG_CPU #include +extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif -- cgit v1.2.1 From 7da7c1561366ba8adb7275464ab44e84e1faa7e0 Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Mon, 21 Oct 2013 09:16:33 -0700 Subject: x86, tsc: Add static (MSR) TSC calibration on Intel Atom SoCs On SoCs that have the calibration MSRs available, either there is no PIT, HPET or PMTIMER to calibrate against, or the PIT/HPET/PMTIMER is driven from the same clock as the TSC, so calibration is redundant and just slows down the boot. TSC rate is caculated by this formula: * The ratio and the resolved frequency ID can be obtained from MSR. See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 for details. Signed-off-by: Bin Gao Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-rgm7xmg7k6qnjlw3ynkcjsmh@git.kernel.org --- arch/x86/include/asm/tsc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 235be70d5bb4..57ae63cd6ee2 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -65,4 +65,7 @@ extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); +/* MSR based TSC calibration for Intel Atom SoC platforms */ +int try_msr_calibrate_tsc(unsigned long *fast_calibrate); + #endif /* _ASM_X86_TSC_H */ -- cgit v1.2.1