From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:19 -0800 Subject: mm: Add a vm_special_mapping.fault() method Requiring special mappings to give a list of struct pages is inflexible: it prevents sane use of IO memory in a special mapping, it's inefficient (it requires arch code to initialize a list of struct pages, and it requires the mm core to walk the entire list just to figure out how long it is), and it prevents arch code from doing anything fancy when a special mapping fault occurs. Add a .fault method as an alternative to filling in a .pages array. Looks-OK-to: Andrew Morton Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 22 +++++++++++++++++++--- mm/mmap.c | 13 +++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492a114f..c88e48a3c155 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -568,10 +568,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif -struct vm_special_mapping -{ - const char *name; +struct vm_fault; + +struct vm_special_mapping { + const char *name; /* The name, e.g. "[vdso]". */ + + /* + * If .fault is not provided, this points to a + * NULL-terminated array of pages that back the special mapping. + * + * This must not be NULL unless .fault is provided. + */ struct page **pages; + + /* + * If non-NULL, then this is called to resolve page faults + * on the special mapping. If used, .pages is not checked. + */ + int (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); }; enum tlb_flush_reason { diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a649f6b..f717453b1a57 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3030,11 +3030,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; -- cgit v1.2.1 From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: mm: Add vm_insert_pfn_prot() The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. 
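For illustration (a hypothetical sketch, not part of these patches: my_fault, my_mapping and my_mmio_pfn are invented names), a driver could combine the new .fault hook with vm_insert_pfn_prot() like this:

    #include <linux/mm.h>
    #include <linux/mm_types.h>

    static unsigned long my_mmio_pfn;	/* assumed to be filled in elsewhere */

    /* Serve a single uncached MMIO page on demand. */
    static int my_fault(const struct vm_special_mapping *sm,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            int ret;

            if (vmf->pgoff != 0)
                    return VM_FAULT_SIGBUS;

            /* Override the VMA-wide cache mode for this one page. */
            ret = vm_insert_pfn_prot(vma, (unsigned long)vmf->virtual_address,
                                     my_mmio_pfn,
                                     pgprot_noncached(vma->vm_page_prot));
            if (ret == 0 || ret == -EBUSY)
                    return VM_FAULT_NOPAGE;	/* pte installed, no struct page */

            return VM_FAULT_SIGBUS;
    }

    static const struct vm_special_mapping my_mapping = {
            .name  = "[my_special]",
            .fault = my_fault,
    };

The [vvar] conversion later in this series follows exactly this pattern.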
x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/mm.h | 2 ++ mm/memory.c | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad7793788..87ef1d7730ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2080,6 +2080,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
diff --git a/mm/memory.c b/mm/memory.c index c387430f06c3..a29f0b90fc56 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1563,9 +1563,30 @@ out: */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) +{ + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) { int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1587,7 +1608,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) -- cgit v1.2.1
From 352b78c62f27b356b182008acd3117f3ee03ffd2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:21 -0800 Subject: x86/vdso: Track each mm's loaded vDSO image as well as its base As we start to do more intelligent things with the vDSO at runtime (as opposed to just at mm initialization time), we'll need to know which vDSO is in use. In principle, we could guess based on the mm type, but that's over-complicated and error-prone. Instead, just track it in the mmu context. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c99ac48681bad709ca7ad5ee899d9042a3af6b00.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 1 + arch/x86/include/asm/mmu.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e264ac4..80b021067bd6 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -121,6 +121,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) text_start = addr - image->sym_vvar_start; current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5e7160..1ea0baef1175 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,7 +19,8 @@ typedef struct { #endif struct mutex lock; - void __user *vdso; + void __user *vdso; /* vdso base address */ + const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -- cgit v1.2.1 From 05ef76b20fc4297b0d3f8a956f1c809a8a1b3f1d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:22 -0800 Subject: x86/vdso: Use .fault for the vDSO text mapping The old scheme for mapping the vDSO text is rather complicated. vdso2c generates a struct vm_special_mapping and a blank .pages array of the correct size for each vdso image. Init code in vdso/vma.c populates the .pages array for each vDSO image, and the mapping code selects the appropriate struct vm_special_mapping. With .fault, we can use a less roundabout approach: vdso_fault() just returns the appropriate page for the selected vDSO image. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f886954c186bafd74e1b967c8931d852ae199aa2.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vdso2c.h | 7 ------- arch/x86/entry/vdso/vma.c | 26 +++++++++++++++++++------- arch/x86/include/asm/vdso.h | 3 --- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987556ce..abe961c7c71c 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_offset)); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 80b021067bd6..eb50d7c1f161 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -27,13 +27,7 @@ unsigned int __read_mostly vdso64_enabled = 1; void __init init_vdso_image(const struct vdso_image *image) { - int i; - int npages = (image->size) / PAGE_SIZE; - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -90,6 +84,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #endif } +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping text_mapping = { + .name = "[vdso]", + .fault = vdso_fault, +}; + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; @@ -131,7 +143,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); + &text_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index deabaf9759b6..43dc55be524e 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -13,9 +13,6 @@ struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - /* text_mapping.pages is big enough for data/size page pointers */ - struct vm_special_mapping text_mapping; - unsigned long alt, alt_len; long sym_vvar_start; /* Negative offset to the vvar area */ -- cgit v1.2.1 From a48a7042613eb1524d18b7b1ed7d3a6b611fd21f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:23 -0800 Subject: x86/vdso: Use ->fault() instead of remap_pfn_range() for the vvar mapping This is IMO much less ugly, and it also opens the door to 
disallowing unprivileged userspace HPET access on systems with usable TSCs. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c19c2909e5ee3c3d8742f916586676bb7c40345f.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 97 ++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index eb50d7c1f161..4b5461ba4f6b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -102,18 +102,69 @@ static const struct vm_special_mapping text_mapping = { .fault = vdso_fault, }; +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. + */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_hpet_page) { +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = vm_insert_pfn_prot( + vma, + (unsigned long)vmf->virtual_address, + hpet_address >> PAGE_SHIFT, + pgprot_noncached(PAGE_READONLY)); + } +#endif + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_pvti_cpu0_va(); + if (pvti) { + ret = vm_insert_pfn( + vma, + (unsigned long)vmf->virtual_address, + __pa(pvti) >> PAGE_SHIFT); + } + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { + static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", - .pages = no_pages, + .fault = vvar_fault, }; - struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -153,7 +204,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) vma = _install_special_mapping(mm, addr, -image->sym_vvar_start, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); if (IS_ERR(vma)) { @@ -161,41 +213,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) goto up_fail; } - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> 
PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - pvti = pvclock_pvti_cpu0_va(); - if (pvti && image->sym_pvclock_page) { - ret = remap_pfn_range(vma, - text_start + image->sym_pvclock_page, - __pa(pvti) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - } - up_fail: if (ret) current->mm->context.vdso = NULL; -- cgit v1.2.1 From bd902c536298830e4d126dcf6491b46d3f1bf96e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:24 -0800 Subject: x86/vdso: Disallow vvar access to vclock IO for never-used vclocks It makes me uncomfortable that even modern systems grant every process direct read access to the HPET. While fixing this for real without regressing anything is a mess (unmapping the HPET is tricky because we don't adequately track all the mappings), we can do almost as well by tracking which vclocks have ever been used and only allowing pages associated with used vclocks to be faulted in. This will cause rogue programs that try to peek at the HPET to get SIGBUS instead on most systems. We can't restrict faults to vclock pages that are associated with the currently selected vclock due to a race: a process could start to access the HPET for the first time and race against a switch away from the HPET as the current clocksource. We can't segfault the process trying to peek at the HPET in this case, even though the process isn't going to do anything useful with the data. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e79d06295625c02512277737ab55085a498ac5d8.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/entry/vsyscall/vsyscall_gtod.c | 9 ++++++++- arch/x86/include/asm/clocksource.h | 9 +++++---- arch/x86/include/asm/vgtod.h | 6 ++++++ 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4b5461ba4f6b..7c912fefe79b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -130,7 +130,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_hpet_page) { #ifdef CONFIG_HPET_TIMER - if (hpet_address) { + if (hpet_address && vclock_was_used(VCLOCK_HPET)) { ret = vm_insert_pfn_prot( vma, (unsigned long)vmf->virtual_address, @@ -141,7 +141,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); - if (pvti) { + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, (unsigned long)vmf->virtual_address, diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416995..0fb3a104ac62 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -16,6 +16,8 @@ #include #include +int vclocks_used __read_mostly; + DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) @@ -26,12 +28,17 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + /* Mark the new 
vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult;
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eda81dc0f4ae..d194266acb28 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,10 +3,11 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c4843aa1..e728699db774 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -37,6 +37,12 @@ struct vsyscall_gtod_data { }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +extern int vclocks_used; +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1 << vclock); +} + static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) { unsigned ret; -- cgit v1.2.1
From 2024315124b43bb8b25a119ec6c614f0647dcc6d Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 18 Jan 2016 20:13:14 +0600 Subject: x86/asm/entry: Remove unused SAVE_ALL/RESTORE_ALL macros for !CONFIG_X86_64 SAVE_ALL and RESTORE_ALL macros for !CONFIG_X86_64 were introduced in commit: 1a338ac32 ('sched, x86: Optimize the preempt_schedule() call') ... and were used in the ___preempt_schedule() and ___preempt_schedule_context() functions from the arch/x86/kernel/preempt.S. But the arch/x86/kernel/preempt.S file was removed in the following commit: 0ad6e3c5 ('x86: Speed up ___preempt_schedule*() by using THUNK helpers') The ___preempt_schedule()/___preempt_schedule_context() functions were reimplemented and do not use SAVE_ALL/RESTORE_ALL anymore. These macros have no users anymore, so we can remove them. Signed-off-by: Alexander Kuleshov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453126394-13717-1-git-send-email-kuleshovmail@gmail.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 31 ------------------------------- 1 file changed, 31 deletions(-)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index e32206e09868..9a9e5884066c 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers.
So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - #endif /* CONFIG_X86_64 */ /* -- cgit v1.2.1 From a1ff5726081858a9ad98934eff7af6616c576875 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 Jan 2016 10:58:12 +0100 Subject: x86/cpufeature: Add AMD AVIC bit CPUID Fn8000_000A_EDX[13] denotes support for AMD's Virtual Interrupt controller, i.e., APIC virtualization. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Kaplan Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/1452938292-12327-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c9464297..bbf166e805be 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -268,6 +268,7 @@ #define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* * BUG word(s) -- cgit v1.2.1 From 43c75f933be26422f166d6d869a19997312f4732 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 30 Nov 2015 10:47:43 -0600 Subject: x86/mm: Streamline and restore probe_memory_block_size() The cumulative effect of the following two commits: bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems") 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") ... is some pretty convoluted code. The first commit also removed code for the UV case without stated reason, which might lead to unexpected change in behavior. This commit has no other (intended) functional change; just seeks to simplify and make the code more understandable, beyond restoring the UV behavior. The whole section with the "tail size" doesn't seem to be reachable, since both the >= 64GB and < 64GB case return, so it was removed. Signed-off-by: Seth Jennings Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1448902063-18885-1-git-send-email-sjennings@variantweb.net [ Rewrote the title and changelog. 
] Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482d69ec..8f18fec74e67 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -1193,26 +1194,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } - - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; - - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } -- cgit v1.2.1 From 95d97adb2bb85d964bae4538e0574e742e522dda Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 17 Dec 2015 23:56:52 +0000 Subject: x86/signal: Cleanup get_nr_restart_syscall() Check for TS_COMPAT instead of TIF_IA32 to distinguish ia32 tasks from 64-bit tasks. Check for __X32_SYSCALL_BIT iff CONFIG_X86_X32_ABI is defined. Suggested-by: Andy Lutomirski Signed-off-by: Dmitry V. Levin Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elvira Khabirova Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160111145515.GB29007@altlinux.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c3638f..c07ff5ddbd47 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,12 +692,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* -- cgit v1.2.1 From 997963edd912a6d77d68b2bbc19f40ce8facabd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:18 -0600 Subject: x86/asm: Clean up frame pointer macros The asm macros for setting up and restoring the frame pointer aren't currently being used. However, they will be needed soon to help asm functions to comply with stacktool. Rename FRAME/ENDFRAME to FRAME_BEGIN/FRAME_END for more symmetry. Also make the code more readable and improve the comments. 
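As a usage illustration (a hypothetical sketch; the C string forms used here are only introduced by the next patch), a non-leaf inline-asm sequence can be bracketed with FRAME_BEGIN/FRAME_END so that frame-pointer-based unwinders can traverse it:

    #include <asm/frame.h>

    /*
     * Wrap inline asm in a real stack frame when CONFIG_FRAME_POINTER
     * is set; both macros compile away to nothing otherwise.
     */
    static void my_asm_helper(void)
    {
            asm volatile(FRAME_BEGIN
                         "pause\n\t"	/* stand-in for real work */
                         FRAME_END
                         ::: "memory");
    }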
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f488a8e3bfc8ac7d4d3d350953e664e7182b044.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/frame.h | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 793179cf8e21..cec213bd6eba 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,23 +1,34 @@ +#ifndef _ASM_X86_FRAME_H +#define _ASM_X86_FRAME_H + #ifdef __ASSEMBLY__ #include -/* The annotation hides the frame from the unwinder and makes it look - like a ordinary ebp save/restore. This avoids some special cases for - frame pointer later */ +/* + * These are stack frame creation macros. They should be used by every + * callable non-leaf asm function to make kernel stack traces more reliable. + */ #ifdef CONFIG_FRAME_POINTER - .macro FRAME - __ASM_SIZE(push,) %__ASM_REG(bp) - __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) - .endm - .macro ENDFRAME - __ASM_SIZE(pop,) %__ASM_REG(bp) - .endm -#else - .macro FRAME - .endm - .macro ENDFRAME - .endm -#endif + +.macro FRAME_BEGIN + push %_ASM_BP + _ASM_MOV %_ASM_SP, %_ASM_BP +.endm + +.macro FRAME_END + pop %_ASM_BP +.endm + +#define FRAME_OFFSET __ASM_SEL(4, 8) + +#else /* !CONFIG_FRAME_POINTER */ + +#define FRAME_BEGIN +#define FRAME_END +#define FRAME_OFFSET 0 + +#endif /* CONFIG_FRAME_POINTER */ #endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_FRAME_H */ -- cgit v1.2.1 From ec5186557abbe711dfd34e1863735dfecb0602cc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:19 -0600 Subject: x86/asm: Add C versions of frame pointer macros Add C versions of the frame pointer macros which can be used to create a stack frame in inline assembly. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6786a282bf232ede3e2866414eae3cf02c7d662.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/frame.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index cec213bd6eba..6e4d170726b7 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,16 +1,17 @@ #ifndef _ASM_X86_FRAME_H #define _ASM_X86_FRAME_H -#ifdef __ASSEMBLY__ - #include /* * These are stack frame creation macros. They should be used by every * callable non-leaf asm function to make kernel stack traces more reliable. 
*/ + #ifdef CONFIG_FRAME_POINTER +#ifdef __ASSEMBLY__ + .macro FRAME_BEGIN push %_ASM_BP _ASM_MOV %_ASM_SP, %_ASM_BP @@ -20,6 +21,16 @@ pop %_ASM_BP .endm +#else /* !__ASSEMBLY__ */ + +#define FRAME_BEGIN \ + "push %" _ASM_BP "\n" \ + _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n" + +#define FRAME_END "pop %" _ASM_BP "\n" + +#endif /* __ASSEMBLY__ */ + #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ @@ -30,5 +41,4 @@ #endif /* CONFIG_FRAME_POINTER */ -#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_FRAME_H */ -- cgit v1.2.1 From 320d25b6a05f8b73c23fc21025d2906ecdd2d4fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:58 -0800 Subject: x86/mm/32: Set NX in __supported_pte_mask before enabling paging There's a short window in which very early mappings can end up with NX clear because they are created before we've noticed that we have NX. It turns out that we detect NX very early, so there's no need to defer __supported_pte_mask setup. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2b544627345f7110160545a3f47031eb45c3ad4f.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 6 ++++++ arch/x86/mm/setup_nx.c | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..57fc3f8c85fd 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..78f5d5907f98 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,9 +31,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } -- cgit v1.2.1 From 7c360572b430a0e9757bafc0c20f26c920f2a07f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:59 -0800 Subject: x86/mm: Make kmap_prot into a #define The value (once we initialize it) is a foregone conclusion. Make it a #define to save a tiny amount of text and data size and to make it more comprehensible. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0850eb0213de9da88544ff7fae72dc6d06d2b441.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 2 +- arch/x86/mm/init_32.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6d7d0e52ed5a..8554f960e21b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; +#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3de61f9..a4bb1c7ab65e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM -- cgit v1.2.1 From f5a5386bcaee222d2565d87ab33f0896f91a11ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 6 Dec 2015 20:18:37 -0800 Subject: rcutorture: Add checks for rcutorture writer starvation This commit adds checks for rcutorture writer starvation, so that instances will be added to the test summary. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 844787a0d7be..0f1520fb5f81 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -64,7 +64,7 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' $1` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls" -- cgit v1.2.1 From fc91cbab11d43a7d241297886665b9a5f94a1e47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 6 Dec 2015 20:20:14 -0800 Subject: rcutorture: Don't keep empty console.log.diags files Currently, an error-free run produces an empty console.log.diags file. This can be annoying when using "vi */console.log.diags" to see a full summary of the errors. 
This commit therefore removes any empty files during the analysis process. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 0f1520fb5f81..c6b0e46214f2 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -70,4 +70,6 @@ then summary="$summary Stalls: $n_stalls" fi print_warning Summary: $summary +else + rm $1.diags fi -- cgit v1.2.1
From 5d43edb40779b3d88e9c8006b281a4c828431193 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Dec 2015 16:10:23 -0800 Subject: rcutorture: Check for self-detected stalls The current scripts parse console output only for cases where one CPU detects a stall on some other CPU or task. This commit therefore adds checks for self-detected stalls. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index c6b0e46214f2..5eb49b7f864c 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -64,7 +64,7 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' $1` + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls" -- cgit v1.2.1
From 14365449b6ce34cf6a3040ff8ebbb39d89d67159 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 18:21:21 +0600 Subject: x86/asm: Remove unused L3_PAGE_OFFSET L3_PAGE_OFFSET was introduced in commit a6523748bd (paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for hypervisor), but has no users.
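For context, these symbols are compile-time page-table indices. A hypothetical userspace sketch of the same arithmetic (assuming this era's 4-level x86-64 paging constants and the non-randomized __PAGE_OFFSET):

    #include <stdio.h>

    #define PGDIR_SHIFT  39
    #define PUD_SHIFT    30
    #define PTRS_PER_PGD 512
    #define PTRS_PER_PUD 512

    #define pgd_index(x) (((x) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
    #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

    int main(void)
    {
            unsigned long page_offset  = 0xffff880000000000UL; /* __PAGE_OFFSET */
            unsigned long start_kernel = 0xffffffff80000000UL; /* __START_KERNEL_map */

            printf("L4_PAGE_OFFSET  = %lu\n", pgd_index(page_offset));  /* 272 */
            printf("L4_START_KERNEL = %lu\n", pgd_index(start_kernel)); /* 511 */
            printf("L3_START_KERNEL = %lu\n", pud_index(start_kernel)); /* 510 */
            return 0;
    }

The removed L3_PAGE_OFFSET would have been pud_index(__PAGE_OFFSET) = 0, and nothing referenced it.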
Signed-off-by: Alexander Kuleshov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Andrey Ryabinin Link: http://lkml.kernel.org/r/1453810881-30622-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_64.S | 1 - 1 file changed, 1 deletion(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..2e974680f5ad 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) -- cgit v1.2.1
From dd42ac8f02aea32661756554aace2095f7181d34 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 16 Oct 2015 15:20:53 +0600 Subject: clockevents: Rename last parameter of clocks_calc_mult_shift() to maxsec The last parameter of clocks_calc_mult_shift() was renamed from minsec to maxsec in commit 5fdade95 (time: Rename misnamed minsec argument of clocks_calc_mult_shift()). Rename the corresponding parameter of clockevents_calc_mult_shift() to match. Signed-off-by: Alexander Kuleshov Link: http://lkml.kernel.org/r/1444987253-11018-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index bdcf358dfce2..0d442e34c349 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void -clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) +clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { - return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec); + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); -- cgit v1.2.1
From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:08 +0000 Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW The KVM/ARM timer implementation arms a hrtimer when a vcpu is blocked (usually because it is waiting for an interrupt) while its timer is going to kick in the future. It is essential that this timer doesn't get adjusted, or the guest will end up being woken-up at the wrong time (NTP running on the host seems to confuse the hell out of some guests). In order to allow this, let's add CLOCK_MONOTONIC_RAW support to hrtimer (it is so far only supported for posix timers). It also has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim: shuffle code to prepare for dynamic radios"), which already uses this functionality without realizing it wasn't implemented (just being lucky...).
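A minimal sketch of what this enables (hypothetical caller; the KVM/ARM conversion two patches later does essentially this):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer my_timer;

    static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
    {
            /* Expiry is measured on the raw clock: NTP slewing cannot move it. */
            return HRTIMER_NORESTART;
    }

    static void my_arm_timer(u64 ns)
    {
            hrtimer_init(&my_timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS);
            my_timer.function = my_timer_fn;
            hrtimer_start(&my_timer, ktime_add_ns(ktime_get_raw(), ns),
                          HRTIMER_MODE_ABS);
    }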
Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..a6d64af5e73f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, };
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..a125f222fee2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,12 +90,18 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_RAW, + .clockid = CLOCK_MONOTONIC_RAW, + .get_time = &ktime_get_raw, + }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -1268,7 +1274,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - basenow = ktime_add(now, base->offset); + if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) + basenow = ktime_get_raw(); + else + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; -- cgit v1.2.1
From 9006a01829a50cfd6bbd4980910ed46e895e93d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: hrtimer: Catch illegal clockids It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. This patch does exactly that, mapping illegal clockids to an illegal base index, and panicking when we detect the illegal condition. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a125f222fee2..cb0fe70f3c51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -99,6 +99,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ...
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, @@ -108,7 +111,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* -- cgit v1.2.1 From a6e707ddbdf150bd1c2a5c0eccc55abdc62a0039 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:10 +0000 Subject: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW In order to avoid NTP messing with the guest timer behind our back, use the new and improved monotonic raw version of the hrtimers. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-4-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- virt/kvm/arm/arch_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 69bca185c471..97c58153f923 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit v1.2.1 From bbf66d897adf2bb0c310db96c97e8db6369f39e1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jan 2016 18:31:53 +0100 Subject: clocksource: Allow unregistering the watchdog Hyper-V vmbus module registers TSC page clocksource when loaded. This is the clocksource with the highest rating and thus it becomes the watchdog making unloading of the vmbus module impossible. Separate clocksource_select_watchdog() from clocksource_enqueue_watchdog() and use it on clocksource register/rating change/unregister. After all, lobotomized monkeys may need some love too. Signed-off-by: Vitaly Kuznetsov Cc: John Stultz Cc: Dexuan Cui Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1453483913-25672-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 52 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. 
*/ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ -- cgit v1.2.1 From c31b34255b48d1a169693c9c70c49ad6418cfd20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:19 -0800 Subject: selftests/x86: Extend Makefile to allow 64-bit-only tests Previously the Makefile supported 32-bit-only tests and tests that were 32-bit and 64-bit. This adds the support for tests that are only built as 64-bit binaries. There aren't any yet, but there might be a few some day. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/99789bfe65706e6df32cc7e13f656e8c9fa92031.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d0c473f65850..9c81f263a396 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -11,8 +11,9 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall @@ -40,7 +41,7 @@ clean: $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl # x86_64 users should be encouraged to install 32-bit libraries -- cgit v1.2.1 From e21d50f3864e2a8995f5d2a41dea3f0fa07758b4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:20 -0800 Subject: selftests/x86: Add check_initial_reg_state() This checks that ELF binaries are started with an appropriately blank register state. ( There's currently a nasty special case in the entry asm to arrange for this. I'm planning on removing the special case, and this will help make sure I don't break it. ) Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ef54f8d066b30a3eb36bbf26300eebb242185700.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 9 +- .../selftests/x86/check_initial_reg_state.c | 109 +++++++++++++++++++++ 2 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/check_initial_reg_state.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 9c81f263a396..df4f767f48da 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,8 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ + check_initial_reg_state TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ ldt_gdt \ @@ -66,3 +67,9 @@ endif sysret_ss_attrs_64: thunks.S ptrace_syscall_32: raw_syscall_helper_32.S test_syscall_vdso_32: thunks_32.S + +# check_initial_reg_state is special: it needs a custom entry, and it +# needs to be static so that its interpreter doesn't destroy its initial +# state. 
+check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c new file mode 100644 index 000000000000..6aaed9b85baf --- /dev/null +++ b/tools/testing/selftests/x86/check_initial_reg_state.c @@ -0,0 +1,109 @@ +/* + * check_initial_reg_state.c - check that execve sets the correct state + * Copyright (c) 2014-2016 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define _GNU_SOURCE + +#include + +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags; +unsigned long r8, r9, r10, r11, r12, r13, r14, r15; + +asm ( + ".pushsection .text\n\t" + ".type real_start, @function\n\t" + ".global real_start\n\t" + "real_start:\n\t" +#ifdef __x86_64__ + "mov %rax, ax\n\t" + "mov %rbx, bx\n\t" + "mov %rcx, cx\n\t" + "mov %rdx, dx\n\t" + "mov %rsi, si\n\t" + "mov %rdi, di\n\t" + "mov %rbp, bp\n\t" + "mov %rsp, sp\n\t" + "mov %r8, r8\n\t" + "mov %r9, r9\n\t" + "mov %r10, r10\n\t" + "mov %r11, r11\n\t" + "mov %r12, r12\n\t" + "mov %r13, r13\n\t" + "mov %r14, r14\n\t" + "mov %r15, r15\n\t" + "pushfq\n\t" + "popq flags\n\t" +#else + "mov %eax, ax\n\t" + "mov %ebx, bx\n\t" + "mov %ecx, cx\n\t" + "mov %edx, dx\n\t" + "mov %esi, si\n\t" + "mov %edi, di\n\t" + "mov %ebp, bp\n\t" + "mov %esp, sp\n\t" + "pushfl\n\t" + "popl flags\n\t" +#endif + "jmp _start\n\t" + ".size real_start, . - real_start\n\t" + ".popsection"); + +int main() +{ + int nerrs = 0; + + if (sp == 0) { + printf("[FAIL]\tTest was built incorrectly\n"); + return 1; + } + + if (ax || bx || cx || dx || si || di || bp +#ifdef __x86_64__ + || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15 +#endif + ) { + printf("[FAIL]\tAll GPRs except SP should be 0\n"); +#define SHOW(x) printf("\t" #x " = 0x%lx\n", x); + SHOW(ax); + SHOW(bx); + SHOW(cx); + SHOW(dx); + SHOW(si); + SHOW(di); + SHOW(bp); + SHOW(sp); +#ifdef __x86_64__ + SHOW(r8); + SHOW(r9); + SHOW(r10); + SHOW(r11); + SHOW(r12); + SHOW(r13); + SHOW(r14); + SHOW(r15); +#endif + nerrs++; + } else { + printf("[OK]\tAll GPRs except SP are 0\n"); + } + + if (flags != 0x202) { + printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags); + nerrs++; + } else { + printf("[OK]\tFLAGS is 0x202\n"); + } + + return nerrs ? 1 : 0; +} -- cgit v1.2.1 From fba324744bfd2a7948a7710d7a021d76dafb9b67 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:21 -0800 Subject: x86/syscalls: Refactor syscalltbl.sh This splits out the code to emit a syscall line. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1bfcbba991f5cfaa9291ff950a593daa972a205f.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscalltbl.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec071e7..167965ee742e 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,13 +3,21 @@ in="$1" out="$2" +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi +} + grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" - fi + emit "$abi" "$nr" "$entry" "$compat" done ) > "$out" -- cgit v1.2.1 From 32324ce15ea8cb4c8acc28acb2fd36fabf73e9db Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:22 -0800 Subject: x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32 The common/64/x32 distinction has no effect other than determining which kernels actually support the syscall. Move the logic into syscalltbl.sh. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/58d4a95f40e43b894f93288b4a3633963d0ee22e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_64.c | 8 -------- arch/x86/entry/syscalls/syscalltbl.sh | 17 ++++++++++++++++- arch/x86/kernel/asm-offsets_64.c | 6 ------ arch/x86/um/sys_call_table_64.c | 3 --- arch/x86/um/user-offsets.c | 2 -- 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d22be7a..974fd89ac806 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,14 +6,6 @@ #include #include -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 167965ee742e..5ebeaf1041e7 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -18,6 +18,21 @@ emit() { grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" + if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" == "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. 
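+		# (For example, the existing table line
+		#     520 x32 execve stub_x32_execve
+		# has no compat column, so it is emitted as
+		#     __SYSCALL_64(520, stub_x32_execve, stub_x32_execve)
+		# bracketed by the CONFIG_X86_X32_ABI guard generated just below.)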
+ echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" == "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 + fi done ) > "$out" diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb5f24e..29db3b3f550c 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,12 +5,6 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index b74ea6c2c0e7..71a497cde921 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,9 +35,6 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ce7e3607a870..5edf4f4bbf53 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,8 +15,6 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit v1.2.1 From 3e65654e3db6df6aba9c5b895f8b8e6a8d8eb508 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:23 -0800 Subject: x86/syscalls: Move compat syscall entry handling into syscalltbl.sh Rather than duplicating the compat entry handling in all consumers of syscalls_BITS.h, handle it directly in syscalltbl.sh. Now we generate entries in syscalls_32.h like: __SYSCALL_I386(5, sys_open) __SYSCALL_I386(5, compat_sys_open) and all of its consumers implicitly get the right entry point. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7c2b501dc0e6e43050e916b95807c3e2e16e9bb.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 10 ++-------- arch/x86/entry/syscall_64.c | 4 ++-- arch/x86/entry/syscalls/syscalltbl.sh | 22 ++++++++++++++++++---- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 4 ++-- arch/x86/um/sys_call_table_32.c | 4 ++-- arch/x86/um/sys_call_table_64.c | 4 ++-- arch/x86/um/user-offsets.c | 4 ++-- 8 files changed, 31 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a6649857106..3e2829759da2 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,17 +6,11 @@ #include #include -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), +#define __SYSCALL_I386(nr, sym) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 974fd89ac806..3781989b180e 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, +#define __SYSCALL_64(nr, sym) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 5ebeaf1041e7..b81479c8c5fb 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -8,10 +8,24 @@ emit() { nr="$2" entry="$3" compat="$4" - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" + + if [ "$abi" == "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + echo "#else" + echo "__SYSCALL_${abi}($nr, $compat)" + echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce39025f467..abec4c9f1c97 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; diff --git 
a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 29db3b3f550c..9677bf9a616f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 439c0994b696..d4669a679fd0 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 71a497cde921..6ee5268beb05 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_64(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4bbf53..6c9a9c1eae32 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls[] = { #include }; -- cgit v1.2.1 From cfcbadb49dabb05efa23e1a0f95f3391c0a815bc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:24 -0800 Subject: x86/syscalls: Add syscall entry qualifiers This will let us specify something like 'sys_xyz/foo' instead of 'sys_xyz' in the syscall table, where the 'foo' qualifier conveys some extra information to the C code. The intent is to allow things like sys_execve/ptregs to indicate that sys_execve() touches pt_regs. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2de06e33dce62556b3ec662006fcb295504e296e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscall_32.c | 4 ++-- arch/x86/entry/syscall_64.c | 4 ++-- arch/x86/entry/syscalls/syscalltbl.sh | 19 ++++++++++++++++--- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 4 ++-- arch/x86/um/sys_call_table_32.c | 4 ++-- arch/x86/um/sys_call_table_64.c | 4 ++-- arch/x86/um/user-offsets.c | 4 ++-- 8 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3e2829759da2..8f895ee13a1c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [nr] = sym, +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 3781989b180e..a1d408772ae6 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index b81479c8c5fb..cd3d3015d7df 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,6 +3,19 @@ in="$1" out="$2" +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + qualifier="${entry:${#real_entry}}" # Strip the function name + qualifier="${qualifier:1}" # Strip the slash, if any + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + emit() { abi="$1" nr="$2" @@ -16,15 +29,15 @@ emit() { if [ -z "$compat" ]; then if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi else echo "#ifdef CONFIG_X86_32" if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi echo "#else" - echo "__SYSCALL_${abi}($nr, $compat)" + syscall_macro "$abi" "$nr" "$compat" echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index abec4c9f1c97..fdeb0ce07c16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; diff 
--git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9677bf9a616f..d875f97d4e0b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d4669a679fd0..bfce503dffae 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6ee5268beb05..f306413d3eb6 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 6c9a9c1eae32..470564bbd08e 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; -- cgit v1.2.1 From 302f5b260c322696cbeb962a263a4d2d99864aed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:25 -0800 Subject: x86/entry/64: Always run ptregs-using syscalls on the slow path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64-bit syscalls currently have an optimization in which they are called with partial pt_regs. A small handful require full pt_regs. In the 32-bit and compat cases, I cleaned this up by forcing full pt_regs for all syscalls. The performance hit doesn't really matter as the affected system calls are fundamentally heavy and this is the 32-bit compat case. I want to clean up the 64-bit case as well, but I don't want to hurt fast path performance. To do that, I want to force the syscalls that use pt_regs onto the slow path. This will enable us to make slow path syscalls be real ABI-compliant C functions. 
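( To make that goal concrete: once the conversion in this series is done, a slow-path syscall can be dispatched from plain C, roughly as the do_syscall_64 code added later in this series does it:

	regs->ax = sys_call_table[nr & __SYSCALL_MASK](
			regs->di, regs->si, regs->dx,
			regs->r10, regs->r8, regs->r9);

with no hand-written per-syscall register shuffling in asm. )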
Use the new syscall entry qualification machinery for this. 'stub_clone' is now 'stub_clone/ptregs'. The next patch will eliminate the stubs, and we'll just have 'sys_clone/ptregs'. As of this patch, two-phase entry tracing is no longer used. It has served its purpose (namely a huge speedup on some workloads prior to more general opportunistic SYSRET support), and once the dust settles I'll send patches to back it out. The implementation is heavily based on a patch from Brian Gerst: http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com Originally-From: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Linux Kernel Mailing List Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 56 +++++++++++++++++++++++++--------- arch/x86/entry/syscall_64.c | 7 +++-- arch/x86/entry/syscalls/syscall_64.tbl | 16 +++++----- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9d34d3cfceb6..f1c8f150728e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -182,7 +182,15 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx + + /* + * This call instruction is handled specially in stub_ptregs_64. + * It might end up jumping to the slow path. If it jumps, RAX is + * clobbered. + */ call *sys_call_table(, %rax, 8) +.Lentry_SYSCALL_64_after_fastpath_call: + movq %rax, RAX(%rsp) 1: /* @@ -235,25 +243,13 @@ GLOBAL(int_ret_from_sys_call_irqs_off) /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ - -tracesys_phase2: SAVE_EXTRA_REGS movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax, %rdx - call syscall_trace_enter_phase2 + call syscall_trace_enter /* * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned + * We don't reload %rax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX @@ -355,6 +351,38 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) +ENTRY(stub_ptregs_64) + /* + * Syscalls marked as needing ptregs land here. + * If we are on the fast path, we need to save the extra regs. + * If we are on the slow path, the extra regs are already saved. + * + * RAX stores a pointer to the C function implementing the syscall. 
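+	 *
+	 * How the check works: if we came from the fast path, the CALL
+	 * into the syscall table pushed
+	 * .Lentry_SYSCALL_64_after_fastpath_call as the return address,
+	 * so it is sitting at (%rsp) for the cmpq below to find.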
+ */ + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) + jne 1f + + /* Called from fast path -- pop return address and jump to slow path */ + popq %rax + jmp tracesys /* called from fast path */ + +1: + /* Called from C */ + jmp *%rax /* called from C */ +END(stub_ptregs_64) + +.macro ptregs_stub func +ENTRY(ptregs_\func) + leaq \func(%rip), %rax + jmp stub_ptregs_64 +END(ptregs_\func) +.endm + +/* Instantiate ptregs_stub for each ptregs-using syscall */ +#define __SYSCALL_64_QUAL_(sym) +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +#include .macro FORK_LIKE func ENTRY(stub_\func) diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index a1d408772ae6..9dbc5abb6162 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,14 @@ #include #include -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64_QUAL_(sym) sym +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym + +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a50bdc..5de342a729d0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn +15 64 rt_sigreturn stub_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve +56 common clone stub_clone/ptregs +57 common fork stub_fork/ptregs +58 common vfork stub_vfork/ptregs +59 64 execve stub_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat +322 64 execveat stub_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -346,7 +346,7 @@ 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve +520 x32 execve stub_x32_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat +545 x32 execveat stub_x32_execveat/ptregs -- cgit v1.2.1 From 46eabf06c04a6847a694a0c1413d4ac57e5b058a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:26 -0800 Subject: x86/entry/64: Call 
all native slow-path syscalls with full pt-regs This removes all of the remaining asm syscall stubs except for stub_ptregs_64. Entries in the main syscall table are now all callable from C. The resulting asm is every bit as ridiculous as it looks. The next few patches will clean it up. This patch is here to let reviewers rest their brains and for bisection. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6b3801be0d505d50aefabda02d3b93efbfc9c73.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 79 +--------------------------------- arch/x86/entry/syscalls/syscall_64.tbl | 18 ++++---- 2 files changed, 10 insertions(+), 87 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f1c8f150728e..f7050a5d9dbc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -253,7 +253,6 @@ tracesys: * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -264,6 +263,7 @@ tracesys: movq %r10, %rcx /* fixup for C */ call *sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) + RESTORE_EXTRA_REGS 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -384,83 +384,6 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) - .align 8 -GLOBAL(stub_x32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. - */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - /* * A newly forked process directly context switches into this address. 
* diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5de342a729d0..dcf107ce2cd4 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn/ptregs +15 64 rt_sigreturn sys_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone/ptregs -57 common fork stub_fork/ptregs -58 common vfork stub_vfork/ptregs -59 64 execve stub_execve/ptregs +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat/ptregs +322 64 execveat sys_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -339,14 +339,14 @@ # for native 64-bit operation. # 512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn +513 x32 rt_sigreturn sys32_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv 516 x32 writev compat_sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve/ptregs +520 x32 execve compat_sys_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat/ptregs +545 x32 execveat compat_sys_execveat/ptregs -- cgit v1.2.1 From 24d978b76ffd20ecff8a8d1c21b16fe740f8b119 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:27 -0800 Subject: x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork ret_from_fork is now open-coded and is no longer tangled up with the syscall code. This isn't so bad -- this adds very little code, and IMO the result is much easier to understand. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0747e2a5e47084655a1e96351c545b755c41fa7.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7050a5d9dbc..cb5d940a7abd 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -390,7 +390,6 @@ END(ptregs_\func) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) pushq $0x0002 @@ -398,28 +397,32 @@ ENTRY(ret_from_fork) call schedule_tail /* rdi: 'prev' task parameter */ - RESTORE_EXTRA_REGS - testb $3, CS(%rsp) /* from kernel_thread? 
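(CS & 3 is the RPL of the saved code segment: it is 0 only for a
kernel thread, so returns to user mode take the jnz to label 1 below)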
*/ + jnz 1f /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the 32-bit compat paths. - * Use IRET code path to return, since it can safely handle - * all of the above. + * We came from kernel_thread. This code path is quite twisted, and + * someone should clean it up. + * + * copy_thread_tls stashes the function pointer in RBX and the + * parameter to be passed in RBP. The called function is permitted + * to call do_execve and thereby jump to user mode. */ - jnz int_ret_from_sys_call + movq RBP(%rsp), %rdi + call *RBX(%rsp) + movl $0, RAX(%rsp) /* - * We came from kernel_thread - * nb: we depend on RESTORE_EXTRA_REGS above + * Fall through as though we're exiting a syscall. This makes a + * twisted sort of sense if we just called do_execve. */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + +1: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS + jmp restore_regs_and_iret END(ret_from_fork) /* -- cgit v1.2.1 From 1e423bff959e48166f5b7efca01fdb0dbdf05846 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:28 -0800 Subject: x86/entry/64: Migrate the 64-bit syscall slow path to C This is more complicated than the 32-bit and compat cases because it preserves an asm fast path for the case where the callee-saved regs aren't needed in pt_regs and no entry or exit work needs to be done. This appears to slow down fastpath syscalls by no more than one cycle on my Skylake laptop. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 26 +++++++++++ arch/x86/entry/entry_64.S | 117 ++++++++++++++++------------------------------ 2 files changed, 65 insertions(+), 78 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03663740c866..75175f92f462 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) prepare_exit_to_usermode(regs); } +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long nr = regs->orig_ax; + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { + regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } + + syscall_return_slowpath(regs); +} +#endif + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. 
Called with IRQs on and does all entry and diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cb5d940a7abd..567aa522ac0a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + TRACE_IRQS_OFF + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. - * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ @@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. + */ + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz entry_SYSCALL64_slow_path + entry_SYSCALL_64_fastpath: + /* + * Easy case: enable interrupts and issue the syscall. If the syscall + * needs pt_regs, we'll call a stub that disables interrupts again + * and jumps to the slow path. + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -193,88 +199,43 @@ entry_SYSCALL_64_fastpath: movq %rax, RAX(%rsp) 1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. + * If we get here, then we know that pt_regs is clean for SYSRET64. + * If we see that no exit work is required (which we are required + * to check with IRQs off), then we can go straight to SYSRET64. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + jnz 1f - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + RESTORE_C_REGS movq RSP(%rsp), %rsp - /* - * 64-bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) 
We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ USERGS_SYSRET64 -GLOBAL(int_ret_from_sys_call_irqs_off) +1: + /* + * The fast path looked good when we started, but something changed + * along the way and we need to switch to the slow path. Calling + * raise(3) will trigger this, for example. IRQs are off. + */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - jmp int_ret_from_sys_call - - /* Do syscall entry tracing */ -tracesys: SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_trace_enter - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx /* fixup for C */ - call *sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) - RESTORE_EXTRA_REGS -1: - /* Use IRET because user could have changed pt_regs->foo */ + call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) +entry_SYSCALL64_slow_path: + /* IRQs are off. */ SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call do_syscall_64 /* returns with IRQs disabled */ + +return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -364,7 +325,7 @@ ENTRY(stub_ptregs_64) /* Called from fast path -- pop return address and jump to slow path */ popq %rax - jmp tracesys /* called from fast path */ + jmp entry_SYSCALL64_slow_path /* called from fast path */ 1: /* Called from C */ -- cgit v1.2.1 From 5e9ebbd87a99ecc6abb74325b0ac63c46891f6f3 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Sat, 30 Jan 2016 14:01:12 +0600 Subject: x86/boot: Micro-optimize reset_early_page_tables() Save 25 bytes of code and make the bootup a tiny bit faster: text data bss dec filename 9735144 4970776 15474688 30180608 vmlinux.old 9735119 4970776 15474688 30180583 vmlinux Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454140872-16926-1-git-send-email-kuleshovmail@gmail.com [ Fixed various small details. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f129a9af6357..35843caa4ea7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit v1.2.1 From cd4d09ec6f6c12a2cc3db5b7d8876a325a53545b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:04 +0100 Subject: x86/cpufeature: Carve out X86_FEATURE_* Move them to a separate header and have the following dependency: x86/cpufeatures.h <- x86/processor.h <- x86/cpufeature.h This makes it easier to use the header in asm code and not include the whole cpufeature.h and add guards for asm. Suggested-by: H. 
Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 +- arch/x86/boot/cpuflags.h | 2 +- arch/x86/boot/mkcpustr.c | 2 +- arch/x86/crypto/crc32-pclmul_glue.c | 2 +- arch/x86/crypto/crc32c-intel_glue.c | 2 +- arch/x86/crypto/crct10dif-pclmul_glue.c | 2 +- arch/x86/entry/common.c | 1 + arch/x86/entry/entry_32.S | 2 +- arch/x86/entry/vdso/vdso32-setup.c | 1 - arch/x86/entry/vdso/vdso32/system_call.S | 2 +- arch/x86/entry/vdso/vma.c | 1 + arch/x86/include/asm/alternative.h | 6 - arch/x86/include/asm/apic.h | 1 - arch/x86/include/asm/arch_hweight.h | 2 + arch/x86/include/asm/cmpxchg.h | 1 + arch/x86/include/asm/cpufeature.h | 284 +----------------------------- arch/x86/include/asm/cpufeatures.h | 288 +++++++++++++++++++++++++++++++ arch/x86/include/asm/fpu/internal.h | 1 + arch/x86/include/asm/irq_work.h | 2 +- arch/x86/include/asm/mwait.h | 2 + arch/x86/include/asm/processor.h | 3 +- arch/x86/include/asm/smap.h | 2 +- arch/x86/include/asm/smp.h | 1 - arch/x86/include/asm/thread_info.h | 2 +- arch/x86/include/asm/tlbflush.h | 1 + arch/x86/include/asm/uaccess_64.h | 2 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/cyrix.c | 1 + arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/match.c | 2 +- arch/x86/kernel/cpu/mkcapflags.sh | 6 +- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/transmeta.c | 2 +- arch/x86/kernel/e820.c | 1 + arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/hpet.c | 1 + arch/x86/kernel/msr.c | 2 +- arch/x86/kernel/verify_cpu.S | 2 +- arch/x86/lib/clear_page_64.S | 2 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/copy_user_64.S | 2 +- arch/x86/lib/memcpy_64.S | 2 +- arch/x86/lib/memmove_64.S | 2 +- arch/x86/lib/memset_64.S | 2 +- arch/x86/mm/setup_nx.c | 1 + arch/x86/oprofile/op_model_amd.c | 1 - arch/x86/um/asm/barrier.h | 2 +- lib/atomic64_test.c | 2 +- 50 files changed, 336 insertions(+), 328 deletions(-) create mode 100644 arch/x86/include/asm/cpufeatures.h diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 87d40a72f6a1..c0c62532150d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel. See - arch/x86/include/asm/cpufeature.h for the valid bit + arch/x86/include/asm/cpufeatures.h for the valid bit numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. 
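( To sketch the layering this establishes -- an illustration, not part of the diff: cpufeatures.h ends up holding only preprocessor constants, so .S files can pull it in directly, while C code keeps going through cpufeature.h:

	/* C code: gets the feature-testing machinery plus the bit numbers */
	#include <asm/cpufeature.h>	/* -> processor.h -> cpufeatures.h */

	/* asm code: just the X86_FEATURE_* bit numbers, no C declarations */
	#include <asm/cpufeatures.h>
)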
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index ea97697e51e4..4cb404fd45ce 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,7 +1,7 @@ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H -#include +#include #include struct cpu_features { diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 637097e66a62..f72498dc90d2 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,7 +17,7 @@ #include "../include/asm/required-features.h" #include "../include/asm/disabled-features.h" -#include "../include/asm/cpufeature.h" +#include "../include/asm/cpufeatures.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 07d2c6c86a54..27226df3f7d8 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0e9871693f24..0857b1a1de3b 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index a3fcfc97a311..cd4df9322501 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 75175f92f462..c6ab2ebb5f4f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 77d8c5112900..4c5228352744 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a9ae4b..7853b53959cd 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d9297074b..0109ac6cb79c 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,7 +3,7 @@ */ #include -#include +#include #include /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7c912fefe79b..429d54d01b38 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85bbb8ff..99afb665a004 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -151,12 +151,6 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ ".popsection" -/* - * This must be included *after* the definition of ALTERNATIVE due to - * - */ -#include - /* * Alternative instructions for different CPU types or capabilities. 
* diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6f3da2..0899cfc8dfe8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1ef709..02e799fa43d1 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H +#include + #ifdef CONFIG_64BIT /* popcnt %edi, %eax -- redundant REX prefix for alignment */ #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index ad19841eddfe..9733361fed6f 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -2,6 +2,7 @@ #define ASM_X86_CMPXCHG_H #include +#include #include /* Provides LOCK_PREFIX */ /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bbf166e805be..3cce9f3c5cb1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,289 +1,7 @@ -/* - * Defines x86 CPU feature bits - */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - -#define NCAPINTS 16 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ - -/* - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. - */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" 
Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - -/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ -/* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ - -/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - -/* Other features, Linux-defined mapping, word 3 */ -/* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* 
TSC doesn't stop in S3 state */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ 
-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - -/* - * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7. - * - * Reuse free bits when adding new feature flags! - */ - -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ - -/* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* 
CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ - -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ - -/* - * BUG word(s) - */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) - -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100644 index 000000000000..0ceb6adc8a48 --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,288 @@ +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +#ifndef _ASM_X86_REQUIRED_FEATURES_H +#include +#endif + +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include +#endif + +/* + * Defines x86 CPU feature bits + */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. 
+ */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 
operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ + +/* Extended state features, CPUID level 
0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440df63f1..d01199def781 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * High level FPU state handling functions: diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 78162f8e248b..d0afb05c84fc 100644 --- 
a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -1,7 +1,7 @@ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H -#include +#include static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index c70689b5e5aa..0deeb2d26df7 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -3,6 +3,8 @@ #include +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50cb61a2..491a3d9dbb15 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -13,7 +13,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include @@ -24,7 +24,6 @@ struct vm86; #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ba665ebd17bb..db333300bd4b 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -15,7 +15,7 @@ #include #include -#include +#include /* "Raw" instruction opcodes */ #define __ASM_CLAC .byte 0x0f,0x01,0xca diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dfcf0727623b..20a3de5cb3b0 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,6 @@ #endif #include #include -#include extern int smp_num_siblings; extern unsigned int num_processors; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..c0778fcab06d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -#include +#include #include struct thread_info { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..0bb31cb8c73b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c4019b..307698688fa1 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 58031303e304..faa7b5204129 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -64,7 +64,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6e483c..6608c03c2126 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e79637..15e47c1cd412 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648bc1a0a..9299e3bdfad6 100644 --- 
a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c52388cf4..341449c49f34 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d487ea..fbb5e90557a5 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710a5b23..6988c74409a8 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include " + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149ee91c..74f1d90f9c29 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7aceca6..a19a663282b5 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4f96fe..b3c2a697820a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The e820 map is the map that gets modified e.g. with command line parameters diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae24b6d2..af1112980dd4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5cd5d0..be0ebbb6d1d1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616f93f1..7f3550acde1b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include static struct class *msr_class; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35ee4bc..014ea59aa153 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. 
*/ -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include verify_cpu:
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a2fe51b00cce..65be7cfaf947 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,5 @@ #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include /*
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 009f98216b7e..24ef1c2104d4 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,7 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include /*
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 982ce34f4a9b..fba343062055 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -10,7 +10,7 @@ #include #include #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include #include #include
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bba87de..a0de849435ad 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include /*
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ca2afdd6d98e..90ce01bee00c 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,7 @@ * - Copyright 2011 Fenghua Yu */ #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include #undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad05827..c9c81227ea37 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include .weak memset
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eacb3321..f65a33f505b6 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include #include +#include static int disable_nx;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 50d86c0e9ba4..660a83c8287b 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h"
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 174781a404ff..00c319048d52 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -3,7 +3,7 @@ #include #include -#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> #include #include
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index d62de8bf022d..123481814320 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -17,7 +17,7 @@ #include #ifdef CONFIG_X86 -#include /* for boot_cpu_has below */ +#include /* for boot_cpu_has below */ #endif #define TEST(bit, op, c_op, val) \ -- cgit v1.2.1
From bc696ca05f5a8927329ec276a892341e006b00ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:05 +0100 Subject: x86/cpufeature: Replace the old static_cpu_has() with safe variant
So the old one didn't work properly before alternatives had run. And it was supposed to provide an optimized JMP because the assumption was that the jump offset fits in a signed byte, allowing a two-byte JMP.
So I did an x86_64 allyesconfig build and dumped all possible sites where static_cpu_has() was used. The optimization amounted to all of 12(!) places where static_cpu_has() had generated a 2-byte JMP, which saved us a whopping 36 bytes!
This clearly is not worth the trouble, so we can remove it.
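For reference, a typical call site looks like this (a hypothetical sketch, not a site from this series; the helpers are made up):

	/*
	 * static_cpu_has() compiles to a jump that alternatives later
	 * patch according to the boot CPU's capabilities, so the
	 * feature test itself costs no memory access at runtime.
	 */
	if (static_cpu_has(X86_FEATURE_AVX2))
		avx2_path();		/* hypothetical helper */
	else
		generic_path();		/* hypothetical helper */

Each site of this kind is where the two-byte JMP could, in principle, have been emitted.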
The only place where the optimization might count - in __switch_to() - we will handle differently. But that's not subject of this patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 10 ---- arch/x86/include/asm/cpufeature.h | 100 +++-------------------------------- arch/x86/include/asm/fpu/internal.h | 14 ++--- arch/x86/kernel/apic/apic_numachip.c | 4 +- arch/x86/kernel/cpu/common.c | 12 +---- arch/x86/kernel/vm86_32.c | 2 +- drivers/cpufreq/intel_pstate.c | 2 +- fs/btrfs/disk-io.c | 2 +- 8 files changed, 21 insertions(+), 125 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed97a8a2..68a2d1f0a683 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -350,16 +350,6 @@ config DEBUG_IMR_SELFTEST If unsure say N here. -config X86_DEBUG_STATIC_CPU_HAS - bool "Debug alternatives" - depends on DEBUG_KERNEL - ---help--- - This option causes additional code to be generated which - fails if static_cpu_has() is used before alternatives have - run. - - If unsure, say N. - config X86_DEBUG_FPU bool "Debug the x86 FPU code" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3cce9f3c5cb1..a261cf2e7907 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,103 +125,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* - * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! */ #if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern void warn_pre_alternatives(void); -extern bool __static_cpu_has_safe(u16 bit); +extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically * patch the target code for additional performance. */ -static __always_inline __pure bool __static_cpu_has(u16 bit) -{ -#ifdef CC_HAVE_ASM_GOTO - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - - /* - * Catch too early usage of this before alternatives - * have run. 
- */ - asm_volatile_goto("1: jmp %l[t_warn]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* 1: do replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (X86_FEATURE_ALWAYS) : : t_warn); - -#endif - - asm_volatile_goto("1: jmp %l[t_no]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (bit) : : t_no); - return true; - t_no: - return false; - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - t_warn: - warn_pre_alternatives(); - return false; -#endif - -#else /* CC_HAVE_ASM_GOTO */ - - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $0,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 3f - .\n" - " .word %P1\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $1,%0\n" - "4:\n" - ".previous\n" - : "=qm" (flag) : "i" (bit)); - return flag; - -#endif /* CC_HAVE_ASM_GOTO */ -} - -#define static_cpu_has(bit) \ -( \ - __builtin_constant_p(boot_cpu_has(bit)) ? \ - boot_cpu_has(bit) : \ - __builtin_constant_p(bit) ? \ - __static_cpu_has(bit) : \ - boot_cpu_has(bit) \ -) - -static __always_inline __pure bool _static_cpu_has_safe(u16 bit) +static __always_inline __pure bool _static_cpu_has(u16 bit) { #ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" @@ -255,7 +171,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) t_no: return false; t_dynamic: - return __static_cpu_has_safe(bit); + return __static_cpu_has(bit); #else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ @@ -293,22 +209,21 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has_safe(bit) : flag); + return (flag == 2 ? __static_cpu_has(bit) : flag); #endif /* CC_HAVE_ASM_GOTO */ } -#define static_cpu_has_safe(bit) \ +#define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - _static_cpu_has_safe(bit) \ + _static_cpu_has(bit) \ ) #else /* * gcc 3.x is too stupid to do the static test; fall back to dynamic. 
*/ #define static_cpu_has(bit) boot_cpu_has(bit) -#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) @@ -316,7 +231,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d01199def781..c2e46eb96b6d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void); */ static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); + return static_cpu_has(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVE); + return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has_safe(X86_FEATURE_FXSR); + return static_cpu_has(X86_FEATURE_FXSR); } /* @@ -301,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -323,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -461,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. 
*/ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c6ec49..ab5c2c685a3c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..ee499817f3f5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,19 +1475,11 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) +inline bool __static_cpu_has(u16 bit) { return boot_cpu_has(bit); } -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); +EXPORT_SYMBOL_GPL(__static_cpu_has); static void bsp_resume(void) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b8546518..3dce1ca0a653 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d477e32d..3a4b39afc0ab 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void) if (!all_cpu_data) return -ENOMEM; - if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { + if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) { pr_info("intel_pstate: HWP enabled\n"); hwp_active++; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd08e29f5117..d9286497924f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -930,7 +930,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; -- cgit v1.2.1 From a362bf9f5e7dd659b96d01382da7b855f4e5a7a1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 27 Jan 2016 09:43:25 +0100 Subject: x86/cpufeature: Get rid of the non-asm goto variant I can simply quote hpa from the mail: "Get rid of the non-asm goto variant and just fall back to dynamic if asm goto is unavailable. It doesn't make any sense, really, if it is supposed to be safe, and by now the asm goto-capable gcc is in more wide use. 
(Originally the gcc 3.x fallback to pure dynamic didn't exist, either.)" Boy, am I lazy. Clean up the whole CC_HAVE_ASM_GOTO ifdeffery too, while at it.
Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160127084325.GB30712@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 49 ++++----------------------------------- 1 file changed, 5 insertions(+), 44 deletions(-)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a261cf2e7907..9048c1bbc519 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -129,17 +129,16 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * fast paths and boot_cpu_has() otherwise! */ -#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). - * These are only valid after alternatives have run, but will statically - * patch the target code for additional performance. + * These will statically patch the target code for additional + * performance. */ static __always_inline __pure bool _static_cpu_has(u16 bit) { -#ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " @@ -172,45 +171,6 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) return false; t_dynamic: return __static_cpu_has(bit); -#else - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $2,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ - " .word %P2\n" /* always replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $0,%0\n" - "4:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 5f - .\n" /* repl offset */ - " .word %P1\n" /* feature bit */ - " .byte 4b - 3b\n" /* src len */ - " .byte 6f - 5f\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "5: movb $1,%0\n" - "6:\n" - ".previous\n" - : "=qm" (flag) - : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has(bit) : flag); -#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -221,7 +181,8 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) ) #else /* - * gcc 3.x is too stupid to do the static test; fall back to dynamic. + * Fall back to dynamic for gcc versions which don't support asm goto. Should be + * a minority now anyway. */ #define static_cpu_has(bit) boot_cpu_has(bit) #endif -- cgit v1.2.1
From 337e4cc84021212a87b04b77b65cccc49304909e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:07 +0100 Subject: x86/alternatives: Add an auxiliary section
Add .altinstr_aux for additional instructions which will be used before and/or during patching.
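A condensed sketch of the intended pattern (alternatives patching metadata omitted for brevity; the full version is adopted by static_cpu_has() two patches below):

	/*
	 * Until alternatives run, the JMP at 1: reaches the dynamic
	 * test parked in .altinstr_aux; once the JMP is patched, the
	 * aux code is unreachable and costs nothing on the hot path.
	 */
	asm_volatile_goto("1: jmp 6f\n"
			  "2:\n"
			  ".section .altinstr_aux,\"ax\"\n"
			  "6: testb %[bitnum], %[cap_byte]\n"
			  "   jnz %l[t_yes]\n"
			  "   jmp %l[t_no]\n"
			  ".previous\n"
			  : : [bitnum] "i" (1 << (bit & 7)),
			      [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
			  : : t_yes, t_no);
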
All stuff which needs more sophisticated patching should go there. See next patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..92dc211c11db 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -195,6 +195,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -- cgit v1.2.1 From 2476f2fa20568bd5d9e09cd35bcd73e99a6f4cc6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 27 Jan 2016 09:45:25 +0100 Subject: x86/alternatives: Discard dynamic check after init Move the code to do the dynamic check to the altinstr_aux section so that it is discarded after alternatives have run and a static branch has been chosen. This way we're changing the dynamic branch from C code to assembly, which makes it *substantially* smaller while avoiding a completely unnecessary call to an out of line function. Signed-off-by: Brian Gerst [ Changed it to do TESTB, as hpa suggested. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kristen Carlson Accardi Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Ross Zwisler Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452972124-7380-1-git-send-email-brgerst@gmail.com Link: http://lkml.kernel.org/r/20160127084525.GC30712@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 19 ++++++++++++------- arch/x86/kernel/cpu/common.c | 6 ------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9048c1bbc519..9fba7a5dd24a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -130,8 +130,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; */ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern bool __static_cpu_has(u16 bit); - /* * Static testing of CPU features. Used the same as boot_cpu_has(). 
* These will statically patch the target code for additional @@ -139,7 +137,7 @@ extern bool __static_cpu_has(u16 bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp %l[t_dynamic]\n" + asm_volatile_goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -164,13 +162,20 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) " .byte 0\n" /* repl len */ " .byte 0\n" /* pad len */ ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS) - : : t_dynamic, t_no); + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); + t_yes: return true; t_no: return false; - t_dynamic: - return __static_cpu_has(bit); } #define static_cpu_has(bit) \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ee499817f3f5..079d83fc6488 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,12 +1475,6 @@ void cpu_init(void) } #endif -inline bool __static_cpu_has(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) -- cgit v1.2.1 From 8c725306993198f845038dc9e45a1267099867a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:09 +0100 Subject: x86/vdso: Use static_cpu_has() ... and simplify and speed up a tad. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 429d54d01b38..10f704584922 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -285,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + if (static_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* -- cgit v1.2.1 From a4733143085d6c782ac1e6c85778655b6bac1d4e Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 22:12:10 +0100 Subject: x86/boot: Simplify kernel load address alignment check We are using %rax as temporary register to check the kernel address alignment. We don't really have to since the TEST instruction does not clobber the destination operand. Suggested-by: Brian Gerst Signed-off-by: Alexander Kuleshov Signed-off-by: Borislav Petkov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453531828-19291-1-git-send-email-kuleshovmail@gmail.com Link: http://lkml.kernel.org/r/1453842730-28463-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..7c21029cb733 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -76,9 +76,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? */ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* -- cgit v1.2.1 From b7765086b7c5a5be029a739c2caa161da51c2076 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:26 -0800 Subject: x86/entry/64: Fix an IRQ state error on ptregs-using syscalls I messed up the IRQ state when jumping off the fast path due to invocation of a ptregs-using syscall. This bug shouldn't have had any impact yet, but it would have caused problems with subsequent context tracking cleanups. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/ab92cd365fb7b0a56869e920017790d96610fdca.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 567aa522ac0a..9f7bb808035e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,8 @@ entry_SYSCALL_64_fastpath: /* * This call instruction is handled specially in stub_ptregs_64. - * It might end up jumping to the slow path. If it jumps, RAX is - * clobbered. + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. */ call *sys_call_table(, %rax, 8) .Lentry_SYSCALL_64_after_fastpath_call: @@ -315,17 +315,24 @@ END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) /* * Syscalls marked as needing ptregs land here. - * If we are on the fast path, we need to save the extra regs. - * If we are on the slow path, the extra regs are already saved. + * If we are on the fast path, we need to save the extra regs, + * which we achieve by trying again on the slow path. If we are on + * the slow path, the extra regs are already saved. * * RAX stores a pointer to the C function implementing the syscall. + * IRQs are on. */ cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) jne 1f - /* Called from fast path -- pop return address and jump to slow path */ + /* + * Called from fast path -- disable IRQs again, pop return address + * and jump to slow path + */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF popq %rax - jmp entry_SYSCALL64_slow_path /* called from fast path */ + jmp entry_SYSCALL64_slow_path 1: /* Called from C */ -- cgit v1.2.1 From eb2a54c3271cb6443ae93ec44a91687b60c559a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:27 -0800 Subject: x86/entry/64: Fix fast-path syscall return register state I was fishing RIP (i.e. RCX) out of pt_regs->cx and RFLAGS (i.e. R11) out of pt_regs->r11. 
While it usually worked (pt_regs started out with CX == IP and R11 == FLAGS), it was very fragile. In particular, it broke sys_iopl() because sys_iopl() forgot to mark itself as using ptregs. Undo that part of the syscall rework. There was no compelling reason to do it this way. While I'm at it, load RCX and R11 before the other regs to be a little friendlier to the CPU, as they will be the first of the reloaded registers to be used. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/a85f8360c397e48186a9bc3e565ad74307a7b011.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f7bb808035e..70eadb0ea5fa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,9 @@ entry_SYSCALL_64_fastpath: LOCKDEP_SYS_EXIT TRACE_IRQS_ON /* user mode is traced as IRQs on */ - RESTORE_C_REGS + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp USERGS_SYSRET64 -- cgit v1.2.1 From bb56968a37a44070de92d5690c4b08dd98a5d3f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:28 -0800 Subject: x86/syscalls/64: Mark sys_iopl() as using ptregs sys_iopl() both reads and writes pt_regs->flags. Mark it as using ptregs. This isn't strictly necessary, as pt_regs->flags is available even in the fast path, but this is very lightweight now that we have syscall qualifiers and it could avoid some pain down the road. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3de0ca692fa8bf414c5e3d7afe3e6195d1a10e1f.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dcf107ce2cd4..2e5b565adacc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl sys_iopl +172 common iopl sys_iopl/ptregs 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module -- cgit v1.2.1 From d99e1bd175f4291ddb6e62b22bb5bdbe3976389a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 25 Jan 2016 20:41:46 +0100 Subject: x86/entry/traps: Refactor preemption and interrupt flag handling Make the preemption and interrupt flag handling more readable by removing preempt_conditional_sti() and preempt_conditional_cli() helpers and using preempt_disable() and preempt_enable_no_resched() instead. Rename contitional_sti() and conditional_cli() to the more understandable cond_local_irq_enable() and cond_local_irq_disable() respectively, while at it. Suggested-by: Borislav Petkov Signed-off-by: Alexander Kuleshov [ Boris: massage text. 
] Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: Wang Nan Link: http://lkml.kernel.org/r/1453750913-4781-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..410e8e2700c5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - preempt_count_dec(); } void ist_enter(struct pt_regs *regs) @@ -286,7 +272,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +354,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, error_code); @@ -443,7 +429,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -517,9 +503,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. 
*/ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -648,12 +636,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -673,7 +663,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: @@ -696,7 +687,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { if (!fixup_exception(regs)) { @@ -743,7 +734,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void @@ -756,7 +747,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +756,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); -- cgit v1.2.1 From 02afeaae9843733a39cd9b11053748b2d1dc5ae7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:38 -0800 Subject: x86/boot: Fix early command-line parsing when matching at end The x86 early command line parsing in cmdline_find_option_bool() is buggy. If it matches a specified 'option' all the way to the end of the command-line, it will consider it a match. For instance, cmdline = "foo"; cmdline_find_option_bool(cmdline, "fool"); will return 1. This is particularly annoying since we have actual FPU options like "noxsave" and "noxsaves" So, command-line "foo bar noxsave" will match *BOTH* a "noxsave" and "noxsaves". (This turns out not to be an actual problem because "noxsave" implies "noxsaves", but it's still confusing.) To fix this, we simplify the code and stop tracking 'len'. 'len' was trying to indicate either the NULL terminator *OR* the end of a non-NULL-terminated command line at 'COMMAND_LINE_SIZE'. But, each of the three states is *already* checking 'cmdline' for a NULL terminator. We _only_ need to check if we have overrun 'COMMAND_LINE_SIZE', and that we can do without keeping 'len' around. Also add some commends to clarify what is going on. 
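Concretely, the fixed word-boundary semantics can be pinned down with a few hedged example calls (illustrative only, not kernel self-tests; return values count positions from 1, as documented in the diff below):

/* Hedged examples of the fixed matching behaviour: */
cmdline_find_option_bool("foo", "fool");		 /* 0 now; pre-fix wrongly 1 */
cmdline_find_option_bool("foo bar noxsave", "noxsaves"); /* 0 now; pre-fix a false hit */
cmdline_find_option_bool("foo bar noxsave", "noxsave");	 /* 9: genuine whole-word match */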
Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225238.9AEB560C@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db000d727..49548bed2301 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -21,12 +21,14 @@ static inline int myisspace(u8 c) * @option: option string to look for * * Returns the position of that @option (starts counting with 1) - * or 0 on not found. + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. */ int cmdline_find_option_bool(const char *cmdline, const char *option) { char c; - int len, pos = 0, wstart = 0; + int pos = 0, wstart = 0; const char *opptr = NULL; enum { st_wordstart = 0, /* Start of word/after whitespace */ @@ -37,11 +39,14 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); - if (!len) + if (!strlen(cmdline)) return 0; - while (len--) { + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < COMMAND_LINE_SIZE) { c = *(char *)cmdline++; pos++; @@ -58,17 +63,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) /* fall through */ case st_wordcmp: - if (!*opptr) + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ if (!c || myisspace(c)) return wstart; else state = st_wordskip; - else if (!c) + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ return 0; - else if (c != *opptr++) + } else if (c != *opptr++) { state = st_wordskip; - else if (!len) /* last word and is matching */ - return wstart; + } break; case st_wordskip: -- cgit v1.2.1 From abcdc1c694fa4055323cbec1cde4c2cb6b68398c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:39 -0800 Subject: x86/boot: Fix early command-line parsing when partial word matches cmdline_find_option_bool() keeps track of position in two strings: 1. the command-line 2. the option we are searchign for in the command-line We plow through each character in the command-line one at a time, always moving forward. We move forward in the option ('opptr') when we match characters in 'cmdline'. We reset the 'opptr' only when we go in to the 'st_wordstart' state. But, if we fail to match an option because we see a space (state=st_wordcmp, *opptr='\0',c=' '), we set state='st_wordskip' and 'break', moving to the next character. But, that move to the next character is the one *after* the ' '. This means that we will miss a 'st_wordstart' state. For instance, if we have cmdline = "foo fool"; and are searching for "fool", we have: "fool" opptr = ----^ "foo fool" c = --------^ We see that 'l' != ' ', set state=st_wordskip, break, and then move 'c', so: "foo fool" c = ---------^ and are still in state=st_wordskip. 
We will stay in wordskip until we have skipped "fool", thus missing the option we were looking for. This *only* happens when you have a partially- matching word followed by a matching one. To fix this, we always fall *into* the 'st_wordskip' state when we set it. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225239.8E1DCA58@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 49548bed2301..ff8d1beead6c 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -72,18 +72,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) */ if (!c || myisspace(c)) return wstart; - else - state = st_wordskip; + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ } else if (!c) { /* * Hit the NULL terminator on the end of * cmdline. */ return 0; - } else if (c != *opptr++) { - state = st_wordskip; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; } - break; + state = st_wordskip; + /* fall through */ case st_wordskip: if (!c) -- cgit v1.2.1 From 4de07ea481361b08fe13735004dafae862482d38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:41 -0800 Subject: x86/boot: Simplify early command line parsing __cmdline_find_option_bool() tries to account for both NULL-terminated and non-NULL-terminated strings. It keeps 'pos' to look for the end of the buffer and also looks for '!c' in a bunch of places to look for NULL termination. But, it also calls strlen(). You can't call strlen on a non-NULL-terminated string. If !strlen(cmdline), then cmdline[0]=='\0'. In that case, we will go in to the while() loop, set c='\0', hit st_wordstart, notice !c, and will immediately return 0. So, remove the strlen(). It is unnecessary and unsafe. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225241.15365E43@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index ff8d1beead6c..945a639c02dd 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -39,9 +39,6 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - if (!strlen(cmdline)) - return 0; - /* * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' -- cgit v1.2.1 From 8c0517759a1a100a8b83134cf3c7f254774aaeba Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:43 -0800 Subject: x86/boot: Pass in size to early cmdline parsing We will use this in a few patches to implement tests for early parsing. Signed-off-by: Dave Hansen [ Aligned args properly. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225243.5CC47EB6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/lib/cmdline.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 945a639c02dd..5cc78bf57232 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -25,7 +25,9 @@ static inline int myisspace(u8 c) * as an entire word in @cmdline. For instance, if @option="car" * then a cmdline which contains "cart" will not match. */ -int cmdline_find_option_bool(const char *cmdline, const char *option) +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) { char c; int pos = 0, wstart = 0; @@ -43,7 +45,7 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' */ - while (pos < COMMAND_LINE_SIZE) { + while (pos < max_cmdline_size) { c = *(char *)cmdline++; pos++; @@ -101,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} -- cgit v1.2.1 From c1a0bf347c40dd4b0a5bb10fdf4de76a1fbbbe8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 09:57:34 +0100 Subject: x86/mm/numa: Clean up numa_clear_kernel_node_hotplug() So we fixed an overflow bug in numa_clear_kernel_node_hotplug(): 2b54ab3c66d4 ("x86/mm/numa: Fix memory corruption on 32-bit NUMA kernels") ... and the bug was indirectly caused by poor coding style, such as using start/end local variables unnecessarily, which lost the physaddr_t type. So make the code more readable and try to fully comment all the thinking behind the logic. No change in functionality. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 65 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f8094bc23..ede4506abb18 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,65 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. 
+ * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; + struct numa_memblk *mb = numa_meminfo.blk + i; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; + if (!node_isset(mb->nid, reserved_nodemask)) + continue; - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } -- cgit v1.2.1 From 5f7ee246850ba18a6a7bcb3d5eddb6db68354688 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 12:14:55 +0100 Subject: x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug() numa_clear_kernel_node_hotplug() uses memblock_set_node() without checking for failures. memblock_set_node() is a complex function that might extend the memblock array - which extension might fail - so check for this possibility. It's not supposed to happen (because realistically if we have so little memory that this fails then we likely won't be able to boot anyway), but do the check nevertheless. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. 
Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ede4506abb18..f70c1ff46125 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,8 +491,10 @@ static void __init numa_clear_kernel_node_hotplug(void) */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* -- cgit v1.2.1 From dd85c79150079339b3ded62dda5f6985d192900a Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:49 +0900 Subject: irqchip/atmel-aic: Handle aic_common_irq_fixup in aic_common_of_init AIC IRQ fixup is handled in each IRQ chip driver. It can be moved into aic_common_of_init() before returning the result. Then, aic_common_irq_fixup() can be changed to static type. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-1-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-atmel-aic-common.c | 6 ++++-- drivers/irqchip/irq-atmel-aic-common.h | 5 ++--- drivers/irqchip/irq-atmel-aic.c | 4 +--- drivers/irqchip/irq-atmel-aic5.c | 4 +--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 37199b9b2cfa..661840b5d553 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -193,7 +193,7 @@ void __init aic_common_rtt_irq_fixup(struct device_node *root) } } -void __init aic_common_irq_fixup(const struct of_device_id *matches) +static void __init aic_common_irq_fixup(const struct of_device_id *matches) { struct device_node *root = of_find_node_by_path("/"); const struct of_device_id *match; @@ -214,7 +214,8 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches) struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - const char *name, int nirqs) + const char *name, int nirqs, + const struct of_device_id *matches) { struct irq_chip_generic *gc; struct irq_domain *domain; @@ -264,6 +265,7 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node, } aic_common_ext_irq_of_init(domain); + aic_common_irq_fixup(matches); return domain; diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index 603f0a9d5411..046bcc88205b 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -30,12 +30,11 @@ int aic_common_irq_domain_xlate(struct irq_domain *d, struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - const char *name, int nirqs); + const char *name, int nirqs, + const struct of_device_id *matches); void __init aic_common_rtc_irq_fixup(struct device_node *root); void __init aic_common_rtt_irq_fixup(struct device_node *root); -void __init aic_common_irq_fixup(const struct of_device_id *matches); - #endif /* __IRQ_ATMEL_AIC_COMMON_H */ diff --git 
a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 8a0c7f288198..799834de276a 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -248,12 +248,10 @@ static int __init aic_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic_irq_ops, "atmel-aic", - NR_AIC_IRQS); + NR_AIC_IRQS, aic_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic_irq_fixups); - aic_domain = domain; gc = irq_get_domain_generic_chip(domain, 0); diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index 62bb840c613f..a7e8fc81c80c 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -312,12 +312,10 @@ static int __init aic5_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic5_irq_ops, "atmel-aic5", - nirqs); + nirqs, aic5_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic5_irq_fixups); - aic5_domain = domain; nchips = aic5_domain->revmap_size / 32; for (i = 0; i < nchips; i++) { -- cgit v1.2.1 From 5fd26a0bb1e479014adf024df779172d33defdd5 Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:51 +0900 Subject: irqchip/atmel-aic: Change return type of aic_common_set_priority() Priority validation is not necessary because aic_common_irq_domain_xlate() already handles it. With this removal, return type can be changed to void. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-3-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-atmel-aic-common.c | 8 +------- drivers/irqchip/irq-atmel-aic-common.h | 2 +- drivers/irqchip/irq-atmel-aic.c | 5 ++--- drivers/irqchip/irq-atmel-aic5.c | 5 ++--- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 661840b5d553..28b26c80f4cf 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -80,16 +80,10 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val) return 0; } -int aic_common_set_priority(int priority, unsigned *val) +void aic_common_set_priority(int priority, unsigned *val) { - if (priority < AT91_AIC_IRQ_MIN_PRIORITY || - priority > AT91_AIC_IRQ_MAX_PRIORITY) - return -EINVAL; - *val &= ~AT91_AIC_PRIOR; *val |= priority; - - return 0; } int aic_common_irq_domain_xlate(struct irq_domain *d, diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index 046bcc88205b..af60376d50de 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -19,7 +19,7 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val); -int aic_common_set_priority(int priority, unsigned *val); +void aic_common_set_priority(int priority, unsigned *val); int aic_common_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 799834de276a..112e17c2768b 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -196,9 +196,8 @@ static int aic_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(gc); smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq)); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(gc, 
smr, AT91_AIC_SMR(*out_hwirq)); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq)); irq_gc_unlock(gc); return ret; diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index a7e8fc81c80c..f36f426edc62 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -272,9 +272,8 @@ static int aic5_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(bgc); irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR); smr = irq_reg_readl(bgc, AT91_AIC5_SMR); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); irq_gc_unlock(bgc); return ret; -- cgit v1.2.1 From 4b5ce20b5429eb08ae0776962a4aff7f00017800 Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:52 +0900 Subject: irqchip/atmel-aic: Remove duplicate bit operation AIC5 priority value is updated twice - in aic_common_set_priority() and when updating AT91_AIC5_SMR. Variable, 'smr' has updated priority value (intspec[2]) in the first step, so no need to update it again in the second step. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-4-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-atmel-aic5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index f36f426edc62..4f0d068e1abe 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -273,7 +273,7 @@ static int aic5_irq_domain_xlate(struct irq_domain *d, irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR); smr = irq_reg_readl(bgc, AT91_AIC5_SMR); aic_common_set_priority(intspec[2], &smr); - irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); + irq_reg_writel(bgc, smr, AT91_AIC5_SMR); irq_gc_unlock(bgc); return ret; -- cgit v1.2.1 From c7c42ec2baa1de7ab3965e4f1bf5073bee6065e4 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 22 Nov 2015 14:30:14 +0000 Subject: irqchips/bmips: Add bcm6345-l1 interrupt controller Add the BCM6345 interrupt controller based on the SMP-capable BCM7038 and the BCM3380 but with packed interrupt registers. Add the BCM6345 interrupt controller to a list with the existing BCM7038 so that interrupts on CPU1 are not ignored. Update the maintainers file list for BMIPS to include this driver. 
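To make the packed layout concrete ahead of the driver code, here is a hedged sketch (specialized helpers plus a worked example; not part of the patch itself) of the ENABLE/STATUS offset arithmetic that the new file's reg_enable()/reg_status() helpers implement, for a two-word, big-endian BCM6368:

/* Hedged sketch of the driver's offset helpers for a big-endian kernel: */
static unsigned int reg_enable_be(unsigned int n_words, unsigned int word)
{
	return (1 * n_words - word - 1) * 4;	/* 4 == sizeof(u32) */
}

static unsigned int reg_status_be(unsigned int n_words, unsigned int word)
{
	return (2 * n_words - word - 1) * 4;
}

/*
 * Worked example for BCM6368 (n_words = 2), offsets relative to each
 * CPU's register block (CPU0's block base is 0x1000_0020):
 *
 *   reg_enable_be(2, 0) = 0x4 -> CPU0_W1_ENABLE
 *   reg_enable_be(2, 1) = 0x0 -> CPU0_W0_ENABLE
 *   reg_status_be(2, 0) = 0xc -> CPU0_W1_STATUS (hwirqs 0-31)
 *   reg_status_be(2, 1) = 0x8 -> CPU0_W0_STATUS (hwirqs 32-63)
 *
 * i.e. on big-endian, logical word 0 (hwirqs 0-31) lives in the last
 * hardware word, matching the register map quoted in the new file.
 */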
Signed-off-by: Simon Arlott Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Ian Campbell Cc: Florian Fainelli Cc: Jason Cooper Cc: Pawel Moll Cc: linux-mips@linux-mips.org Cc: Marc Zyngier Cc: Kevin Cernekee Cc: Ralf Baechle Cc: Jonas Gorski Cc: Kumar Gala Cc: Rob Herring Link: http://lkml.kernel.org/r/5651D176.6030908@simon.arlott.org.uk Signed-off-by: Thomas Gleixner --- MAINTAINERS | 1 + arch/mips/Kconfig | 1 + arch/mips/bmips/irq.c | 10 +- drivers/irqchip/Kconfig | 5 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-bcm6345-l1.c | 364 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 drivers/irqchip/irq-bcm6345-l1.c diff --git a/MAINTAINERS b/MAINTAINERS index 7f1fa4ff300a..b5fab14ffac7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2423,6 +2423,7 @@ F: arch/mips/bmips/* F: arch/mips/include/asm/mach-bmips/* F: arch/mips/kernel/*bmips* F: arch/mips/boot/dts/brcm/bcm*.dts* +F: drivers/irqchip/irq-bcm63* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* F: include/linux/bcm963xx_nvram.h diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 57a945e832f4..eb079ac36dc8 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -151,6 +151,7 @@ config BMIPS_GENERIC select CSRC_R4K select SYNC_R4K select COMMON_CLK + select BCM6345_L1_IRQ select BCM7038_L1_IRQ select BCM7120_L2_IRQ select BRCMSTB_L2_IRQ diff --git a/arch/mips/bmips/irq.c b/arch/mips/bmips/irq.c index e7fc6f9348ba..7efefcf44033 100644 --- a/arch/mips/bmips/irq.c +++ b/arch/mips/bmips/irq.c @@ -15,6 +15,12 @@ #include #include +static const struct of_device_id smp_intc_dt_match[] = { + { .compatible = "brcm,bcm7038-l1-intc" }, + { .compatible = "brcm,bcm6345-l1-intc" }, + {} +}; + unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; @@ -24,8 +30,8 @@ void __init arch_init_irq(void) { struct device_node *dn; - /* Only the STB (bcm7038) controller supports SMP IRQ affinity */ - dn = of_find_compatible_node(NULL, NULL, "brcm,bcm7038-l1-intc"); + /* Only these controllers support SMP IRQ affinity */ + dn = of_find_matching_node(NULL, smp_intc_dt_match); if (dn) of_node_put(dn); else diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index fb50911b3940..abe7a571836e 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -78,6 +78,11 @@ config I8259 bool select IRQ_DOMAIN +config BCM6345_L1_IRQ + bool + select GENERIC_IRQ_CHIP + select IRQ_DOMAIN + config BCM7038_L1_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb60d58..d91d99ded30f 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o obj-$(CONFIG_XTENSA_MX) += irq-xtensa-mx.o obj-$(CONFIG_IRQ_CROSSBAR) += irq-crossbar.o obj-$(CONFIG_SOC_VF610) += irq-vf610-mscm-ir.o +obj-$(CONFIG_BCM6345_L1_IRQ) += irq-bcm6345-l1.o obj-$(CONFIG_BCM7038_L1_IRQ) += irq-bcm7038-l1.o obj-$(CONFIG_BCM7120_L2_IRQ) += irq-bcm7120-l2.o obj-$(CONFIG_BRCMSTB_L2_IRQ) += irq-brcmstb-l2.o diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c new file mode 100644 index 000000000000..b844c89a9506 --- /dev/null +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -0,0 +1,364 @@ +/* + * Broadcom BCM6345 style Level 1 interrupt controller driver + * + * Copyright (C) 2014 Broadcom Corporation + * Copyright 2015 Simon Arlott + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General 
Public License version 2 as + * published by the Free Software Foundation. + * + * This is based on the BCM7038 (which supports SMP) but with a single + * enable register instead of separate mask/set/clear registers. + * + * The BCM3380 has a similar mask/status register layout, but each pair + * of words is at separate locations (and SMP is not supported). + * + * ENABLE/STATUS words are packed next to each other for each CPU: + * + * BCM6368: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W0_STATUS IRQs 31-63 + * 0x1000_002c: CPU0_W1_STATUS IRQs 0-31 + * 0x1000_0030: CPU1_W0_ENABLE + * 0x1000_0034: CPU1_W1_ENABLE + * 0x1000_0038: CPU1_W0_STATUS IRQs 31-63 + * 0x1000_003c: CPU1_W1_STATUS IRQs 0-31 + * + * BCM63168: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W2_ENABLE + * 0x1000_002c: CPU0_W3_ENABLE + * 0x1000_0030: CPU0_W0_STATUS IRQs 96-127 + * 0x1000_0034: CPU0_W1_STATUS IRQs 64-95 + * 0x1000_0038: CPU0_W2_STATUS IRQs 32-63 + * 0x1000_003c: CPU0_W3_STATUS IRQs 0-31 + * 0x1000_0040: CPU1_W0_ENABLE + * 0x1000_0044: CPU1_W1_ENABLE + * 0x1000_0048: CPU1_W2_ENABLE + * 0x1000_004c: CPU1_W3_ENABLE + * 0x1000_0050: CPU1_W0_STATUS IRQs 96-127 + * 0x1000_0054: CPU1_W1_STATUS IRQs 64-95 + * 0x1000_0058: CPU1_W2_STATUS IRQs 32-63 + * 0x1000_005c: CPU1_W3_STATUS IRQs 0-31 + * + * IRQs are numbered in CPU native endian order + * (which is big-endian in these examples) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQS_PER_WORD 32 +#define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 2) + +struct bcm6345_l1_cpu; + +struct bcm6345_l1_chip { + raw_spinlock_t lock; + unsigned int n_words; + struct irq_domain *domain; + struct cpumask cpumask; + struct bcm6345_l1_cpu *cpus[NR_CPUS]; +}; + +struct bcm6345_l1_cpu { + void __iomem *map_base; + unsigned int parent_irq; + u32 enable_cache[]; +}; + +static inline unsigned int reg_enable(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (1 * intc->n_words - word - 1) * sizeof(u32); +#else + return (0 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int reg_status(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (2 * intc->n_words - word - 1) * sizeof(u32); +#else + return (1 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int cpu_for_irq(struct bcm6345_l1_chip *intc, + struct irq_data *d) +{ + return cpumask_first_and(&intc->cpumask, irq_data_get_affinity_mask(d)); +} + +static void bcm6345_l1_irq_handle(struct irq_desc *desc) +{ + struct bcm6345_l1_chip *intc = irq_desc_get_handler_data(desc); + struct bcm6345_l1_cpu *cpu; + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int idx; + +#ifdef CONFIG_SMP + cpu = intc->cpus[cpu_logical_map(smp_processor_id())]; +#else + cpu = intc->cpus[0]; +#endif + + chained_irq_enter(chip, desc); + + for (idx = 0; idx < intc->n_words; idx++) { + int base = idx * IRQS_PER_WORD; + unsigned long pending; + irq_hw_number_t hwirq; + unsigned int irq; + + pending = __raw_readl(cpu->map_base + reg_status(intc, idx)); + pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx)); + + for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) { + irq = irq_linear_revmap(intc->domain, base + hwirq); + if (irq) + 
do_IRQ(irq); + else + spurious_interrupt(); + } + } + + chained_irq_exit(chip, desc); +} + +static inline void __bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + intc->cpus[cpu_idx]->enable_cache[word] |= mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static inline void __bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + intc->cpus[cpu_idx]->enable_cache[word] &= ~mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static void bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_unmask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static void bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_mask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static int bcm6345_l1_set_affinity(struct irq_data *d, + const struct cpumask *dest, + bool force) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int old_cpu = cpu_for_irq(intc, d); + unsigned int new_cpu; + struct cpumask valid; + unsigned long flags; + bool enabled; + + if (!cpumask_and(&valid, &intc->cpumask, dest)) + return -EINVAL; + + new_cpu = cpumask_any_and(&valid, cpu_online_mask); + if (new_cpu >= nr_cpu_ids) + return -EINVAL; + + dest = cpumask_of(new_cpu); + + raw_spin_lock_irqsave(&intc->lock, flags); + if (old_cpu != new_cpu) { + enabled = intc->cpus[old_cpu]->enable_cache[word] & mask; + if (enabled) + __bcm6345_l1_mask(d); + cpumask_copy(irq_data_get_affinity_mask(d), dest); + if (enabled) + __bcm6345_l1_unmask(d); + } else { + cpumask_copy(irq_data_get_affinity_mask(d), dest); + } + raw_spin_unlock_irqrestore(&intc->lock, flags); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int __init bcm6345_l1_init_one(struct device_node *dn, + unsigned int idx, + struct bcm6345_l1_chip *intc) +{ + struct resource res; + resource_size_t sz; + struct bcm6345_l1_cpu *cpu; + unsigned int i, n_words; + + if (of_address_to_resource(dn, idx, &res)) + return -EINVAL; + sz = resource_size(&res); + n_words = sz / REG_BYTES_PER_IRQ_WORD; + + if (!intc->n_words) + intc->n_words = n_words; + else if (intc->n_words != n_words) + return -EINVAL; + + cpu = intc->cpus[idx] = kzalloc(sizeof(*cpu) + n_words * sizeof(u32), + GFP_KERNEL); + if (!cpu) + return -ENOMEM; + + cpu->map_base = ioremap(res.start, sz); + if (!cpu->map_base) + return -ENOMEM; + + for (i = 0; i < n_words; i++) { + cpu->enable_cache[i] = 0; + __raw_writel(0, cpu->map_base + reg_enable(intc, i)); + } + + cpu->parent_irq = irq_of_parse_and_map(dn, idx); + if (!cpu->parent_irq) { + pr_err("failed to map parent interrupt %d\n", cpu->parent_irq); + return -EINVAL; + } + irq_set_chained_handler_and_data(cpu->parent_irq, + bcm6345_l1_irq_handle, intc); + + return 0; +} 
+ +static struct irq_chip bcm6345_l1_irq_chip = { + .name = "bcm6345-l1", + .irq_mask = bcm6345_l1_mask, + .irq_unmask = bcm6345_l1_unmask, + .irq_set_affinity = bcm6345_l1_set_affinity, +}; + +static int bcm6345_l1_map(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + irq_set_chip_and_handler(virq, + &bcm6345_l1_irq_chip, handle_percpu_irq); + irq_set_chip_data(virq, d->host_data); + return 0; +} + +static const struct irq_domain_ops bcm6345_l1_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = bcm6345_l1_map, +}; + +static int __init bcm6345_l1_of_init(struct device_node *dn, + struct device_node *parent) +{ + struct bcm6345_l1_chip *intc; + unsigned int idx; + int ret; + + intc = kzalloc(sizeof(*intc), GFP_KERNEL); + if (!intc) + return -ENOMEM; + + for_each_possible_cpu(idx) { + ret = bcm6345_l1_init_one(dn, idx, intc); + if (ret) + pr_err("failed to init intc L1 for cpu %d: %d\n", + idx, ret); + else + cpumask_set_cpu(idx, &intc->cpumask); + } + + if (!cpumask_weight(&intc->cpumask)) { + ret = -ENODEV; + goto out_free; + } + + raw_spin_lock_init(&intc->lock); + + intc->domain = irq_domain_add_linear(dn, IRQS_PER_WORD * intc->n_words, + &bcm6345_l1_domain_ops, + intc); + if (!intc->domain) { + ret = -ENOMEM; + goto out_unmap; + } + + pr_info("registered BCM6345 L1 intc (IRQs: %d)\n", + IRQS_PER_WORD * intc->n_words); + for_each_cpu(idx, &intc->cpumask) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + pr_info(" CPU%u at MMIO 0x%p (irq = %d)\n", idx, + cpu->map_base, cpu->parent_irq); + } + + return 0; + +out_unmap: + for_each_possible_cpu(idx) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + if (cpu) { + if (cpu->map_base) + iounmap(cpu->map_base); + kfree(cpu); + } + } +out_free: + kfree(intc); + return ret; +} + +IRQCHIP_DECLARE(bcm6345_l1, "brcm,bcm6345-l1-intc", bcm6345_l1_of_init); -- cgit v1.2.1 From fbf198030e0b027538c290300cfaf9e2efdce122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2016 19:52:23 +0100 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled we can end up with irqs on non boot cpus and chasing newly requested interrupts is a tedious task. Allow to restrict the default irq affinity mask. Signed-off-by: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1602031948190.25254@nanos Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ kernel/irq/irqdesc.c | 21 +++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf09c8dd..87298f8592b7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1687,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. 
Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0bcc33..0ccd028817d7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v1.2.1 From 8dd5032d9c540111dd673078738d137a998d6c3f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Feb 2016 22:51:27 +0100 Subject: x86/asm/bitops: Force inlining of test_and_set_bit and friends Sometimes GCC mysteriously doesn't inline very small functions we expect to be inlined, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 Arguably, GCC should do better, but GCC people aren't willing to invest time into it and are asking to use __always_inline instead. With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os here's an example of functions getting deinlined many times: test_and_set_bit (166 copies, ~1260 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f ab 3e lock bts %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq test_and_clear_bit (124 copies, ~1000 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq change_bit (3 copies, 8 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f bb 3e lock btc %rdi,(%rsi) 5d pop %rbp c3 retq clear_bit_unlock (2 copies, 11 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 5d pop %rbp c3 retq This patch works it around via s/inline/__always_inline/. Code size decrease by ~13.5k after the patch: text data bss dec filename 92110727 20826144 36417536 149354407 vmlinux.before 92097234 20826176 36417536 149340946 vmlinux.after Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Rientjes Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b954d5e4..7766d1cf096e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. 
*/ -static inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } @@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; @@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline unsigned long ffz(unsigned long word) +static __always_inline unsigned long ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ -static inline int ffs(int x) +static __always_inline int ffs(int x) { int r; @@ -434,7 +434,7 @@ static inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ -static inline int fls(int x) +static __always_inline int fls(int x) { int r; -- cgit v1.2.1 From 5f9c01aa7c49a2d74474d6d879a797b8badf29e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:29 +0100 Subject: x86/microcode: Untangle from BLK_DEV_INITRD Thomas Voegtle reported that doing oldconfig with a .config which has CONFIG_MICROCODE enabled but BLK_DEV_INITRD disabled prevents the microcode loading mechanism from being built. 
So untangle it from the BLK_DEV_INITRD dependency so that oldconfig doesn't turn it off, and add explanatory text to its Kconfig help describing the supported methods for supplying microcode. Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 27 ++++++++++++++------------- arch/x86/include/asm/microcode.h | 26 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/microcode/intel.c | 14 ++++---------- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9af2e6338400..405b1858134b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1159,22 +1159,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. - - This option selects the general module only, you need to select - at least one vendor specific module as well. - - To compile this driver as a module, choose M here: the module - will be called microcode. + If you say Y here, you will be able to update the microcode on + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. + + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. + + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option.
config MICROCODE_INTEL bool "Intel microcode loading support" diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a5a738..9d3a96c4da78 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c544ee0d..044bbbbcbaf1 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -690,7 +690,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -748,20 +748,14 @@ void load_ucode_intel_ap(void) struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else - mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mc_saved_data_p = &mc_saved_data; #endif /* @@ -773,7 +767,7 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit v1.2.1 From 264285ac01673e70557c43ecee338ce97c4c0672 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:30 +0100 Subject: x86/microcode/intel: Make early loader look for builtin microcode too Set the initrd @start depending on the presence of an initrd. Otherwise, builtin microcode loading doesn't work as the start is wrong and we're using it to compute offset to the microcode blobs. 
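In essence, the BSP path now picks the scan window like this (a simplified sketch of the logic the patch introduces, not the literal kernel code):

	/* sketch: size == 0 means "no initrd", which makes scan_microcode()
	 * fall back to the builtin blob instead of scanning a bogus range */
	size  = boot_params.hdr.ramdisk_size;
	start = size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0;

	if (!size) {
		if (!load_builtin_intel_microcode(&cd))
			return UCODE_ERROR;	/* no builtin microcode either */
	} else {
		cd = find_cpio_data(p, (void *)start, size, &offset);
		if (!cd.data)
			return UCODE_ERROR;
	}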
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 044bbbbcbaf1..4f4735bd8698 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -551,10 +551,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } return get_matching_model_microcode(0, start, cd.data, cd.size, @@ -728,16 +732,20 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif -- cgit v1.2.1 From e8c8165ecfb1cfd6650777c193361d33b0f7f59e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:31 +0100 Subject: x86/microcode: Remove redundant __setup() param parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do parse for the disable microcode loader chicken bit very early. After the driver merge, the __setup() param parsing method is not needed anymore so get rid of it. In addition, fix a compiler warning from an old SLES11 gcc (4.3.4) reported by Jan Beulich : arch/x86/kernel/cpu/microcode/core.c: In function ‘load_ucode_bsp’: arch/x86/kernel/cpu/microcode/core.c:96: warning: array subscript is above array bounds Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec7120c508..bca4e48b531d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. 
* @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif -- cgit v1.2.1 From 43858f57bc02388a6420321c021397591199cf91 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 12:33:32 +0100 Subject: x86/microcode: Remove an unneeded NULL check "uci" is an element of the ucode_cpu_info[] array, it can't be NULL. Tested-by: Thomas Voegtle Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/1454499225-21544-5-git-send-email-bp@alien8.de Link: http://lkml.kernel.org/r/20140120103046.GC14233@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index bca4e48b531d..cea8552e2b3a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -472,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) -- cgit v1.2.1 From b7f500aedd4551a9bf29c617804c13f0ff18c879 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:33 +0100 Subject: x86/microcode: Issue update message only once This is especially annoying on large boxes: x86: Booting SMP configuration: .... node #0, CPUs: #1 microcode: CPU1 microcode updated early to revision 0x428, date = 2014-05-29 #2 microcode: CPU2 microcode updated early to revision 0x428, date = 2014-05-29 #3 ... so issue the update message only once. $ grep microcode /proc/cpuinfo shows whether every core got updated properly. 
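For reference, pr_info_once() is built on printk_once() from include/linux/printk.h, which roughly expands to the following pattern (simplified sketch):

	/* simplified expansion of pr_info_once(fmt, ...) */
	({
		static bool __print_once;	/* one flag per call site */

		if (!__print_once) {
			__print_once = true;
			pr_info(fmt, ##__VA_ARGS__);
		}
	})

Every CPU still executes the call site, but only the first one to reach it emits the line.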
Reported-by: Ingo Molnar Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 4 ++-- arch/x86/kernel/cpu/microcode/intel.c | 13 +++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..5b63e2f669b0 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -432,8 +432,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4f4735bd8698..f4bc5fe00d46 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -571,14 +571,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 -- cgit v1.2.1 From a58017c62b5fa23e532b731e70a63ded54cc2c02 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:34 +0100 Subject: x86/microcode/AMD: Drop redundant printk prefix It is supplied by pr_fmt already. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5b63e2f669b0..9f5ccef33fba 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -469,8 +469,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } -- cgit v1.2.1 From bd6fe58d8e60d45c80ed664a55ef0df5fa076014 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:35 +0100 Subject: x86/microcode/intel: Rename local variables of type struct mc_saved_data So it is always a head-twister when trying to stare at code which has a bunch of struct mc_saved_data *mc_saved_data; local function variables *and* a global mc_saved_data of the same name. Rename all locals to "mcs" to differentiate from the global one. No functionality change. 
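A contrived sketch of the problem (hypothetical function, for illustration only):

	static struct mc_saved_data mc_saved_data;		/* the global */

	static int save(struct mc_saved_data *mc_saved_data)	/* shadows the global */
	{
		/* every "mc_saved_data" below is the parameter, not the global */
		return mc_saved_data->mc_saved_count;
	}

With the locals renamed to "mcs", the global and the parameters can no longer be confused.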
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f4bc5fe00d46..4af30bee2161 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -89,42 +89,39 @@ copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->mc_saved_count; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->mc_saved_count; - if (!mc_saved_data->mc_saved) { + if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,7 +172,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, unsigned int mc_saved_count) { @@ -219,8 +216,8 @@ save_microcode(struct mc_saved_data *mc_saved_data, /* * Point to newly saved microcode. 
*/ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->mc_saved_count = mc_saved_count; return 0; @@ -286,7 +283,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, - struct mc_saved_data *mc_saved_data, + struct mc_saved_data *mcs, unsigned long *mc_saved_in_initrd, struct ucode_cpu_info *uci) { @@ -296,7 +293,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + unsigned int mc_saved_count = mcs->mc_saved_count; int i; while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { @@ -342,7 +339,7 @@ get_matching_model_microcode(int cpu, unsigned long start, for (i = 0; i < mc_saved_count; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved_count = mc_saved_count; out: return state; } @@ -536,7 +533,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -562,7 +559,7 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + mcs, initrd, uci); } /* @@ -702,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -711,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, initrd, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, initrd, start, &uci); if (ret != UCODE_OK) return; @@ -750,28 +746,28 @@ void __init load_ucode_intel_bsp(void) void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; + struct mc_saved_data *mcs_p; + struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else mc_saved_in_initrd_p = mc_saved_in_initrd; - mc_saved_data_p = &mc_saved_data; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. 
*/ - if (mc_saved_data_p->mc_saved_count == 0) + if (mcs_p->mc_saved_count == 0) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + ret = load_microcode(mcs_p, mc_saved_in_initrd_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) -- cgit v1.2.1 From 4fe9349fc3b042b481692b577bda97cde4d6f517 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:36 +0100 Subject: x86/microcode/intel: Rename mc_saved_count to num_saved It is shorter and easier on the eyes. Change the "== 0" tests to "!..." while at it. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 61 ++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4af30bee2161..9f5fe72419ba 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -96,7 +96,7 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); - for (i = 0; i < mcs->mc_saved_count; i++) { + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); @@ -110,7 +110,7 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->mc_saved_count; + unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); @@ -174,23 +174,23 @@ matching_model_microcode(struct microcode_header_intel *mc_header, static int save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -216,8 +216,8 @@ save_microcode(struct mc_saved_data *mcs, /* * Point to newly saved microcode. 
*/ - mcs->mc_saved = saved_ptr; - mcs->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -293,10 +293,10 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mcs->mc_saved_count; + unsigned int num_saved = mcs->num_saved; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } @@ -331,15 +331,15 @@ get_matching_model_microcode(int cpu, unsigned long start, goto out; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; goto out; } - for (i = 0; i < mc_saved_count; i++) + for (i = 0; i < num_saved; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mcs->mc_saved_count = mc_saved_count; + mcs->num_saved = num_saved; out: return state; } @@ -393,11 +393,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -406,7 +406,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -462,7 +462,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -473,23 +473,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. 
*/ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; } @@ -681,14 +681,15 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -763,7 +764,7 @@ void load_ucode_intel_ap(void) * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mcs_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); @@ -781,13 +782,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; -- cgit v1.2.1 From de778275c295825e6638f3f74103f40642d45caa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:37 +0100 Subject: x86/microcode/intel: Rename mc_intel variable to mc Well, it is apparent what it points to - microcode. And since it is the intel loader, no need for the "_intel" suffix. Use "!" for the 0/NULL checks, while at it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 9f5fe72419ba..d1b2f583f543 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -601,19 +601,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(&current_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -628,29 +628,29 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned
long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long)mc->bits, + (unsigned long)mc->bits >> 16 >> 16); native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -658,7 +658,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -670,7 +670,7 @@ if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } @@ -821,7 +821,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -832,38 +832,38 @@ cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; + mc = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (mc_intel == NULL) + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. */ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long) mc->bits, + (unsigned long) mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -872,16 +872,16 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu_num, mc->hdr.rev); return -1; } pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit v1.2.1 From 58b5f2cc4bdbc9b616e68639f5a84886aa5be590 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:38 +0100 Subject: x86/microcode/intel: Move the BUG_ON up and turn it into WARN_ON If we're going to BUG_ON() because we're running on the wrong CPU, we had better do it as the first thing when entering that function. Also, turn it into a WARN_ON() because it is not worth panicking the system if we apply the microcode on the wrong CPU - we're simply going to exit early.
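The practical difference between the two macros, in a nutshell (a sketch, not the kernel's exact definitions):

	BUG_ON(cond);		/* cond true -> oops, typically killing the box */

	if (WARN_ON(cond))	/* cond true -> stack trace in dmesg, returns cond */
		return -1;	/* ...so the caller can bail out gracefully */

WARN_ON() evaluates to the truth value of its condition, which is what makes the early-return idiom in this patch work.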
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d1b2f583f543..c029c2bb2a29 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -843,12 +843,12 @@ static int apply_microcode_intel(int cpu) int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); - uci = ucode_cpu_info + cpu; - mc = uci->mc; - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(cpu_num != cpu)) + return -1; + uci = ucode_cpu_info + cpu; + mc = uci->mc; if (!mc) return 0; -- cgit v1.2.1 From 26cbaa4dc676a444aa626cbc642c4c8181ef1378 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:39 +0100 Subject: x86/microcode/intel: Cleanup apply_microcode_intel() Get rid of local variable cpu_num as it is equal to @cpu now. Deref cpu_data() only when it is really needed at the end. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c029c2bb2a29..35186a0dd5fc 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -839,12 +839,11 @@ static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); /* We should bind the task to the CPU */ - if (WARN_ON(cpu_num != cpu)) + if (WARN_ON(raw_smp_processor_id() != cpu)) return -1; uci = ucode_cpu_info + cpu; @@ -874,15 +873,18 @@ static int apply_microcode_intel(int cpu) if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], + cpu, val[1], mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); + c = &cpu_data(cpu); + uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit v1.2.1 From c416e6117575213a5a962149620684a09f9e4ece Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:40 +0100 Subject: x86/microcode/intel: Use *wrmsrl variants ... and drop the 32-bit casting games which we had to do at the time because wrmsr() was unforgiving then, see c3fd0bd5e19a from the full history tree: commit c3fd0bd5e19aaff9cdd104edff136a2023db657e Author: Linus Torvalds Date: Tue Feb 17 23:23:41 2004 -0800 Fix up the microcode update on regular 32-bit x86. Our wrmsr() is a bit unforgiving and really doesn't like 64-bit values. ... 
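For background: wrmsr() takes the 64-bit MSR payload as two 32-bit halves, while wrmsrl() takes a single 64-bit value, roughly:

	wrmsr(msr, lo, hi);	/* two u32 halves */
	wrmsrl(msr, val);	/* one u64 */

The odd ">> 16 >> 16" in the old code was a portable way of writing ">> 32": on 32-bit kernels 'unsigned long' is only 32 bits wide, and shifting a 32-bit value by 32 in one step is undefined behavior in C (the high half is simply 0 there anyway).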
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 35186a0dd5fc..ff0b44951d12 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -370,7 +370,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -648,10 +648,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long)mc->bits, - (unsigned long)mc->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -860,10 +858,8 @@ static int apply_microcode_intel(int cpu) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc->bits, - (unsigned long) mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); -- cgit v1.2.1 From f8bb45e2c4acf1395bb6e61a135ce8c9107388cf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:41 +0100 Subject: x86/microcode/intel: Rename mc_saved_in_initrd Rename it to mc_tmp_ptrs to denote better what it is - a temporary array for saving pointers to microcode blobs. And "initrd" is not accurate anymore since initrd is not the only source for early microcode. Therefore, rename copy_initrd_ptrs() to copy_ptrs() simply and "initrd_start" to "offset". And then do the following convention: the global variable is called "mc_tmp_ptrs" and the local function arguments "mc_ptrs" for differentiation. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 52 +++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ff0b44951d12..5970758bbcdd 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,7 +39,13 @@ #include #include -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. 
+ */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { unsigned int num_saved; struct microcode_intel **mc_saved; @@ -78,13 +84,13 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 @@ -106,14 +112,14 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) #endif static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { @@ -284,7 +290,7 @@ static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, - unsigned long *mc_saved_in_initrd, + unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { u8 *ucode_ptr = data; @@ -337,7 +343,7 @@ get_matching_model_microcode(int cpu, unsigned long start, } for (i = 0; i < num_saved; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; out: @@ -533,7 +539,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -559,7 +565,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mcs, initrd, uci); + mcs, mc_ptrs, uci); } /* @@ -675,7 +681,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. 
*/ int __init save_microcode_in_initrd_intel(void) { @@ -686,7 +692,7 @@ int __init save_microcode_in_initrd_intel(void) if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) @@ -698,7 +704,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -706,11 +712,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -733,28 +739,28 @@ void __init load_ucode_intel_bsp(void) start = (size ? p->hdr.ramdisk_image : 0); _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), start, size); #else size = boot_params.hdr.ramdisk_size; start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - unsigned long *mc_saved_in_initrd_p; + unsigned long *mcs_tmp_p; struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 - mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_in_initrd_p = mc_saved_in_initrd; + mcs_tmp_p = mc_tmp_ptrs; mcs_p = &mc_saved_data; #endif @@ -766,9 +772,7 @@ void load_ucode_intel_ap(void) return; collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mc_saved_in_initrd_p, - get_initrd_start_addr(), &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit v1.2.1 From 2f303c524ed021825671cfa9b1934338bc04f8ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:42 +0100 Subject: x86/microcode/intel: Remove unused arg of get_matching_model_microcode() @cpu is unused, kill it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5970758bbcdd..0c67fd060680 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,7 +287,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. 
*/ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, +get_matching_model_microcode(unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, unsigned long *mc_ptrs, @@ -564,7 +564,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, + return get_matching_model_microcode(start, cd.data, cd.size, mcs, mc_ptrs, uci); } -- cgit v1.2.1 From f96fde531946524b26d25d4eed9625695837f524 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:43 +0100 Subject: x86/microcode/intel: Cleanup get_matching_model_microcode() Reflow arguments, sort local variables in reverse christmas tree, kill "out" label. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/intel.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 0c67fd060680..cb397947f688 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,19 +287,17 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(unsigned long start, - void *data, size_t size, - struct mc_saved_data *mcs, - unsigned long *mc_ptrs, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + struct microcode_header_intel *mc_header; unsigned int num_saved = mcs->num_saved; + enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; + unsigned int mc_size; int i; while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { @@ -321,8 +319,7 @@ get_matching_model_microcode(unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } @@ -334,19 +331,19 @@ get_matching_model_microcode(unsigned long start, if (leftover) { state = UCODE_ERROR; - goto out; + return state; } if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } for (i = 0; i < num_saved; i++) mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; -out: + return state; } -- cgit v1.2.1 From f7eb59dda129e46be5e195a46bfd0dde76db9bbd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:44 +0100 Subject: x86/microcode/AMD: Issue microcode updated message later Before this, we issued this message from save_microcode_in_initrd() which is called from free_initrd_mem(), i.e., only when we have an initrd enabled. However, we can update from builtin microcode too but then we don't issue the update message. Fix it by issuing that message on the generic driver init path. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9f5ccef33fba..f66cbfe74ce4 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -956,6 +952,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return &microcode_amd_ops; } -- cgit v1.2.1 From b584303261b7041f209afde4bf7ddb7a21598a1f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:45 +0100 Subject: x86/microcode: Document builtin microcode loading method Add some text and an example to Documentation/x86/early-microcode.txt explaining how to build in microcode. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/x86/early-microcode.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt index d62bea6796da..c956d99cf1de 100644 --- a/Documentation/x86/early-microcode.txt +++ b/Documentation/x86/early-microcode.txt @@ -40,3 +40,28 @@ cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin) find . | cpio -o -H newc >../ucode.cpio cd .. cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img +
+Builtin microcode
+=================
+
+We can also load builtin microcode supplied through the regular firmware
+builtin method CONFIG_FIRMWARE_IN_KERNEL. Here's an example:
+
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
+CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
+
+This basically means, you have the following tree structure locally:
+
+/lib/firmware/
+|-- amd-ucode
+...
+| |-- microcode_amd_fam15h.bin
+...
+|-- intel-ucode
+...
+| |-- 06-3a-09
+...
+
+so that the build system can find those files and integrate them into
+the final kernel image. The early loader finds them and applies them. -- cgit v1.2.1 From 69e0210fd01ff157d332102219aaf5c26ca8069b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:18 +0300 Subject: x86/kasan: Clear kasan_zero_page after TLB flush Currently we clear kasan_zero_page before __flush_tlb_all(). This works with the current implementation of native_flush_tlb[_global]() because it doesn't do any writes to kasan shadow memory. But any subtle change made in native_flush_tlb*() could break this. Also, the current code doesn't seem to work for paravirt guests (lguest). Only after the TLB flush can we be sure that kasan_zero_page is not used as early shadow anymore (instrumented code will not write to it). So it should be cleared only after the TLB flush.
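In other words, the safe ordering is (a sketch mirroring the patch below):

	load_cr3(init_level4_pgt);
	__flush_tlb_all();			/* no mapping uses the zero page
						 * as early shadow after this */
	memset(kasan_zero_page, 0, PAGE_SIZE);	/* clearing it can no longer race
						 * with instrumented writes */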
Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf219a2d..303e47045864 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,16 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear it, since after the TLB flush + * no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } -- cgit v1.2.1 From 063fb3e56f6dd29b2633b678b837e1d904200e6f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:19 +0300 Subject: x86/kasan: Write protect kasan zero shadow After kasan_init() executed, no one is allowed to write to kasan_zero_page, so write protect it. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 303e47045864..1b1110fa0057 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -125,10 +125,16 @@ void __init kasan_init(void) /* * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear it, since after the TLB flush - * no one should write to it. + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. */ memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); -- cgit v1.2.1 From 060a402a1ddb551455ee410de2eadd3349f2801b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029405a3..8b576832777e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit v1.2.1 From d12a72b844a49d4162f24cefdab30bed3f86730e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:58 -0800 Subject: x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID This adds a chicken bit to turn off INVPCID in case something goes wrong. It's an early_param() because we do TLB flushes before we parse __setup() parameters. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/common.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf09c8dd..e4c4d2a5a28d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nointroute [IA-64] + noinvpcid [X86] Disable the INVPCID cpu feature. + nojitter [IA-64] Disables jitter checking for ITC timers. 
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de8f60a..f4d0aa64d934 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit v1.2.1 From d8bced79af1db6734f66b42064cc773cada2ce99 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8b576832777e..fc9a2fda1404 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,6 +152,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit v1.2.1 From ce1143aa60273220a9f89012f2aaaed04f97e9a2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 25 Jan 2016 23:06:49 -0800 Subject: x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache() DMI cacheability is very confused on x86. dmi_early_remap() uses early_ioremap(), which uses FIXMAP_PAGE_IO, which is __PAGE_KERNEL_IO, which is __PAGE_KERNEL, which is cached. Don't ask me why this makes any sense. dmi_remap() uses ioremap(), which requests an uncached mapping. However, on non-EFI systems, the DMI data generally lives between 0xf0000 and 0x100000, which is in the legacy ISA range, which triggers a special case in the PAT code that overrides the cache mode requested by ioremap() and forces a WB mapping. On a UEFI boot, however, the DMI table can live at any physical address. On my laptop, it's around 0x77dd0000. That's nowhere near the legacy ISA range, so the ioremap() implicit uncached type is honored and we end up with a UC- mapping. UC- is a very, very slow way to read from main memory, so dmi_walk() is likely to take much longer than necessary. 
Given that, even on UEFI, we do early cached DMI reads, it seems safe to just ask for cached access. Switch to ioremap_cache(). I haven't tried to benchmark this, but I'd guess it saves several milliseconds of boot time. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean Delvare Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/3147c38e51f439f3c8911db34c7d4ab22d854915.1453791969.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 535192f6bfad..3c69fed215c5 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len) /* Use early IO mappings for DMI because it's initialized early */ #define dmi_early_remap early_ioremap #define dmi_early_unmap early_iounmap -#define dmi_remap ioremap +#define dmi_remap ioremap_cache #define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ -- cgit v1.2.1 From a91bbe017552b80e12d712c85549b933a62c6ed4 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 9 Feb 2016 19:44:54 +0600 Subject: x86/boot: Use proper array element type in memset() size calculation I changed open coded zeroing loops to explicit memset()s in the following commit: 5e9ebbd87a99 ("x86/boot: Micro-optimize reset_early_page_tables()") The base for the size argument of memset was sizeof(pud_p/pmd_p), which are pointers - but the initialized array has pud_t/pmd_t elements. Luckily the two types had the same size, so this did not result in any runtime misbehavior. Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455025494-4063-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 35843caa4ea7..7793a1702204 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -75,7 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -90,7 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit v1.2.1 From dd7b6847670a84b7bb7c38f8e69b2f12059bca66 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 25 Jan 2016 12:25:15 -0500 Subject: x86/mm: Honour passed pgprot in track_pfn_insert() and track_pfn_remap() track_pfn_insert() overwrites the pgprot that is passed in with a value based on the VMA's page_prot. This is a problem for people trying to do clever things with the new vm_insert_pfn_prot() as it will simply overwrite the passed protection flags. If we use the current value of the pgprot as the base, then it will behave as people are expecting. 
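To make the use case being fixed concrete, here is a minimal, hypothetical .fault handler using the vm_ops fault signature of this era; uc_page_fault and my_device_pfn are invented for this sketch, while vm_insert_pfn_prot(), pgprot_noncached() and the VM_FAULT_* codes are real interfaces:

static int uc_page_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* The caller's explicit request: an uncached mapping. */
	pgprot_t prot = pgprot_noncached(vma->vm_page_prot);

	/*
	 * vm_insert_pfn_prot() calls track_pfn_insert(); with this fix
	 * the PAT cache mode is layered on top of 'prot', whereas before
	 * it the prot was silently rebuilt from vma->vm_page_prot and
	 * the uncached request was lost.
	 */
	if (vm_insert_pfn_prot(vma, (unsigned long)vmf->virtual_address,
			       my_device_pfn, prot))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}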
Also fix track_pfn_remap() in the same way. Signed-off-by: Matthew Wilcox Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1453742717-10326-2-git-send-email-matthew.r.wilcox@intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536b0914..04e2e7144bee 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; -- cgit v1.2.1 From 4ecd16ec7059390b430af34bd8bc3ca2b5dcef9a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:06 -0800 Subject: x86/fpu: Fix math emulation in eager fpu mode Systems without an FPU are generally old and therefore use lazy FPU switching. Unsurprisingly, math emulation in eager FPU mode is a bit buggy. Fix it. There were two bugs involving kernel code trying to use the FPU registers in eager mode even if they didn't exist and one BUG_ON() that was incorrect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/b4b8d112436bd6fab866e1b4011131507e8d7fbe.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/traps.c | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440df63f1..a1f78a9fbf41 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -589,7 +589,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = new_fpu->fpstate_active && + fpu.preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c3fc1d..08e1e11a05ca 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,7 +423,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use.
*/ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a46b1d..87f80febf477 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -750,7 +750,6 @@ dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { -- cgit v1.2.1 From 5ed73f40735c68d8a656b46d09b1885d3b8740ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:07 -0800 Subject: x86/fpu: Fix FNSAVE usage in eagerfpu mode In eager fpu mode, having deactivated FPU without immediately reloading some other context is illegal. Therefore, to recover from FNSAVE, we can't just deactivate the state -- we need to reload it if we're not actively context switching. We had this wrong in fpu__save() and fpu__copy(). Fix both. __kernel_fpu_begin() was fine -- add a comment. This fixes a warning triggerable with nofxsr eagerfpu=on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/60662444e13c76f06e23c15c5dcdba31b4ac3d67.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 08e1e11a05ca..7a9244df33e2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -259,7 +267,11 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); } -- cgit v1.2.1 From a20d7297045f7fdcd676c15243192eb0e95a4306 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:08 -0800 Subject: x86/fpu: Fold fpu_copy() into fpu__copy() Splitting it into two functions needlessly obfuscated the code. While we're at it, improve the comment slightly. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/3eb5a63a9c5c84077b2677a7dfe684eef96fe59e.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7a9244df33e2..299b58bb975b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -231,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. - */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != &current->thread.fpu); /* @@ -251,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -274,16 +274,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } -- cgit v1.2.1 From c6ab109f7e0eae3bae3bb10f8ddb0df67735c150 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:09 -0800 Subject: x86/fpu: Speed up lazy FPU restores slightly If we have an FPU, there's no need to check CR0 for FPU emulation. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/980004297e233c27066d54e71382c44cdd36ef7c.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87f80febf477..36a9c017540e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -752,7 +752,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; conditional_sti(regs); -- cgit v1.2.1 From 58122bf1d856a4ea9581d62a07c557d997d46a19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:10 -0800 Subject: x86/fpu: Default eagerfpu=on on all CPUs We have eager and lazy FPU modes, introduced in: 304bceda6a18 ("x86, fpu: use non-lazy fpu restore for processors supporting xsave") The result is rather messy. There are two code paths in almost all of the FPU code, and only one of them (the eager case) is tested frequently, since most kernel developers have new enough hardware that we use eagerfpu. It seems that, on any remotely recent hardware, eagerfpu is a win: glibc uses SSE2, so laziness is probably overoptimistic, and, in any case, manipulating TS is far slower than saving and restoring the full state. (Stores to CR0.TS are serializing and are poorly optimized.) To try to shake out any latent issues on old hardware, this changes the default to eager on all CPUs. If no performance or functionality problems show up, a subsequent patch could remove lazy mode entirely. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/ac290de61bf08d9cfc2664a4f5080257ffc1075a.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7ef4c8..471fe277ff40 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -260,7 +260,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -271,7 +274,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that.
] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. @@ -348,15 +351,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. - */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) -- cgit v1.2.1 From e2c7698cd61f11d4077fdb28148b2d31b82ac848 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc9a2fda1404..d0cce90b0855 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. */ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v1.2.1 From f944b5a7aff05a244a6c8cac297819af09a199e4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 14 Jan 2016 10:54:13 +0100 Subject: genirq: Use a common macro to go through the actions list The irq code browses the list of actions differently to inspect the element one by one. Even if it is not a problem, for the sake of consistent code, provide a macro similar to for_each_irq_desc in order to have the same loop to go through the actions list and use it in the code. 
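To see the resulting pattern in isolation, here is a self-contained userspace re-creation (plain C, with the structures pared down to the two fields the macro touches; note this sketch spells the traversal out as (desc)->action):

#include <stdio.h>

struct irqaction {
	const char *name;
	struct irqaction *next;
};

struct irq_desc {
	struct irqaction *action;	/* head of the action list */
};

#define for_each_action_of_desc(desc, act) \
	for (act = (desc)->action; act; act = act->next)

int main(void)
{
	struct irqaction eth   = { .name = "eth0",  .next = NULL };
	struct irqaction timer = { .name = "timer", .next = &eth };
	struct irq_desc desc   = { .action = &timer };
	struct irqaction *action;

	for_each_action_of_desc(&desc, action)
		printf("%s\n", action->name);	/* prints "timer", then "eth0" */

	return 0;
}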
[ tglx: Renamed the macro ] Signed-off-by: Daniel Lezcano Link: http://lkml.kernel.org/r/1452765253-31148-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 ++---- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 8 +++----- kernel/irq/proc.c | 2 +- kernel/irq/spurious.c | 4 +--- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..eab521fc547c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..3ddd2297ee95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). 
*/ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.1 From f2cc8e0791c70833758101d9756609a08dd601ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 00:19:18 +0100 Subject: x86/cpufeature: Speed up cpu_feature_enabled() When GCC cannot do constant folding for this macro, it falls back to cpu_has(). But static_cpu_has() is optimal and it works at all times now. So use it and speedup the fallback case. Before we had this: mov 0x99d674(%rip),%rdx # ffffffff81b0d9f4 shr $0x2e,%rdx and $0x1,%edx jne ffffffff811704e9 After alternatives patching, it turns into: jmp 0xffffffff81170390 nopl (%rax) ... callq ffffffff81056e00 ffffffff81170390: mov 0x170(%r12),%rdi Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455578358-28347-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9fba7a5dd24a..68e4e8258b84 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -88,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * is not relevant. */ #define cpu_feature_enabled(bit) \ - (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ - cpu_has(&boot_cpu_data, bit)) + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) -- cgit v1.2.1 From 7f5301b7e66a1fd096b5d10dbb0bb2a8832516b4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:52 -0500 Subject: x86/platform: Make platform/intel-quark/imr.c explicitly non-modular The Kconfig currently controlling compilation of this code is: drivers/platform/x86/Kconfig:config INTEL_IMR drivers/platform/x86/Kconfig: bool "Intel Isolated Memory Region support" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. 
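The claim that init ordering is unchanged can be checked against the macro definitions the kernel already carries for builtin code (include/linux/module.h and include/linux/init.h, simplified here by dropping the section annotations):

/* include/linux/module.h, !MODULE case: */
#define module_init(x)		__initcall(x);

/* include/linux/init.h: */
#define __initcall(fn)		device_initcall(fn)
#define device_initcall(fn)	__define_initcall(fn, 6)

So for builtin-only code, module_init(imr_init) and device_initcall(imr_init) land in the same initcall level (6); writing the latter directly just makes the builtin-only nature explicit.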
Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index c61b6c332e97..0a3736f03edc 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ /** - * imr.c + * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -31,7 +31,6 @@ #include #include #include -#include #include struct imr_device { @@ -269,17 +268,6 @@ static int imr_debugfs_register(struct imr_device *idev) return PTR_ERR_OR_ZERO(idev->file); } -/** - * imr_debugfs_unregister - unregister debugfs hooks. - * - * @idev: pointer to imr_device structure. - * @return: - */ -static void imr_debugfs_unregister(struct imr_device *idev) -{ - debugfs_remove(idev->file); -} - /** * imr_check_params - check passed address range IMR alignment and non-zero size * @@ -614,7 +602,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_init - entry point for IMR driver. @@ -640,22 +627,4 @@ static int __init imr_init(void) imr_fixup_memmap(idev); return 0; } - -/** - * imr_exit - exit point for IMR code. - * - * Deregisters debugfs, leave IMR state as-is. - * - * return: - */ -static void __exit imr_exit(void) -{ - imr_debugfs_unregister(&imr_dev); -} - -module_init(imr_init); -module_exit(imr_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_init); -- cgit v1.2.1 From 32ed42ad6c42d6e0e78914a25c216fbbd595e74d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:53 -0500 Subject: x86/platform: Make platform/intel-quark/imr_selftest.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig.debug:config DEBUG_IMR_SELFTEST arch/x86/Kconfig.debug: bool "Isolated Memory Region self test" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. 
Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-quark/imr_selftest.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 278e4da4222f..0381343a0d3a 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ /** - * imr_selftest.c + * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -15,7 +15,6 @@ #include #include #include -#include #include #define SELFTEST KBUILD_MODNAME ": " @@ -106,7 +105,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_self_test_init - entry point for IMR driver. @@ -125,13 +123,4 @@ static int __init imr_self_test_init(void) * * return: */ -static void __exit imr_self_test_exit(void) -{ -} - -module_init(imr_self_test_init); -module_exit(imr_self_test_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_self_test_init); -- cgit v1.2.1 From eb61aee743ec647f22071479a5137241a7dcab17 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:54 -0500 Subject: x86/platform: Make platform/geode/geos.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config GEOS arch/x86/Kconfig: bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modularity, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. 
Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-4-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/geos.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index aa733fba2471..4fcdb91318a0 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -120,9 +119,4 @@ static int __init geos_init(void) return 0; } - -module_init(geos_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(geos_init); -- cgit v1.2.1 From 52d856e88171e1ec000d0479363c1e4e81991130 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:55 -0500 Subject: x86/platform: Make platform/geode/alix.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config ALIX arch/x86/Kconfig: bool "PCEngines ALIX System Support (LED setup)" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We replace module.h with moduleparam.h since the file does declare some module parameters, and leaving them as such is currently the easiest way to remain compatible with existing boot arg use cases. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Ed Wildgoose Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-5-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/alix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 76b6632d3143..1865c196f136 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,6 +35,11 @@ #define BIOS_SIGNATURE_COREBOOT 0x500 #define BIOS_REGION_SIZE 0x10000 +/* + * This driver is not modular, but to keep back compatibility + * with existing use cases, continuing with module_param is + * the easiest way forward. 
+ */ static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ @@ -192,9 +197,4 @@ static int __init alix_init(void) return 0; } - -module_init(alix_init); - -MODULE_AUTHOR("Ed Wildgoose "); -MODULE_DESCRIPTION("PCEngines ALIX System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(alix_init); -- cgit v1.2.1 From 605a46ee8353e8292e93baa5dc13e6be98bbec43 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:56 -0500 Subject: x86/platform: Make platform/geode/net5501.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config NET5501 arch/x86/Kconfig: bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modularity, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-6-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- arch/x86/platform/geode/net5501.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 927e38c0089f..a2f6b982a729 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -146,9 +145,4 @@ static int __init net5501_init(void) return 0; } - -module_init(net5501_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Soekris net5501 System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(net5501_init); -- cgit v1.2.1 From 25b4caf7c50e8c501310e8c515d8518b1850c948 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 14 Feb 2016 13:35:58 +0100 Subject: x86/boot: Remove unused 'is_big_kernel' variable Variable is_big_kernel is defined in arch/x86/boot/tools/build.c but never used anywhere. Boris noted that its usage went away 7 years ago, as of: 5e47c478b0b6 ("x86: remove zImage support") Signed-off-by: Nicolas Iooss Reviewed-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455453358-4088-1-git-send-email-nicolas.iooss_linux@m4x.org Signed-off-by: Ingo Molnar --- arch/x86/boot/tools/build.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a7661c430cd9..0702d2531bc7 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -49,7 +49,6 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 -- cgit v1.2.1 From fed6d3363182fd499cd7f70cf424cd3cba7aef26 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:56 +0100 Subject: irqchip/armada-370-xp: Add Kconfig option for the driver Instead of building the irq-armada-370-xp driver directly when CONFIG_ARCH_MVEBU is enabled, this commit introduces an intermediate CONFIG_ARMADA_370_XP_IRQ hidden Kconfig option. 
This allows this option to select other interrupt-related Kconfig options (which will be needed in follow-up commits) rather than having such selects done from arch/arm/mach-/. Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-2-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 5 +++++ drivers/irqchip/Makefile | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index fb50911b3940..dfe4ed8d838d 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -60,6 +60,11 @@ config ARM_VIC_NR The maximum number of VICs available in the system, for power management. +config ARMADA_370_XP_IRQ + bool + default y if ARCH_MVEBU + select GENERIC_IRQ_CHIP + config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb60d58..30dba044d0b8 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o obj-$(CONFIG_ARCH_HIP04) += irq-hip04.o obj-$(CONFIG_ARCH_MMP) += irq-mmp.o -obj-$(CONFIG_ARCH_MVEBU) += irq-armada-370-xp.o obj-$(CONFIG_IRQ_MXS) += irq-mxs.o obj-$(CONFIG_ARCH_TEGRA) += irq-tegra.o obj-$(CONFIG_ARCH_S3C24XX) += irq-s3c24xx.o @@ -28,6 +27,7 @@ obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-g obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o obj-$(CONFIG_ARM_NVIC) += irq-nvic.o obj-$(CONFIG_ARM_VIC) += irq-vic.o +obj-$(CONFIG_ARMADA_370_XP_IRQ) += irq-armada-370-xp.o obj-$(CONFIG_ATMEL_AIC_IRQ) += irq-atmel-aic-common.o irq-atmel-aic.o obj-$(CONFIG_ATMEL_AIC5_IRQ) += irq-atmel-aic-common.o irq-atmel-aic5.o obj-$(CONFIG_I8259) += irq-i8259.o -- cgit v1.2.1 From fcc392d501bd2befdf35180abcf07b4849499ed6 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:57 +0100 Subject: irqchip/armada-370-xp: Use the generic MSI infrastructure This commit moves the irq-armada-370-xp driver from using the PCI-specific MSI infrastructure to the generic MSI infrastructure, to which drivers are progressively converted. In this hardware, the MSI controller is directly bundled inside the interrupt controller, so we have a single Device Tree node to which multiple IRQ domains are attached: the wired interrupt domain and the MSI interrupt domain. In order to ensure that they can be differentiated, we have to force the bus_token of the wired interrupt domain to be DOMAIN_BUS_WIRED. The MSI domain bus_token is automatically set to the appropriate value by pci_msi_create_irq_domain().
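Condensed into a hedged outline (identifiers follow the driver's naming, but this compresses the actual code, which appears in the diff below; error handling is elided):

static int __init mpic_msi_setup_sketch(struct device_node *node)
{
	/* The wired-interrupt domain on the node, tagged explicitly ... */
	armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED;

	/*
	 * ... and the MSI stack on the same node: an inner domain that
	 * manages the doorbell hwirqs, wrapped by the generic PCI/MSI
	 * domain. pci_msi_create_irq_domain() assigns the MSI bus token
	 * itself, so the two domains can be told apart.
	 */
	armada_370_xp_msi_inner_domain =
		irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
				      &armada_370_xp_msi_domain_ops, NULL);
	pci_msi_create_irq_domain(of_node_to_fwnode(node),
				  &armada_370_xp_msi_domain_info,
				  armada_370_xp_msi_inner_domain);
	return 0;
}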
Signed-off-by: Thomas Petazzoni Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Link: https://lkml.kernel.org/r/1455115621-22846-3-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 1 + drivers/irqchip/irq-armada-370-xp.c | 149 +++++++++++++++--------------------- 2 files changed, 62 insertions(+), 88 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index dfe4ed8d838d..ba6a084f24cd 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -64,6 +64,7 @@ config ARMADA_370_XP_IRQ bool default y if ARCH_MVEBU select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config ATMEL_AIC_IRQ bool diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index 3f3a8c3d2175..e5738c5d7a6c 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -71,6 +71,7 @@ static u32 doorbell_mask_reg; static int parent_irq; #ifdef CONFIG_PCI_MSI static struct irq_domain *armada_370_xp_msi_domain; +static struct irq_domain *armada_370_xp_msi_inner_domain; static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR); static DEFINE_MUTEX(msi_used_lock); static phys_addr_t msi_doorbell_addr; @@ -115,127 +116,99 @@ static void armada_370_xp_irq_unmask(struct irq_data *d) #ifdef CONFIG_PCI_MSI -static int armada_370_xp_alloc_msi(void) -{ - int hwirq; - - mutex_lock(&msi_used_lock); - hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); - if (hwirq >= PCI_MSI_DOORBELL_NR) - hwirq = -ENOSPC; - else - set_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); +static struct irq_chip armada_370_xp_msi_irq_chip = { + .name = "armada_370_xp_msi_irq", + .irq_mask = pci_msi_mask_irq, + .irq_unmask = pci_msi_unmask_irq, +}; - return hwirq; -} +static struct msi_domain_info armada_370_xp_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .chip = &armada_370_xp_msi_irq_chip, +}; -static void armada_370_xp_free_msi(int hwirq) +static void armada_370_xp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - mutex_lock(&msi_used_lock); - if (!test_bit(hwirq, msi_used)) - pr_err("trying to free unused MSI#%d\n", hwirq); - else - clear_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); + msg->address_lo = lower_32_bits(msi_doorbell_addr); + msg->address_hi = upper_32_bits(msi_doorbell_addr); + msg->data = 0xf00 | (data->hwirq + PCI_MSI_DOORBELL_START); } -static int armada_370_xp_setup_msi_irq(struct msi_controller *chip, - struct pci_dev *pdev, - struct msi_desc *desc) +static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) { - struct msi_msg msg; - int virq, hwirq; + return -EINVAL; +} - /* We support MSI, but not MSI-X */ - if (desc->msi_attrib.is_msix) - return -EINVAL; +static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { + .name = "armada_370_xp_msi_irq", + .irq_compose_msi_msg = armada_370_xp_compose_msi_msg, + .irq_set_affinity = armada_370_xp_msi_set_affinity, +}; - hwirq = armada_370_xp_alloc_msi(); - if (hwirq < 0) - return hwirq; +static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + int hwirq; - virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq); - if (!virq) { - armada_370_xp_free_msi(hwirq); - return -EINVAL; + mutex_lock(&msi_used_lock); + hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); + if (hwirq >= PCI_MSI_DOORBELL_NR) { + 
mutex_unlock(&msi_used_lock); + return -ENOSPC; } - irq_set_msi_desc(virq, desc); + set_bit(hwirq, msi_used); + mutex_unlock(&msi_used_lock); - msg.address_lo = msi_doorbell_addr; - msg.address_hi = 0; - msg.data = 0xf00 | (hwirq + 16); + irq_domain_set_info(domain, virq, hwirq, &armada_370_xp_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); - pci_write_msi_msg(virq, &msg); - return 0; + return hwirq; } -static void armada_370_xp_teardown_msi_irq(struct msi_controller *chip, - unsigned int irq) +static void armada_370_xp_msi_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) { - struct irq_data *d = irq_get_irq_data(irq); - unsigned long hwirq = d->hwirq; - - irq_dispose_mapping(irq); - armada_370_xp_free_msi(hwirq); -} - -static struct irq_chip armada_370_xp_msi_irq_chip = { - .name = "armada_370_xp_msi_irq", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_unmask = pci_msi_unmask_irq, -}; + struct irq_data *d = irq_domain_get_irq_data(domain, virq); -static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hw) -{ - irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip, - handle_simple_irq); - - return 0; + mutex_lock(&msi_used_lock); + if (!test_bit(d->hwirq, msi_used)) + pr_err("trying to free unused MSI#%lu\n", d->hwirq); + else + clear_bit(d->hwirq, msi_used); + mutex_unlock(&msi_used_lock); } -static const struct irq_domain_ops armada_370_xp_msi_irq_ops = { - .map = armada_370_xp_msi_map, +static const struct irq_domain_ops armada_370_xp_msi_domain_ops = { + .alloc = armada_370_xp_msi_alloc, + .free = armada_370_xp_msi_free, }; static int armada_370_xp_msi_init(struct device_node *node, phys_addr_t main_int_phys_base) { - struct msi_controller *msi_chip; u32 reg; - int ret; msi_doorbell_addr = main_int_phys_base + ARMADA_370_XP_SW_TRIG_INT_OFFS; - msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL); - if (!msi_chip) + armada_370_xp_msi_inner_domain = + irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, + &armada_370_xp_msi_domain_ops, NULL); + if (!armada_370_xp_msi_inner_domain) return -ENOMEM; - msi_chip->setup_irq = armada_370_xp_setup_msi_irq; - msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq; - msi_chip->of_node = node; - armada_370_xp_msi_domain = - irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, - &armada_370_xp_msi_irq_ops, - NULL); + pci_msi_create_irq_domain(of_node_to_fwnode(node), + &armada_370_xp_msi_domain_info, + armada_370_xp_msi_inner_domain); if (!armada_370_xp_msi_domain) { - kfree(msi_chip); + irq_domain_remove(armada_370_xp_msi_inner_domain); return -ENOMEM; } - ret = of_pci_msi_chip_add(msi_chip); - if (ret < 0) { - irq_domain_remove(armada_370_xp_msi_domain); - kfree(msi_chip); - return ret; - } - reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS) | PCI_MSI_DOORBELL_MASK; @@ -427,12 +400,12 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) continue; if (is_chained) { - irq = irq_find_mapping(armada_370_xp_msi_domain, + irq = irq_find_mapping(armada_370_xp_msi_inner_domain, msinr - 16); generic_handle_irq(irq); } else { irq = msinr - 16; - handle_domain_irq(armada_370_xp_msi_domain, + handle_domain_irq(armada_370_xp_msi_inner_domain, irq, regs); } } @@ -604,8 +577,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node, armada_370_xp_mpic_domain = irq_domain_add_linear(node, nr_irqs, &armada_370_xp_mpic_irq_ops, NULL); - 
BUG_ON(!armada_370_xp_mpic_domain); + armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED; /* Setup for the boot CPU */ armada_xp_mpic_perf_init(); -- cgit v1.2.1 From 0636bab67fa4df68c85b28ba345596a72986fa9e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:58 +0100 Subject: irqchip/armada-370-xp: Use PCI_MSI_DOORBELL_START where appropriate As suggested by Gregory Clement, this commit adjusts the irq-armada-370-xp driver to use the PCI_MSI_DOORBELL_START define in the armada_370_xp_handle_msi_irq() function, rather than hardcoding its value. Suggested-by: Gregory CLEMENT Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-4-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-armada-370-xp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index e5738c5d7a6c..f53eb716eccb 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -401,10 +401,10 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) if (is_chained) { irq = irq_find_mapping(armada_370_xp_msi_inner_domain, - msinr - 16); + msinr - PCI_MSI_DOORBELL_START); generic_handle_irq(irq); } else { - irq = msinr - 16; + irq = msinr - PCI_MSI_DOORBELL_START; handle_domain_irq(armada_370_xp_msi_inner_domain, irq, regs); } -- cgit v1.2.1 From f692a172deb5cda8874a44782401901597c526a2 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:59 +0100 Subject: irqchip/armada-370-xp: Use shorter names for irq_chip In order to make the output of /proc/interrupts more readable, use shorter names for the irq_chip registered by the irq-armada-370-xp driver. Using capital letters also better matches what is done for the GIC driver, which uses just "GIC" as the irq_chip->name.
Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-5-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-armada-370-xp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index f53eb716eccb..c99ae5fc5500 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -117,7 +117,7 @@ static void armada_370_xp_irq_unmask(struct irq_data *d) #ifdef CONFIG_PCI_MSI static struct irq_chip armada_370_xp_msi_irq_chip = { - .name = "armada_370_xp_msi_irq", + .name = "MPIC MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; @@ -141,7 +141,7 @@ static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data, } static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { - .name = "armada_370_xp_msi_irq", + .name = "MPIC MSI", .irq_compose_msi_msg = armada_370_xp_compose_msi_msg, .irq_set_affinity = armada_370_xp_msi_set_affinity, }; @@ -253,7 +253,7 @@ static int armada_xp_set_affinity(struct irq_data *d, #endif static struct irq_chip armada_370_xp_irq_chip = { - .name = "armada_370_xp_irq", + .name = "MPIC", .irq_mask = armada_370_xp_irq_mask, .irq_mask_ack = armada_370_xp_irq_mask, .irq_unmask = armada_370_xp_irq_unmask, -- cgit v1.2.1 From a71b9412c90ca784ae68d32ec307cc527b5962a9 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:47:00 +0100 Subject: irqchip/armada-370-xp: Allow allocation of multiple MSIs Add support for allocating multiple MSIs at the same time, so that the MSI_FLAG_MULTI_PCI_MSI flag can be added to the msi_domain_info structure. Signed-off-by: Thomas Petazzoni Reviewed-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-6-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-armada-370-xp.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index c99ae5fc5500..e7dc6cbda2a1 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -123,7 +123,8 @@ static struct irq_chip armada_370_xp_msi_irq_chip = { }; static struct msi_domain_info armada_370_xp_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI), .chip = &armada_370_xp_msi_irq_chip, }; @@ -149,21 +150,26 @@ static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *args) { - int hwirq; + int hwirq, i; mutex_lock(&msi_used_lock); - hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); + + hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR, + 0, nr_irqs, 0); if (hwirq >= PCI_MSI_DOORBELL_NR) { mutex_unlock(&msi_used_lock); return -ENOSPC; } - set_bit(hwirq, msi_used); + bitmap_set(msi_used, hwirq, nr_irqs); mutex_unlock(&msi_used_lock); - irq_domain_set_info(domain, virq, hwirq, &armada_370_xp_msi_bottom_irq_chip, - domain->host_data, handle_simple_irq, - NULL, NULL); + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &armada_370_xp_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); + } return hwirq; } @@ -174,10 +180,7 
@@ static void armada_370_xp_msi_free(struct irq_domain *domain, struct irq_data *d = irq_domain_get_irq_data(domain, virq); mutex_lock(&msi_used_lock); - if (!test_bit(d->hwirq, msi_used)) - pr_err("trying to free unused MSI#%lu\n", d->hwirq); - else - clear_bit(d->hwirq, msi_used); + bitmap_clear(msi_used, d->hwirq, nr_irqs); mutex_unlock(&msi_used_lock); } -- cgit v1.2.1 From cb49d86dbe7a3277fb329c30206d4d95470c879c Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:47:01 +0100 Subject: ARM: mvebu: Use the ARMADA_370_XP_IRQ option Now that there is an ARMADA_370_XP_IRQ option to enable the irqchip driver for Armada 370, XP, 375, 38x and 39x, let's select this option when needed. Note that this selection is currently not mandatory because ARMADA_370_XP_IRQ is for now always enabled when ARCH_MVEBU=y, but this is something that we will change in the future, and therefore we should make the relevant platforms select ARMADA_370_XP_IRQ when needed. Due to this, selecting GENERIC_IRQ_CHIP is no longer needed. Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-7-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- arch/arm/mach-mvebu/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index 64e3d2ce9a07..b003e3afd693 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig @@ -3,7 +3,6 @@ menuconfig ARCH_MVEBU depends on ARCH_MULTI_V7 || ARCH_MULTI_V5 select ARCH_SUPPORTS_BIG_ENDIAN select CLKSRC_MMIO - select GENERIC_IRQ_CHIP select PINCTRL select PLAT_ORION select SOC_BUS @@ -29,6 +28,7 @@ config MACH_ARMADA_370 bool "Marvell Armada 370 boards" depends on ARCH_MULTI_V7 select ARMADA_370_CLK + select ARMADA_370_XP_IRQ select CPU_PJ4B select MACH_MVEBU_V7 select PINCTRL_ARMADA_370 @@ -39,6 +39,7 @@ config MACH_ARMADA_370 config MACH_ARMADA_375 bool "Marvell Armada 375 boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC @@ -58,6 +59,7 @@ config MACH_ARMADA_38X select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_38X_CLK select HAVE_ARM_SCU select HAVE_ARM_TWD if SMP @@ -72,6 +74,7 @@ config MACH_ARMADA_39X bool "Marvell Armada 39x boards" depends on ARCH_MULTI_V7 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_39X_CLK select CACHE_L2X0 select HAVE_ARM_SCU @@ -86,6 +89,7 @@ config MACH_ARMADA_39X config MACH_ARMADA_XP bool "Marvell Armada XP boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARMADA_XP_CLK select CPU_PJ4B select MACH_MVEBU_V7 -- cgit v1.2.1 From 63131b636a0ec969e8b98122c1a928b5a2649d3b Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 8 Feb 2016 18:14:10 +0100 Subject: irqchip/armada-370-xp: Do not enable it by default when ARCH_MVEBU is selected The irq-armada-370-xp driver can only be built for 32-bit ARM. The mvebu family has grown with a new ARM64 SoC which will also select the ARCH_MVEBU configuration. Since "ARM: mvebu: use the ARMADA_370_XP_IRQ option", the ARM32 mvebu SoCs directly select this new option. Selecting it by default when ARCH_MVEBU is selected is therefore no longer needed. This patch removes this dependency; thanks to this, a kernel for an ARM64 mvebu SoC can be built without errors caused by this driver.
Signed-off-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1454951660-13289-3-git-send-email-gregory.clement@free-electrons.com Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index ba6a084f24cd..8115a32a553c 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -62,7 +62,6 @@ config ARM_VIC_NR config ARMADA_370_XP_IRQ bool - default y if ARCH_MVEBU select GENERIC_IRQ_CHIP select PCI_MSI_IRQ_DOMAIN if PCI_MSI -- cgit v1.2.1 From e54fdcca70a33a7e447e526b305db85e978c0563 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:01 -0800 Subject: x86/signal/64: Add a comment about sigcontext->fs and gs These fields have a strange history. This tries to document it. This borrows from 9a036b93a344 ("x86/signal/64: Remove 'fs' and 'gs' from sigcontext"), which was reverted by ed596cde9425 ("Revert x86 sigcontext cleanups"). Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/baa78f3c84106fa5acbc319377b1850602f5deec.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/sigcontext.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d485232f1e9f..702c40468859 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -341,6 +341,31 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * These slots should never be reused without extreme caution: + * + * - Some DOSEMU versions stash fs and gs in these slots manually, + * thus overwriting anything the kernel expects to be preserved + * in these slots. + * + * - If these slots are ever needed for any other purpose, + * there is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there + * is no TLS API beyond modify_ldt that works in both pre- + * and post-2.5.64 kernels. + * + * If the kernel ever adds explicit fs, gs, fsbase, and gsbase + * save/restore, it will most likely need to be opt-in and use + * different context slots. + */ __u16 gs; __u16 fs; __u16 __pad0; -- cgit v1.2.1 From 8ff5bd2e1e2767fbf737f84d5f92668dafe7e7b0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:02 -0800 Subject: x86/signal/64: Fix SS if needed when delivering a 64-bit signal Signals are always delivered to 64-bit tasks with CS set to a long mode segment. In long mode, SS doesn't matter as long as it's a present writable segment. If SS starts out invalid (this can happen if the signal was caused by an IRET fault or was delivered on the way out of set_thread_area or modify_ldt), then IRET to the signal handler can fail, eventually killing the task. The straightforward fix would be to simply reset SS when delivering a signal. 
That breaks DOSEMU, though: 64-bit builds of DOSEMU rely on SS being set to the faulting SS when signals are delivered. As a compromise, this patch leaves SS alone so long as it's valid. The net effect should be that the behavior of successfully delivered signals is unchanged. Some signals that would previously have failed to be delivered will now be delivered successfully. This has no effect for x32 or 32-bit tasks: their signal handlers were already called with SS == __USER_DS. (On Xen, there's a slight hole: if a task sets SS to a writable *kernel* data segment, then we will fail to identify it as invalid and we'll still kill the task. If anyone cares, this could be fixed with a new paravirt hook.) Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/163c6e1eacde41388f3ff4d2fe6769be651d7b6e.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc_defs.h | 23 ++++++++++++++++++ arch/x86/kernel/signal.c | 51 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 278441f39856..eb5deb42484d 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -98,4 +98,27 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Access rights as returned by LAR */ +#define AR_TYPE_RODATA (0 * (1 << 9)) +#define AR_TYPE_RWDATA (1 * (1 << 9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9)) +#define AR_TYPE_XOCODE (4 * (1 << 9)) +#define AR_TYPE_XRCODE (5 * (1 << 9)) +#define AR_TYPE_XOCODE_CONF (6 * (1 << 9)) +#define AR_TYPE_XRCODE_CONF (7 * (1 << 9)) +#define AR_TYPE_MASK (7 * (1 << 9)) + +#define AR_DPL0 (0 * (1 << 13)) +#define AR_DPL3 (3 * (1 << 13)) +#define AR_DPL_MASK (3 * (1 << 13)) + +#define AR_A (1 << 8) /* "Accessed" */ +#define AR_S (1 << 12) /* If clear, "System" segment */ +#define AR_P (1 << 15) /* "Present" */ +#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */ +#define AR_L (1 << 21) /* "Long mode" for code segments */ +#define AR_DB (1 << 22) /* D/B, effect depends on type */ +#define AR_G (1 << 23) /* "Granularity" (limit in pages) */ + #endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c07ff5ddbd47..52f82c7ef57d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,6 +61,35 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. 
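+	 * (That is why the LAR instruction is used above: it hands back
+	 * the raw access rights, so DPL, S, P and the descriptor type
+	 * can all be checked in one go below.)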
+ */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { unsigned long buf_val; @@ -459,10 +488,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatibility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ -- cgit v1.2.1 From 6c25da5ad55d48c41b8909bc1f4e3cd5d85bb499 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:03 -0800 Subject: x86/signal/64: Re-add support for SS in the 64-bit signal context This is a second attempt to make the improvements from c6f2062935c8 ("x86/signal/64: Fix SS handling for signals delivered to 64-bit programs"), which was reverted by 51adbfbba5c6 ("x86/signal/64: Add support for SS in the 64-bit signal context"). This adds two new uc_flags flags. UC_SIGCONTEXT_SS will be set for all 64-bit signals (including x32). It indicates that the saved SS field is valid and that the kernel supports the new behavior. The goal is to fix a problem with signal handling in 64-bit tasks: SS wasn't saved in the 64-bit signal context, making it awkward to determine what SS was at the time of signal delivery and making it impossible to return to a non-flat SS (as calling sigreturn clobbers SS). This also made it extremely difficult for 64-bit tasks to return to fully-defined 16-bit contexts, because only the kernel can easily do espfix64, but sigreturn was unable to set a non-flat SS:ESP. (DOSEMU has a monstrous hack to partially work around this limitation.) If we could go back in time, the correct fix would be to make 64-bit signals work just like 32-bit signals with respect to SS: save it in signal context, reset it when delivering a signal, and restore it in sigreturn. Unfortunately, doing that (as I tried originally) breaks DOSEMU: DOSEMU wouldn't reset the signal context's SS when clearing the LDT and changing the saved CS to 64-bit mode, since it predates the SS context field existing in the first place. This patch is a bit more complicated, and it tries to balance a bunch of goals. It makes most cases of changing ucontext->ss during signal handling work as expected. I do this by special-casing the interesting case.
On sigreturn, ucontext->ss will be honored by default, unless the ucontext was created from scratch by an old program and had a 64-bit CS (unfortunately, CRIU can do this) or was the result of changing a 32-bit signal context to 64-bit without resetting SS (as DOSEMU does). For the benefit of new 64-bit software that uses segmentation (new versions of DOSEMU might), the new behavior can be detected with a new ucontext flag UC_SIGCONTEXT_SS. To avoid compilation issues, __pad0 is left as an alias for ss in ucontext. The nitty-gritty details are documented in the header file. This patch also re-enables the sigreturn_64 and ldt_gdt_64 selftests, as the kernel change allows both of them to pass. Tested-by: Stas Sergeev Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/749149cbfc3e75cd7fcdad69a854b399d792cc6f.1455664054.git.luto@kernel.org [ Small readability edit. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sighandling.h | 1 - arch/x86/include/uapi/asm/sigcontext.h | 7 ++-- arch/x86/include/uapi/asm/ucontext.h | 53 +++++++++++++++++++++++++--- arch/x86/kernel/signal.c | 63 ++++++++++++++++++++++++---------- tools/testing/selftests/x86/Makefile | 5 ++- 5 files changed, 99 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 89db46752a8f..452c88b8ad06 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,7 +13,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 702c40468859..62d4111c1c54 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -256,7 +256,7 @@ struct sigcontext_64 { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; @@ -368,7 +368,10 @@ struct sigcontext { */ __u16 gs; __u16 fs; - __u16 __pad0; + union { + __u16 ss; /* If UC_SIGCONTEXT_SS */ + __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */ + }; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8017f2..e3d1ec90616e 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,11 +1,54 @@ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +/* + * Indicates the presence of extended state information in the memory + * layout pointed by the fpstate pointer in the ucontext's sigcontext + * struct (uc_mcontext). + */ +#define UC_FP_XSTATE 0x1 + +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. 
All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + * + * This behavior serves three purposes: + * + * - Legacy programs that construct a 64-bit sigcontext from scratch + * with zero or garbage in the SS slot (e.g. old CRIU) and call + * sigreturn will still work. + * + * - Old DOSEMU versions sometimes catch a signal from a segmented + * context, delete the old SS segment (with modify_ldt), and change + * the saved CS to a 64-bit segment. These DOSEMU versions expect + * sigreturn to send them back to 64-bit mode without killing them, + * despite the fact that the SS selector when the signal was raised is + * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel + * will fix up SS for these DOSEMU versions. + * + * - Old and new programs that catch a signal and return without + * modifying the saved context will end up in exactly the state they + * started in, even if they were running in a segmented context when + * the signal was raised. Old kernels would lose track of the + * previous SS value. + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 52f82c7ef57d..548ddf7d6fd2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -90,7 +90,9 @@ static void force_valid_ss(struct pt_regs *regs) } #endif -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -123,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU.
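+	 * (Old CRIU can build a 64-bit sigcontext from scratch with
+	 * garbage in the SS slot, and old DOSEMU can flip a 32-bit
+	 * context to a 64-bit CS without resetting SS; see the
+	 * UC_STRICT_RESTORE_SS comment in uapi/asm/ucontext.h.)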
+ */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -194,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -432,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -451,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -536,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -601,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. 
+ */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -617,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -813,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -820,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index df4f767f48da..d5ce7d7aae3e 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,10 +5,9 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ - check_initial_reg_state -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ + check_initial_reg_state sigreturn ldt_gdt +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ - ldt_gdt \ vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) -- cgit v1.2.1 From 4f6c893822932622fad2b46cc30467be9f20ee5d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:04 -0800 Subject: selftests/x86: Add tests for UC_SIGCONTEXT_SS and UC_STRICT_RESTORE_SS This tests the two ABI-preserving cases that DOSEMU cares about, and it also explicitly tests the new UC_SIGCONTEXT_SS and UC_STRICT_RESTORE_SS flags. Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Shuah Khan Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f3d08f98541d0bd3030ceb35e05e21f59e30232c.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/sigreturn.c | 230 ++++++++++++++++++++++++++++---- 1 file changed, 202 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index b5aa1bab7416..8a577e7070c6 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -54,6 +54,37 @@ #include #include +/* Pull in AR_xyz defines. 
*/ +typedef unsigned int u32; +typedef unsigned short u16; +#include "../../../../arch/x86/include/asm/desc_defs.h" + +/* + * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc + * headers. + */ +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif + /* * In principle, this test can run on Linux emulation layers (e.g. * Illumos "LX branded zones"). Solaris-based kernels reserve LDT @@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs; /* Instructions for the SIGUSR1 handler. */ static volatile unsigned short sig_cs, sig_ss; static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; +#ifdef __x86_64__ +static volatile sig_atomic_t sig_corrupt_final_ss; +#endif /* Abstractions for some 32-bit vs 64-bit differences. */ #ifdef __x86_64__ @@ -305,9 +339,105 @@ static greg_t *csptr(ucontext_t *ctx) } #endif +/* + * Checks a given selector for its code bitness or returns -1 if it's not + * a usable code segment selector. + */ +int cs_bitness(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return -1; + + bool db = (ar & (1 << 22)); + bool l = (ar & (1 << 21)); + + if (!(ar & (1<<11))) + return -1; /* Not code. */ + + if (l && !db) + return 64; + else if (!l && db) + return 32; + else if (!l && !db) + return 16; + else + return -1; /* Unknown bitness. */ +} + +/* + * Checks whether a given selector is usable as a stack segment (SS): a + * present, read-write (possibly expand-down) data segment. + */ +bool is_valid_ss(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return false; + + if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA && + (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN) + return false; + + return (ar & AR_P); +} + /* Number of errors in the current test case. */ static volatile sig_atomic_t nerrs; +static void validate_signal_ss(int sig, ucontext_t *ctx) +{ +#ifdef __x86_64__ + bool was_64bit = (cs_bitness(*csptr(ctx)) == 64); + + if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) { + printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n"); + nerrs++; + + /* + * This happens on Linux 4.1. The rest will fail, too, so + * return now to reduce the noise. + */ + return; + } + + /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */ + if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n", + sig); + nerrs++; + } + + if (is_valid_ss(*ssptr(ctx))) { + /* + * DOSEMU was written before 64-bit sigcontext had SS, and + * it tries to figure out the signal source SS by looking at + * the physical register.
Make sure that keeps working. + */ + unsigned short hw_ss; + asm ("mov %%ss, %0" : "=rm" (hw_ss)); + if (hw_ss != *ssptr(ctx)) { + printf("[FAIL]\tHW SS didn't match saved SS\n"); + nerrs++; + } + } +#endif +} + /* * SIGUSR1 handler. Sets CS and SS as requested and points IP to the * int3 trampoline. Sets SP to a large known value so that we can see @@ -317,6 +447,8 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + validate_signal_ss(sig, ctx); + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); *csptr(ctx) = sig_cs; @@ -334,13 +466,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) } /* - * Called after a successful sigreturn. Restores our state so that - * the original raise(SIGUSR1) returns. + * Called after a successful sigreturn (via int3) or from a failed + * sigreturn (directly by kernel). Restores our state so that the + * original raise(SIGUSR1) returns. */ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + validate_signal_ss(sig, ctx); + sig_err = ctx->uc_mcontext.gregs[REG_ERR]; sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; @@ -358,41 +493,62 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); +#ifdef __x86_64__ + if (sig_corrupt_final_ss) { + if (ctx->uc_flags & UC_STRICT_RESTORE_SS) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n"); + nerrs++; + } else { + /* + * DOSEMU transitions from 32-bit to 64-bit mode by + * adjusting sigcontext, and it requires that this work + * even if the saved SS is bogus. + */ + printf("\tCorrupting SS on return to 64-bit mode\n"); + *ssptr(ctx) = 0; + } + } +#endif + sig_trapped = sig; } -/* - * Checks a given selector for its code bitness or returns -1 if it's not - * a usable code segment selector. - */ -int cs_bitness(unsigned short cs) +#ifdef __x86_64__ +/* Tests recovery if !UC_STRICT_RESTORE_SS */ +static void sigusr2(int sig, siginfo_t *info, void *ctx_void) { - uint32_t valid = 0, ar; - asm ("lar %[cs], %[ar]\n\t" - "jnz 1f\n\t" - "mov $1, %[valid]\n\t" - "1:" - : [ar] "=r" (ar), [valid] "+rm" (valid) - : [cs] "r" (cs)); + ucontext_t *ctx = (ucontext_t*)ctx_void; - if (!valid) - return -1; + if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) { + printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n"); + nerrs++; + return; /* We can't do the rest. */ + } - bool db = (ar & (1 << 22)); - bool l = (ar & (1 << 21)); + ctx->uc_flags &= ~UC_STRICT_RESTORE_SS; + *ssptr(ctx) = 0; - if (!(ar & (1<<11))) - return -1; /* Not code. */ + /* Return. The kernel should recover without sending another signal. */ +} - if (l && !db) - return 64; - else if (!l && db) - return 32; - else if (!l && !db) - return 16; - else - return -1; /* Unknown bitness. */ +static int test_nonstrict_ss(void) +{ + clearhandler(SIGUSR1); + clearhandler(SIGTRAP); + clearhandler(SIGSEGV); + clearhandler(SIGILL); + sethandler(SIGUSR2, sigusr2, 0); + + nerrs = 0; + + printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n"); + raise(SIGUSR2); + if (!nerrs) + printf("[OK]\tIt worked\n"); + + return nerrs; } +#endif /* Finds a usable code segment of the requested bitness. 
*/ int find_cs(int bitness) @@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs) errdesc, strsignal(sig_trapped)); return 0; } else { + /* + * This also implicitly tests UC_STRICT_RESTORE_SS: + * We check that these signals set UC_STRICT_RESTORE_SS and, + * if UC_STRICT_RESTORE_SS doesn't cause strict behavior, + * then we won't get SIGSEGV. + */ printf("[FAIL]\tDid not get SIGSEGV\n"); return 1; } @@ -632,6 +794,14 @@ int main() GDT3(gdt_data16_idx)); } +#ifdef __x86_64__ + /* Nasty ABI case: check SS corruption handling. */ + sig_corrupt_final_ss = 1; + total_nerrs += test_valid_sigreturn(32, false, -1); + total_nerrs += test_valid_sigreturn(32, true, -1); + sig_corrupt_final_ss = 0; +#endif + /* * We're done testing valid sigreturn cases. Now we test states * for which sigreturn itself will succeed but the subsequent @@ -680,5 +850,9 @@ int main() if (gdt_npdata32_idx) test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); +#ifdef __x86_64__ + total_nerrs += test_nonstrict_ss(); +#endif + return total_nerrs ? 1 : 0; } -- cgit v1.2.1 From 84aba677f009e20185aea322563389ad56e0ef7e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 16 Feb 2016 09:43:19 +0100 Subject: x86/microcode: Remove unnecessary paravirt_enabled check Commit: a18a0f6850d4 ("x86, microcode: Don't initialize microcode code on paravirt") added a paravirt test in microcode_init(), primarily to avoid making mc_bp_resume()->load_ucode_ap()->check_loader_disabled_ap() calls because on 32-bit kernels this callchain ends up using __pa_nodebug() macro which is invalid for Xen PV guests. A subsequent commit: fbae4ba8c4a3 ("x86, microcode: Reload microcode on resume") eliminated this callchain thus making a18a0f6850d4 unnecessary. Signed-off-by: Boris Ostrovsky Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1455612202-14414-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cea8552e2b3a..ac360bfbbdb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -623,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) -- cgit v1.2.1 From 9cc6f743c7724eb9abaf27904194c169db85dd31 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Tue, 16 Feb 2016 09:43:20 +0100 Subject: x86/microcode: Use kmemdup() rather than duplicating its implementation The patch was generated using fixed coccinelle semantic patch scripts/coccinelle/api/memdup.cocci. 
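Schematically, the transformation this semantic patch performs looks like the following sketch (a generic illustration of the pattern, not lines taken from this diff):

	/* before: open-coded duplication */
	p = kmalloc(len, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	memcpy(p, src, len);

	/* after: one helper with the same semantics */
	p = kmemdup(src, len, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

Such patches can typically be regenerated with an invocation along the lines of "make coccicheck MODE=patch COCCI=scripts/coccinelle/api/memdup.cocci" (shown as an example; the exact options depend on the tree).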
Signed-off-by: Andrzej Hajda Signed-off-by: Borislav Petkov Cc: Bartlomiej Zolnierkiewicz Cc: Linus Torvalds Cc: Marek Szyprowski Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/microcode/amd.c | 4 +--- arch/x86/kernel/cpu/microcode/intel.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f66cbfe74ce4..e397fc160b05 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -788,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... */ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cb397947f688..cbb3cf09b065 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -210,13 +210,11 @@ save_microcode(struct mc_saved_data *mcs, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* -- cgit v1.2.1 From f1b92bb6b5a4e17b508f128b084fa00e0eda590c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:21 +0100 Subject: x86/ftrace, x86/asm: Kill ftrace_caller_end label One of ftrace_caller_end and ftrace_return is redundant so unify them. Rename ftrace_return to ftrace_epilogue to mean that everything after that label represents, like an afterword, work which happens *after* the ftrace call, e.g., the function graph tracer for one. Steve wants this to rather mean "[a]n event which reflects meaningfully on a recently ended conflict or struggle." I can imagine that ftrace can be a struggle sometimes. Anyway, beef up the comment about the code contents and layout before ftrace_epilogue label. Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 11 +++++------ arch/x86/kernel/mcount_64.S | 14 ++++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6d6626..04f9641e0cb6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. */ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762e2bca..ed48a9f465f8 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) -- cgit v1.2.1 From 053080a9d1c8cf1950115ad92ce94242ebc5f25c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:22 +0100 Subject: x86/msr: Document msr-index.h rule for addition In order to keep this file's size sensible and not cause too much unnecessary churn, make the rule explicit - similar to pci_ids.h - that only MSRs which are used in multiple compilation units, should get added to it. 
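As a concrete, made-up illustration of the rule: an MSR with a single consumer stays next to that consumer,

	/* drivers/foo/bar.c -- hypothetical sole user, keep the define local */
	#define MSR_FOO_PRIVATE_CTL	0x00000123	/* fictitious MSR number */

and the definition only moves into msr-index.h once a second compilation unit needs it.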
Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: alex.williamson@redhat.com Cc: gleb@kernel.org Cc: joro@8bytes.org Cc: kvm@vger.kernel.org Cc: sherry.hurwitz@amd.com Cc: wei@redhat.com Link: http://lkml.kernel.org/r/1455612202-14414-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..984ab75bf621 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H -/* CPU model specific register (MSR) numbers */ +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ -- cgit v1.2.1 From adcfd23ead69965e3ac3e69f56451dab5e39157a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:25 -0800 Subject: selftests/x86: Fix some error messages in ptrace_syscall I had some obvious typos. Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert O'Callahan Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5e6772d4802986cf7df702e646fa24ac14f2204.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ptrace_syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 5105b49cd8aa..c6a0acc0a632 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -187,7 +187,7 @@ static void test_ptrace_syscall_restart(void) printf("[RUN]\tSYSEMU\n"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) @@ -218,7 +218,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) @@ -250,7 +250,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) -- cgit v1.2.1 From 403613432222549727e5d70cf44da8987653202a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:26 -0800 Subject: selftests/x86: Add a test for syscall restart under ptrace This catches a regression from the compat syscall rework. The 32-bit variant of this test currently fails. The issue is that, for a 32-bit tracer and a 32-bit tracee, GETREGS+SETREGS with no changes should be a no-op. It currently isn't a no-op if RAX indicates signal restart, because the high bits get cleared and the kernel loses track of the restart state.
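The restart detection in question relies on sign extension roughly as follows (a paraphrased sketch of x86's syscall_get_error(), not the verbatim kernel source):

	static long syscall_get_error_sketch(struct task_struct *task,
					     struct pt_regs *regs)
	{
		unsigned long error = regs->ax;

		/* TS_COMPAT marks a 32-bit syscall entry */
		if (task_thread_info(task)->status & TS_COMPAT)
			/* sign-extend so 32-bit -ERESTART* values compare correctly */
			error = (long) (int) error;

		return IS_ERR_VALUE(error) ? error : 0;
	}

A 32-bit SETREGS naturally truncates regs->ax to 32 bits, so if TS_COMPAT has already been cleared by the time signals are delivered, the negative -ERESTART* value goes unrecognized and the restart is lost.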
Reported-by: Robert O'Callahan Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c4040b40b5b4a37ed31375a69b683f753ec6788a.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ptrace_syscall.c | 126 +++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index c6a0acc0a632..421456784bc6 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), err(1, "sigaction"); } +static void setsigign(int sig, int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = (void *)SIG_IGN; + sa.sa_flags = flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + static void clearhandler(int sig) { struct sigaction sa; @@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void) } } +static void test_restart_under_ptrace(void) +{ + printf("[RUN]\tkernel syscall restart under ptrace\n"); + pid_t chld = fork(); + if (chld < 0) + err(1, "fork"); + + if (chld == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + printf("\tChild will take a nap until signaled\n"); + setsigign(SIGUSR1, SA_RESTART); + raise(SIGSTOP); + + syscall(SYS_pause, 0, 0, 0, 0, 0, 0); + _exit(0); + } + + int status; + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + struct user_regs_struct regs; + + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tInitial nr and args are correct\n"); + } + + /* Interrupt it. */ + kill(chld, SIGUSR1); + + /* Advance. We should be stopped at exit.
*/ + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n", + (long)regs.user_ax); + } + + /* Poke the regs back in. This must not break anything. */ + if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_SETREGS"); + + /* Catch the (ignored) SIGUSR1. */ + if (ptrace(PTRACE_CONT, chld, 0, 0) != 0) + err(1, "PTRACE_CONT"); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + if (!WIFSTOPPED(status)) { + printf("[FAIL]\tChild was not stopped for SIGUSR1 (status = 0x%x)\n", status); + nerrs++; + } else { + printf("[OK]\tChild got SIGUSR1\n"); + } + + /* The next event should be pause(2) again. */ + printf("[RUN]\tStep again\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tpause(2) restarted correctly\n"); + } + + /* Kill it. */ + kill(chld, SIGKILL); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); +} + int main() { printf("[RUN]\tCheck int80 return regs\n"); @@ -290,5 +414,7 @@ int main() test_ptrace_syscall_restart(); + test_restart_under_ptrace(); + return 0; } -- cgit v1.2.1 From 4e79e182b419172e35936a47f098509092d69817 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:27 -0800 Subject: x86/entry/compat: Keep TS_COMPAT set during signal delivery Signal delivery needs to know the sign of an interrupted syscall's return value in order to detect -ERESTART variants. Normally this works independently of bitness because syscalls internally return long. Under ptrace, however, this can break, and syscall_get_error is supposed to sign-extend regs->ax if needed. We were clearing TS_COMPAT too early, though, and this prevented sign extension, which subtly broke syscall restart under ptrace. Reported-by: Robert O'Callahan Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 4.3.x- Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C") Link: http://lkml.kernel.org/r/cbce3cf545522f64eb37f5478cb59746230db3b5.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index c6ab2ebb5f4f..1a000f59f04a 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -269,6 +269,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -276,12 +277,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -333,14 +344,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. - */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } -- cgit v1.2.1 From 07ba4b061a79896315a7be4b123de12df6a9d2bd Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sat, 23 Jan 2016 13:57:46 +0100 Subject: irqchip/ath79-misc: Move the MISC driver from arch/mips/ath79/ The driver stays the same but the initialization changes a bit. For OF boards we now get the memory map from the OF node and use a linear mapping instead of the legacy mapping. For legacy boards we still use a legacy mapping and just pass down all the parameters from the board init code. 
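The two setups boil down to the two irqdomain constructors (a sketch using the calls as they appear in the new driver below):

	/* OF boards: virqs are allocated on demand via a linear domain */
	domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT,
				       &misc_irq_domain_ops, base);

	/* legacy boards: hwirq i is bound up front to virq irq_base + i */
	domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT,
				       irq_base, 0, &misc_irq_domain_ops, regs);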
Signed-off-by: Alban Bedel Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453553867-27003-1-git-send-email-albeu@free.fr Signed-off-by: Jason Cooper --- arch/mips/ath79/irq.c | 163 +++----------------------- arch/mips/include/asm/mach-ath79/ath79.h | 3 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-ath79-misc.c | 189 +++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 148 deletions(-) create mode 100644 drivers/irqchip/irq-ath79-misc.c diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c index 511c06560dc1..05b45140bc1f 100644 --- a/arch/mips/ath79/irq.c +++ b/arch/mips/ath79/irq.c @@ -26,90 +26,6 @@ #include "common.h" #include "machtypes.h" -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq); - -static void ath79_misc_irq_handler(struct irq_desc *desc) -{ - struct irq_domain *domain = irq_desc_get_handler_data(desc); - void __iomem *base = domain->host_data; - u32 pending; - - pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - if (!pending) { - spurious_interrupt(); - return; - } - - while (pending) { - int bit = __ffs(pending); - - generic_handle_irq(irq_linear_revmap(domain, bit)); - pending &= ~BIT(bit); - } -} - -static void ar71xx_misc_irq_unmask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar71xx_misc_irq_mask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar724x_misc_irq_ack(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); -} - -static struct irq_chip ath79_misc_irq_chip = { - .name = "MISC", - .irq_unmask = ar71xx_misc_irq_unmask, - .irq_mask = ar71xx_misc_irq_mask, -}; - -static void __init ath79_misc_irq_init(void) -{ - if (soc_is_ar71xx() || soc_is_ar913x()) - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - else if (soc_is_ar724x() || - soc_is_ar933x() || - soc_is_ar934x() || - soc_is_qca955x()) - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - else - BUG(); - - ath79_misc_intc_domain_init(NULL, ATH79_CPU_IRQ(6)); -} static void ar934x_ip2_irq_dispatch(struct irq_desc *desc) { @@ -248,69 +164,6 @@ asmlinkage void plat_irq_dispatch(void) } } -static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) -{ - irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); - irq_set_chip_data(irq, d->host_data); - return 0; -} - -static const struct irq_domain_ops misc_irq_domain_ops = { - .xlate = irq_domain_xlate_onecell, - .map = misc_map, -}; - -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq) -{ - void __iomem *base = ath79_reset_base; - struct irq_domain *domain; - - domain = 
irq_domain_add_legacy(node, ATH79_MISC_IRQ_COUNT, - ATH79_MISC_IRQ_BASE, 0, &misc_irq_domain_ops, base); - if (!domain) - panic("Failed to add MISC irqdomain"); - - /* Disable and clear all interrupts */ - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); - - irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); -} - -static int __init ath79_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int irq; - - irq = irq_of_parse_and_map(node, 0); - if (!irq) - panic("Failed to get MISC IRQ"); - - ath79_misc_intc_domain_init(node, irq); - return 0; -} - -static int __init ar7100_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", - ar7100_misc_intc_of_init); - -static int __init ar7240_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", - ar7240_misc_intc_of_init); - static int __init ar79_cpu_intc_of_init( struct device_node *node, struct device_node *parent) { @@ -348,6 +201,8 @@ IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", void __init arch_init_irq(void) { + bool misc_is_ar71xx; + if (mips_machtype == ATH79_MACH_GENERIC_OF) { irqchip_init(); return; @@ -362,7 +217,19 @@ void __init arch_init_irq(void) } mips_cpu_irq_init(); - ath79_misc_irq_init(); + + if (soc_is_ar71xx() || soc_is_ar913x()) + misc_is_ar71xx = true; + else if (soc_is_ar724x() || + soc_is_ar933x() || + soc_is_ar934x() || + soc_is_qca955x()) + misc_is_ar71xx = false; + else + BUG(); + ath79_misc_irq_init( + ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS, + ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE, misc_is_ar71xx); if (soc_is_ar934x()) ar934x_ip2_irq_init(); diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 2b3487213d1e..22a2f56ad5e9 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -144,4 +144,7 @@ static inline u32 ath79_reset_rr(unsigned reg) void ath79_device_reset_set(u32 mask); void ath79_device_reset_clear(u32 mask); +void ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx); + #endif /* __ASM_MACH_ATH79_H */ diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb60d58..b873d4a081b4 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c new file mode 100644 index 000000000000..aa7290784636 --- /dev/null +++ b/drivers/irqchip/irq-ath79-misc.c @@ -0,0 +1,189 @@ +/* + * Atheros AR71xx/AR724x/AR913x MISC interrupt controller + * + * Copyright (C) 2015 Alban Bedel + * Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the 
GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#define AR71XX_RESET_REG_MISC_INT_STATUS 0 +#define AR71XX_RESET_REG_MISC_INT_ENABLE 4 + +#define ATH79_MISC_IRQ_COUNT 32 + +static void ath79_misc_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *domain = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + void __iomem *base = domain->host_data; + u32 pending; + + chained_irq_enter(chip, desc); + + pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + if (!pending) { + spurious_interrupt(); + chained_irq_exit(chip, desc); + return; + } + + while (pending) { + int bit = __ffs(pending); + + generic_handle_irq(irq_linear_revmap(domain, bit)); + pending &= ~BIT(bit); + } + + chained_irq_exit(chip, desc); +} + +static void ar71xx_misc_irq_unmask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t | BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar71xx_misc_irq_mask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar724x_misc_irq_ack(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); +} + +static struct irq_chip ath79_misc_irq_chip = { + .name = "MISC", + .irq_unmask = ar71xx_misc_irq_unmask, + .irq_mask = ar71xx_misc_irq_mask, +}; + +static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); + irq_set_chip_data(irq, d->host_data); + return 0; +} + +static const struct irq_domain_ops misc_irq_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = misc_map, +}; + +static void __init ath79_misc_intc_domain_init( + struct irq_domain *domain, int irq) +{ + void __iomem *base = domain->host_data; + + /* Disable and clear all interrupts */ + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); + + irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); +} + +static int __init ath79_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + struct irq_domain *domain; + void __iomem *base; + int irq; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) { + pr_err("Failed to get MISC IRQ\n"); + return -EINVAL; + } + + base = of_iomap(node, 0); + if (!base) { + pr_err("Failed to get MISC IRQ registers\n"); + return -ENOMEM; + } + + domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT, + &misc_irq_domain_ops, base); + if (!domain) { + pr_err("Failed to add MISC irqdomain\n"); + return -EINVAL; + } + + ath79_misc_intc_domain_init(domain, irq); + return 0; +} + +static int __init 
ar7100_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", + ar7100_misc_intc_of_init); + +static int __init ar7240_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", + ar7240_misc_intc_of_init); + +void __init ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx) +{ + struct irq_domain *domain; + + if (is_ar71xx) + ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + else + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + + domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT, + irq_base, 0, &misc_irq_domain_ops, regs); + if (!domain) + panic("Failed to create MISC irqdomain"); + + ath79_misc_intc_domain_init(domain, irq); +} -- cgit v1.2.1 From 81ffb18ce4a0c400b051c3df67e452410d6be1ec Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sat, 23 Jan 2016 13:57:47 +0100 Subject: irqchip/ath79-cpu: Move the CPU IRQ driver from arch/mips/ath79/ Signed-off-by: Alban Bedel Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453553867-27003-2-git-send-email-albeu@free.fr Signed-off-by: Jason Cooper --- arch/mips/ath79/irq.c | 81 ++------------------------ arch/mips/include/asm/mach-ath79/ath79.h | 1 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-ath79-cpu.c | 97 ++++++++++++++++++++++++++++++++ 4 files changed, 105 insertions(+), 75 deletions(-) create mode 100644 drivers/irqchip/irq-ath79-cpu.c diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c index 05b45140bc1f..2dfff1f19004 100644 --- a/arch/mips/ath79/irq.c +++ b/arch/mips/ath79/irq.c @@ -128,79 +128,10 @@ static void qca955x_irq_init(void) irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch); } -/* - * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for - * these devices typically allocate coherent DMA memory, however the - * DMA controller may still have some unsynchronized data in the FIFO. - * Issue a flush in the handlers to ensure that the driver sees - * the update. - * - * This array map the interrupt lines to the DDR write buffer channels. 
- */ - -static unsigned irq_wb_chan[8] = { - -1, -1, -1, -1, -1, -1, -1, -1, -}; - -asmlinkage void plat_irq_dispatch(void) -{ - unsigned long pending; - int irq; - - pending = read_c0_status() & read_c0_cause() & ST0_IM; - - if (!pending) { - spurious_interrupt(); - return; - } - - pending >>= CAUSEB_IP; - while (pending) { - irq = fls(pending) - 1; - if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) - ath79_ddr_wb_flush(irq_wb_chan[irq]); - do_IRQ(MIPS_CPU_IRQ_BASE + irq); - pending &= ~BIT(irq); - } -} - -static int __init ar79_cpu_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int err, i, count; - - /* Fill the irq_wb_chan table */ - count = of_count_phandle_with_args( - node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); - - for (i = 0; i < count; i++) { - struct of_phandle_args args; - u32 irq = i; - - of_property_read_u32_index( - node, "qca,ddr-wb-channel-interrupts", i, &irq); - if (irq >= ARRAY_SIZE(irq_wb_chan)) - continue; - - err = of_parse_phandle_with_args( - node, "qca,ddr-wb-channels", - "#qca,ddr-wb-channel-cells", - i, &args); - if (err) - return err; - - irq_wb_chan[irq] = args.args[0]; - pr_info("IRQ: Set flush channel of IRQ%d to %d\n", - irq, args.args[0]); - } - - return mips_cpu_irq_of_init(node, parent); -} -IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", - ar79_cpu_intc_of_init); - void __init arch_init_irq(void) { + unsigned irq_wb_chan2 = -1; + unsigned irq_wb_chan3 = -1; bool misc_is_ar71xx; if (mips_machtype == ATH79_MACH_GENERIC_OF) { @@ -210,13 +141,13 @@ void __init arch_init_irq(void) if (soc_is_ar71xx() || soc_is_ar724x() || soc_is_ar913x() || soc_is_ar933x()) { - irq_wb_chan[2] = 3; - irq_wb_chan[3] = 2; + irq_wb_chan2 = 3; + irq_wb_chan3 = 2; } else if (soc_is_ar934x()) { - irq_wb_chan[3] = 2; + irq_wb_chan3 = 2; } - mips_cpu_irq_init(); + ath79_cpu_irq_init(irq_wb_chan2, irq_wb_chan3); if (soc_is_ar71xx() || soc_is_ar913x()) misc_is_ar71xx = true; diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 22a2f56ad5e9..441faa92c3cd 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -144,6 +144,7 @@ static inline u32 ath79_reset_rr(unsigned reg) void ath79_device_reset_set(u32 mask); void ath79_device_reset_clear(u32 mask); +void ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3); void ath79_misc_irq_init(void __iomem *regs, int irq, int irq_base, bool is_ar71xx); diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index b873d4a081b4..aeb9200f0e16 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ATH79) += irq-ath79-cpu.o obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c new file mode 100644 index 000000000000..befe93c5a51a --- /dev/null +++ b/drivers/irqchip/irq-ath79-cpu.c @@ -0,0 +1,97 @@ +/* + * Atheros AR71xx/AR724x/AR913x specific interrupt handling + * + * Copyright (C) 2015 Alban Bedel + * Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the 
Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +/* + * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for + * these devices typically allocate coherent DMA memory, however the + * DMA controller may still have some unsynchronized data in the FIFO. + * Issue a flush in the handlers to ensure that the driver sees + * the update. + * + * This array map the interrupt lines to the DDR write buffer channels. + */ + +static unsigned irq_wb_chan[8] = { + -1, -1, -1, -1, -1, -1, -1, -1, +}; + +asmlinkage void plat_irq_dispatch(void) +{ + unsigned long pending; + int irq; + + pending = read_c0_status() & read_c0_cause() & ST0_IM; + + if (!pending) { + spurious_interrupt(); + return; + } + + pending >>= CAUSEB_IP; + while (pending) { + irq = fls(pending) - 1; + if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) + ath79_ddr_wb_flush(irq_wb_chan[irq]); + do_IRQ(MIPS_CPU_IRQ_BASE + irq); + pending &= ~BIT(irq); + } +} + +static int __init ar79_cpu_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + int err, i, count; + + /* Fill the irq_wb_chan table */ + count = of_count_phandle_with_args( + node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); + + for (i = 0; i < count; i++) { + struct of_phandle_args args; + u32 irq = i; + + of_property_read_u32_index( + node, "qca,ddr-wb-channel-interrupts", i, &irq); + if (irq >= ARRAY_SIZE(irq_wb_chan)) + continue; + + err = of_parse_phandle_with_args( + node, "qca,ddr-wb-channels", + "#qca,ddr-wb-channel-cells", + i, &args); + if (err) + return err; + + irq_wb_chan[irq] = args.args[0]; + } + + return mips_cpu_irq_of_init(node, parent); +} +IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", + ar79_cpu_intc_of_init); + +void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3) +{ + irq_wb_chan[2] = irq_wb_chan2; + irq_wb_chan[3] = irq_wb_chan3; + mips_cpu_irq_init(); +} -- cgit v1.2.1 From 8f8e2aec9944dd12671182a1a26b8e1a35872a1d Mon Sep 17 00:00:00 2001 From: Alan Date: Wed, 17 Feb 2016 14:10:15 +0000 Subject: x86/platform/intel/mid: Remove dead code Neither ratio nor fsb are ever zero, so remove the 0 case. Signed-off-by: Alan Cox Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/mfld.c | 5 +---- arch/x86/platform/intel-mid/mrfl.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 23381d2174ae..1eb47b6298c2 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init penwell_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index aaca91753d32..bd1adc621781 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init tangier_arch_setup(void) -- cgit v1.2.1 From d08b82c9e11e0997d2fb7dbf07a1f0c4112e587e Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 20 Jan 2016 18:07:16 +0000 Subject: devicetree: Add binding for Sigma Designs SMP86xx interrupt controller This adds a binding for the secondary interrupt controller in Sigma Designs SMP86xx and SMP87xx chips. Signed-off-by: Mans Rullgard [ jac: use 'interrupt-controller@XXX' notation in binding doc ] Acked-by: Rob Herring Link: https://lkml.kernel.org/r/1453313237-18570-1-git-send-email-mans@mansr.com Signed-off-by: Jason Cooper --- .../interrupt-controller/sigma,smp8642-intc.txt | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt new file mode 100644 index 000000000000..1f441fa0ad40 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt @@ -0,0 +1,49 @@ +Sigma Designs SMP86xx/SMP87xx secondary interrupt controller + +Required properties: +- compatible: should be "sigma,smp8642-intc" +- reg: physical address of MMIO region +- ranges: address space mapping of child nodes +- interrupt-parent: phandle of parent interrupt controller +- interrupt-controller: boolean +- #address-cells: should be <1> +- #size-cells: should be <1> + +One child node per control block with properties: +- reg: address of registers for this control block +- interrupt-controller: boolean +- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt +- interrupts: interrupt spec of primary interrupt controller + +Example: + +interrupt-controller@6e000 { + compatible = "sigma,smp8642-intc"; + reg = <0x6e000 0x400>; + ranges = <0x0 0x6e000 0x400>; + interrupt-parent = <&gic>; + interrupt-controller; + #address-cells = <1>; + #size-cells = <1>; + + irq0: interrupt-controller@0 { + reg = <0x000 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq1: interrupt-controller@100 { + reg = <0x100 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq2: 
interrupt-controller@300 { + reg = <0x300 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; +}; -- cgit v1.2.1 From 4bba66899ac654cb7c940a3af35d496a0b1952f0 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 20 Jan 2016 18:07:17 +0000 Subject: irqchip/tango: Add support for Sigma Designs SMP86xx/SMP87xx interrupt controller This adds support for the secondary interrupt controller used in Sigma Designs SMP86xx and SMP87xx chips. Signed-off-by: Mans Rullgard Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453313237-18570-2-git-send-email-mans@mansr.com Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 5 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-tango.c | 232 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 drivers/irqchip/irq-tango.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 715923d5236c..6bdfbce0388f 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -151,6 +151,11 @@ config ST_IRQCHIP help Enables SysCfg Controlled IRQs on STi based platforms. +config TANGO_IRQ + bool + select IRQ_DOMAIN + select GENERIC_IRQ_CHIP + config TB10X_IRQC bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb60d58..ec83deb0c719 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o obj-$(CONFIG_ARCH_NSPIRE) += irq-zevio.o obj-$(CONFIG_ARCH_VT8500) += irq-vt8500.o obj-$(CONFIG_ST_IRQCHIP) += irq-st.o +obj-$(CONFIG_TANGO_IRQ) += irq-tango.o obj-$(CONFIG_TB10X_IRQC) += irq-tb10x.o obj-$(CONFIG_TS4800_IRQ) += irq-ts4800.o obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c new file mode 100644 index 000000000000..bdbb5c0ff7fe --- /dev/null +++ b/drivers/irqchip/irq-tango.c @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2014 Mans Rullgard + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQ0_CTL_BASE 0x0000 +#define IRQ1_CTL_BASE 0x0100 +#define EDGE_CTL_BASE 0x0200 +#define IRQ2_CTL_BASE 0x0300 + +#define IRQ_CTL_HI 0x18 +#define EDGE_CTL_HI 0x20 + +#define IRQ_STATUS 0x00 +#define IRQ_RAWSTAT 0x04 +#define IRQ_EN_SET 0x08 +#define IRQ_EN_CLR 0x0c +#define IRQ_SOFT_SET 0x10 +#define IRQ_SOFT_CLR 0x14 + +#define EDGE_STATUS 0x00 +#define EDGE_RAWSTAT 0x04 +#define EDGE_CFG_RISE 0x08 +#define EDGE_CFG_FALL 0x0c +#define EDGE_CFG_RISE_SET 0x10 +#define EDGE_CFG_RISE_CLR 0x14 +#define EDGE_CFG_FALL_SET 0x18 +#define EDGE_CFG_FALL_CLR 0x1c + +struct tangox_irq_chip { + void __iomem *base; + unsigned long ctl; +}; + +static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg) +{ + return readl_relaxed(chip->base + reg); +} + +static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val) +{ + writel_relaxed(val, chip->base + reg); +} + +static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status, + int base) +{ + unsigned int hwirq; + unsigned int virq; + + while (status) { + hwirq = __ffs(status); + virq = irq_find_mapping(dom, base + hwirq); + if (virq) + generic_handle_irq(virq); + status &= ~BIT(hwirq); + } +} + +static void tangox_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *dom = irq_desc_get_handler_data(desc); + struct irq_chip *host_chip = irq_desc_get_chip(desc); + struct tangox_irq_chip *chip = dom->host_data; + unsigned int status_lo, status_hi; + + chained_irq_enter(host_chip, desc); + + status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS); + status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS); + + tangox_dispatch_irqs(dom, status_lo, 0); + tangox_dispatch_irqs(dom, status_hi, 32); + + chained_irq_exit(host_chip, desc); +} + +static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_regs *regs = &gc->chip_types[0].regs; + + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_EDGE_FALLING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + case IRQ_TYPE_LEVEL_HIGH: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_LEVEL_LOW: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + default: + pr_err("Invalid trigger mode %x for IRQ %d\n", + flow_type, d->irq); + return -EINVAL; + } + + return irq_setup_alt_chip(d, flow_type); +} + +static void __init tangox_irq_init_chip(struct irq_chip_generic *gc, + unsigned long ctl_offs, + unsigned long edge_offs) +{ + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_type *ct = gc->chip_types; + unsigned long ctl_base = chip->ctl + ctl_offs; + unsigned long edge_base = EDGE_CTL_BASE + edge_offs; + int i; + + gc->reg_base = chip->base; + gc->unused = 0; + + for (i = 0; i < 2; i++) { + ct[i].chip.irq_ack = irq_gc_ack_set_bit; + ct[i].chip.irq_mask = irq_gc_mask_disable_reg; + ct[i].chip.irq_mask_ack = irq_gc_mask_disable_reg_and_ack; + 
ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg; + ct[i].chip.irq_set_type = tangox_irq_set_type; + ct[i].chip.name = gc->domain->name; + + ct[i].regs.enable = ctl_base + IRQ_EN_SET; + ct[i].regs.disable = ctl_base + IRQ_EN_CLR; + ct[i].regs.ack = edge_base + EDGE_RAWSTAT; + ct[i].regs.type = edge_base; + } + + ct[0].type = IRQ_TYPE_LEVEL_MASK; + ct[0].handler = handle_level_irq; + + ct[1].type = IRQ_TYPE_EDGE_BOTH; + ct[1].handler = handle_edge_irq; + + intc_writel(chip, ct->regs.disable, 0xffffffff); + intc_writel(chip, ct->regs.ack, 0xffffffff); +} + +static void __init tangox_irq_domain_init(struct irq_domain *dom) +{ + struct irq_chip_generic *gc; + int i; + + for (i = 0; i < 2; i++) { + gc = irq_get_domain_generic_chip(dom, i * 32); + tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI); + } +} + +static int __init tangox_irq_init(void __iomem *base, struct resource *baseres, + struct device_node *node) +{ + struct tangox_irq_chip *chip; + struct irq_domain *dom; + struct resource res; + int irq; + int err; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) + panic("%s: failed to get IRQ", node->name); + + err = of_address_to_resource(node, 0, &res); + if (err) + panic("%s: failed to get address", node->name); + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + chip->ctl = res.start - baseres->start; + chip->base = base; + + dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip); + if (!dom) + panic("%s: failed to create irqdomain", node->name); + + err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name, + handle_level_irq, 0, 0, 0); + if (err) + panic("%s: failed to allocate irqchip", node->name); + + tangox_irq_domain_init(dom); + + irq_set_chained_handler(irq, tangox_irq_handler); + irq_set_handler_data(irq, dom); + + return 0; +} + +static int __init tangox_of_irq_init(struct device_node *node, + struct device_node *parent) +{ + struct device_node *c; + struct resource res; + void __iomem *base; + + base = of_iomap(node, 0); + if (!base) + panic("%s: of_iomap failed", node->name); + + of_address_to_resource(node, 0, &res); + + for_each_child_of_node(node, c) + tangox_irq_init(base, &res, c); + + return 0; +} +IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init); -- cgit v1.2.1 From d2b383dcf49db953a7438810803fdd7ac1e1f385 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 9 Feb 2016 11:19:20 +0100 Subject: irqchip/ts4800: Add hardware dependency The Technologic Systems TS-4800 is an i.MX515 board, so its drivers are useless unless building a SOC_IMX51 kernel, except for build testing purposes. Signed-off-by: Jean Delvare Cc: Damien Riegel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20160209111920.1ec318bd@endymion Signed-off-by: Jason Cooper --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 8115a32a553c..72e5a842cdae 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -165,6 +165,7 @@ config TS4800_IRQ tristate "TS-4800 IRQ controller" select IRQ_DOMAIN depends on HAS_IOMEM + depends on SOC_IMX51 || COMPILE_TEST help Support for the TS-4800 FPGA IRQ controller -- cgit v1.2.1 From b6bc902ddca18749253093f5f8dc15391d8c2356 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 4 Feb 2016 13:14:28 -0800 Subject: irqchip/bcm2836: Drop extra memory barrier in SMP boot. The writel() immediately after this has a barrier, anyway. 
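As a rough illustration of why the explicit barrier is redundant, here is a hedged userspace sketch (simplified stand-ins, not the kernel's actual macros) of how ARM's writel() is built from a write barrier plus a relaxed store, so prefixing it with dsb() just doubles the barrier:

#include <stdint.h>

/*
 * Simplified model: writel() = write barrier + relaxed store, so a
 * "dsb(); writel(...)" sequence issues two back-to-back barriers and
 * the explicit one can be dropped. The fence below is illustrative;
 * the kernel's barrier on ARM is a DSB/DMB instruction.
 */
static inline void sketch_wmb(void)
{
        __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

static inline void sketch_writel_relaxed(uint32_t val, volatile uint32_t *addr)
{
        *addr = val;
}

static inline void sketch_writel(uint32_t val, volatile uint32_t *addr)
{
        sketch_wmb();                   /* the barrier is built in */
        sketch_writel_relaxed(val, addr);
}
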
Signed-off-by: Eric Anholt Link: https://lkml.kernel.org/r/1454620468-31303-1-git-send-email-eric@anholt.net Signed-off-by: Jason Cooper --- drivers/irqchip/irq-bcm2836.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 963065a0d774..b6e950d4782a 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -229,7 +229,6 @@ int __init bcm2836_smp_boot_secondary(unsigned int cpu, unsigned long secondary_startup_phys = (unsigned long)virt_to_phys((void *)secondary_startup); - dsb(); writel(secondary_startup_phys, intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu); -- cgit v1.2.1 From e4e1c0ea731e4a3df470110bfdf8a18d11d59351 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 14 Feb 2016 21:50:04 +0800 Subject: irqchip/ts4800: Make ts4800_ic_ops static const ts4800_ic_ops is only referenced in this driver, so make it static. In additional, it's never get modified thus also make it const. Signed-off-by: Axel Lin Reviewed-by: Damien Riegel Link: https://lkml.kernel.org/r/1455457804.13175.1.camel@ingics.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-ts4800.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index 4192bdcd2734..2325fb3c482b 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -59,7 +59,7 @@ static int ts4800_irqdomain_map(struct irq_domain *d, unsigned int irq, return 0; } -struct irq_domain_ops ts4800_ic_ops = { +static const struct irq_domain_ops ts4800_ic_ops = { .map = ts4800_irqdomain_map, .xlate = irq_domain_xlate_onecell, }; -- cgit v1.2.1 From 0f68c088c0adb3c3bbeb487c4ebcde91fd5d34be Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: x86/cpufeature: Create a new synthetic cpu capability for machine check recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel Software Developer Manual describes bit 24 in the MCG_CAP MSR: MCG_SER_P (software error recovery support present) flag, bit 24 — Indicates (when set) that the processor supports software error recovery But only some models with this capability bit set will actually generate recoverable machine checks. Check the model name and set a synthetic capability bit. Provide a command line option to set this bit anyway in case the kernel doesn't recognise the model name. Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2e5bfb23c89800a036fb8a45fa97a74bb16bc362.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 2 ++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 4 files changed, 17 insertions(+) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 68ed3114c363..0965a71f9942 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -60,6 +60,8 @@ Machine check threshold to 1. Enabling this may make memory predictive failure analysis less effective if the bios sets thresholds for memory errors since we will not see details for all errors. 
+ mce=recovery + Force-enable recoverable machine check code paths nomce (for compatibility with i386): same as mce=off diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0ceb6adc8a48..6663fae71b12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,6 +106,7 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527e462f..18d2ba9c8e44 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -113,6 +113,7 @@ struct mca_config { bool ignore_ce; bool disabled; bool ser; + bool recovery; bool bios_cmci_threshold; u8 banks; s8 bootlog; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4cd792b..b5b187c8cc07 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1576,6 +1576,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. + */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2028,6 +2039,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); -- cgit v1.2.1 From c27f29bbbf02168c9b1e8ba0fe7a8cb917e5a50f Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 19 Feb 2016 14:34:43 +0100 Subject: irqchip/mvebu-odmi: Add new driver for platform MSI on Marvell 7K/8K This commits adds a new irqchip driver that handles the ODMI controller found on Marvell 7K/8K processors. The ODMI controller provide MSI interrupt functionality to on-board peripherals, much like the GIC-v2m. 
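In other words, an MSI here is nothing more than a device write of an encoded data word to a doorbell register. The sketch below (simplified userspace C; the register offset and shift match the driver added by this patch, while struct msi_msg_sketch is a stand-in for the kernel's struct msi_msg) shows how such a message is composed:

#include <stdint.h>

#define GICP_ODMIN_SET          0x40
#define GICP_ODMI_INT_NUM_SHIFT 12

/* Stand-in for the kernel's struct msi_msg. */
struct msi_msg_sketch {
        uint32_t address_lo;
        uint32_t address_hi;
        uint32_t data;
};

/*
 * A peripheral programmed with this message raises interrupt 'odmin'
 * of the given ODMI frame by writing 'data' to the doorbell address.
 */
static void compose_odmi_msg(uint64_t frame_base, unsigned int odmin,
                             struct msi_msg_sketch *msg)
{
        uint64_t addr = frame_base + GICP_ODMIN_SET;

        msg->address_hi = (uint32_t)(addr >> 32);
        msg->address_lo = (uint32_t)(addr & 0xffffffff);
        msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT;
}
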
Signed-off-by: Thomas Petazzoni Reviewed-by: Marc Zyngier Link: https://lkml.kernel.org/r/1455888883-5127-1-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- .../marvell,odmi-controller.txt | 41 ++++ drivers/irqchip/Kconfig | 4 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-mvebu-odmi.c | 248 +++++++++++++++++++++ 4 files changed, 294 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt create mode 100644 drivers/irqchip/irq-mvebu-odmi.c diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt new file mode 100644 index 000000000000..252d5c9c31f9 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -0,0 +1,41 @@ + +* Marvell ODMI for MSI support + +Some Marvell SoCs have an On-Die Message Interrupt (ODMI) controller +which can be used by on-board peripheral for MSI interrupts. + +Required properties: + +- compatible : The value here should contain "marvell,odmi-controller". + +- interrupt,controller : Identifies the node as an interrupt controller. + +- msi-controller : Identifies the node as an MSI controller. + +- marvell,odmi-frames : Number of ODMI frames available. Each frame + provides a number of events. + +- reg : List of register definitions, one for each + ODMI frame. + +- marvell,spi-base : List of GIC base SPI interrupts, one for each + ODMI frame. Those SPI interrupts are 0-based, + i.e marvell,spi-base = <128> will use SPI #96. + See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt + for details about the GIC Device Tree binding. + +- interrupt-parent : Reference to the parent interrupt controller. + +Example: + + odmi: odmi@300000 { + compatible = "marvell,odmi-controller"; + interrupt-controller; + msi-controller; + marvell,odmi-frames = <4>; + reg = <0x300000 0x4000>, + <0x304000 0x4000>, + <0x308000 0x4000>, + <0x30C000 0x4000>; + marvell,spi-base = <128>, <136>, <144>, <152>; + }; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 8115a32a553c..fa5e98c0f2bf 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -223,3 +223,7 @@ config IRQ_MXS def_bool y if MACH_ASM9260 || ARCH_MXS select IRQ_DOMAIN select STMP_DEVICE + +config MVEBU_ODMI + bool + select GENERIC_MSI_IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 30dba044d0b8..b37f7084db04 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -59,3 +59,4 @@ obj-$(CONFIG_ARCH_SA1100) += irq-sa11x0.o obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o +obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c new file mode 100644 index 000000000000..0ae98b24836f --- /dev/null +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2016 Marvell + * + * Thomas Petazzoni + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#define pr_fmt(fmt) "GIC-ODMI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GICP_ODMIN_SET 0x40 +#define GICP_ODMI_INT_NUM_SHIFT 12 +#define GICP_ODMIN_GM_EP_R0 0x110 +#define GICP_ODMIN_GM_EP_R1 0x114 +#define GICP_ODMIN_GM_EA_R0 0x108 +#define GICP_ODMIN_GM_EA_R1 0x118 + +/* + * We don't support the group events, so we simply have 8 interrupts + * per frame. + */ +#define NODMIS_SHIFT 3 +#define NODMIS_PER_FRAME (1 << NODMIS_SHIFT) +#define NODMIS_MASK (NODMIS_PER_FRAME - 1) + +struct odmi_data { + struct resource res; + void __iomem *base; + unsigned int spi_base; +}; + +static struct odmi_data *odmis; +static unsigned long *odmis_bm; +static unsigned int odmis_count; + +/* Protects odmis_bm */ +static DEFINE_SPINLOCK(odmis_bm_lock); + +static int odmi_set_affinity(struct irq_data *d, + const struct cpumask *mask, bool force) +{ + int ret; + + ret = irq_chip_set_affinity_parent(d, mask, force); + if (ret == IRQ_SET_MASK_OK) + ret = IRQ_SET_MASK_OK_DONE; + + return ret; +} + +static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct odmi_data *odmi; + phys_addr_t addr; + unsigned int odmin; + + if (WARN_ON(d->hwirq >= odmis_count * NODMIS_PER_FRAME)) + return; + + odmi = &odmis[d->hwirq >> NODMIS_SHIFT]; + odmin = d->hwirq & NODMIS_MASK; + + addr = odmi->res.start + GICP_ODMIN_SET; + + msg->address_hi = upper_32_bits(addr); + msg->address_lo = lower_32_bits(addr); + msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT; +} + +static struct irq_chip odmi_irq_chip = { + .name = "ODMI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = odmi_set_affinity, + .irq_compose_msi_msg = odmi_compose_msi_msg, +}; + +static int odmi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct odmi_data *odmi = NULL; + struct irq_fwspec fwspec; + struct irq_data *d; + unsigned int hwirq, odmin; + int ret; + + spin_lock(&odmis_bm_lock); + hwirq = find_first_zero_bit(odmis_bm, NODMIS_PER_FRAME * odmis_count); + if (hwirq >= NODMIS_PER_FRAME * odmis_count) { + spin_unlock(&odmis_bm_lock); + return -ENOSPC; + } + + __set_bit(hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); + + odmi = &odmis[hwirq >> NODMIS_SHIFT]; + odmin = hwirq & NODMIS_MASK; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = GIC_SPI; + fwspec.param[1] = odmi->spi_base - 32 + odmin; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) { + pr_err("Cannot allocate parent IRQ\n"); + spin_lock(&odmis_bm_lock); + __clear_bit(odmin, odmis_bm); + spin_unlock(&odmis_bm_lock); + return ret; + } + + /* Configure the interrupt line to be edge */ + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, + &odmi_irq_chip, NULL); + + return 0; +} + +static void odmi_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + if (d->hwirq >= odmis_count * NODMIS_PER_FRAME) { + pr_err("Failed to teardown msi. 
Invalid hwirq %lu\n", d->hwirq); + return; + } + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + + /* Actually free the MSI */ + spin_lock(&odmis_bm_lock); + __clear_bit(d->hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); +} + +static const struct irq_domain_ops odmi_domain_ops = { + .alloc = odmi_irq_domain_alloc, + .free = odmi_irq_domain_free, +}; + +static struct irq_chip odmi_msi_irq_chip = { + .name = "ODMI", +}; + +static struct msi_domain_ops odmi_msi_ops = { +}; + +static struct msi_domain_info odmi_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .ops = &odmi_msi_ops, + .chip = &odmi_msi_irq_chip, +}; + +static int __init mvebu_odmi_init(struct device_node *node, + struct device_node *parent) +{ + struct irq_domain *inner_domain, *plat_domain; + int ret, i; + + if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count)) + return -EINVAL; + + odmis = kcalloc(odmis_count, sizeof(struct odmi_data), GFP_KERNEL); + if (!odmis) + return -ENOMEM; + + odmis_bm = kcalloc(BITS_TO_LONGS(odmis_count * NODMIS_PER_FRAME), + sizeof(long), GFP_KERNEL); + if (!odmis_bm) { + ret = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + ret = of_address_to_resource(node, i, &odmi->res); + if (ret) + goto err_unmap; + + odmi->base = of_io_request_and_map(node, i, "odmi"); + if (IS_ERR(odmi->base)) { + ret = PTR_ERR(odmi->base); + goto err_unmap; + } + + if (of_property_read_u32_index(node, "marvell,spi-base", + i, &odmi->spi_base)) { + ret = -EINVAL; + goto err_unmap; + } + } + + inner_domain = irq_domain_create_linear(of_node_to_fwnode(node), + odmis_count * NODMIS_PER_FRAME, + &odmi_domain_ops, NULL); + if (!inner_domain) { + ret = -ENOMEM; + goto err_unmap; + } + + inner_domain->parent = irq_find_host(parent); + + plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node), + &odmi_msi_domain_info, + inner_domain); + if (!plat_domain) { + ret = -ENOMEM; + goto err_remove_inner; + } + + return 0; + +err_remove_inner: + irq_domain_remove(inner_domain); +err_unmap: + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + if (odmi->base && !IS_ERR(odmi->base)) + iounmap(odmis[i].base); + } + kfree(odmis_bm); +err_alloc: + kfree(odmis); + return ret; +} + +IRQCHIP_DECLARE(mvebu_odmi, "marvell,odmi-controller", mvebu_odmi_init); -- cgit v1.2.1 From 0407daceedfed003eaacb850d06cbbe359348367 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 19 Feb 2016 15:00:29 +0000 Subject: irqchip/gic: Return IRQ_SET_MASK_OK_DONE in the set_affinity method Moving an SPI around doesn't require any extra work from the rest of the stack, and specially not for MSI-generated SPIs. It is then worth returning IRQ_SET_MASK_OK_DONE instead of IRQ_SET_MASK_OK, and simplify the other irqchips that rely on this behaviour (GICv2m and Marvell's ODMI controller). 
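For clarity, a minimal sketch of the wrapper pattern this removes (the enum values are illustrative stand-ins for linux/irq.h): once the parent GIC returns IRQ_SET_MASK_OK_DONE itself, the child-side upgrade from IRQ_SET_MASK_OK becomes dead code and .irq_set_affinity can point straight at irq_chip_set_affinity_parent:

/* Illustrative stand-ins for the values in linux/irq.h. */
enum irq_set_mask_result {
        IRQ_SET_MASK_OK,
        IRQ_SET_MASK_OK_NOCOPY,
        IRQ_SET_MASK_OK_DONE,   /* nothing left for callers to do */
};

/* After this patch the GIC's set_affinity returns OK_DONE directly. */
static enum irq_set_mask_result gic_set_affinity_sketch(void)
{
        return IRQ_SET_MASK_OK_DONE;
}

/*
 * The wrapper that GICv2m and the ODMI driver used to carry, now
 * reducible to a direct call to the parent method:
 */
static enum irq_set_mask_result child_set_affinity_sketch(void)
{
        enum irq_set_mask_result ret = gic_set_affinity_sketch();

        if (ret == IRQ_SET_MASK_OK)     /* no longer happens */
                ret = IRQ_SET_MASK_OK_DONE;
        return ret;
}
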
Signed-off-by: Marc Zyngier Acked-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1455894029-17270-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-v2m.c | 14 +------------- drivers/irqchip/irq-gic.c | 2 +- drivers/irqchip/irq-mvebu-odmi.c | 14 +------------- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index c779f83e511d..28f047c61baa 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -92,18 +92,6 @@ static struct msi_domain_info gicv2m_msi_domain_info = { .chip = &gicv2m_msi_irq_chip, }; -static int gicv2m_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) -{ - int ret; - - ret = irq_chip_set_affinity_parent(irq_data, mask, force); - if (ret == IRQ_SET_MASK_OK) - ret = IRQ_SET_MASK_OK_DONE; - - return ret; -} - static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct v2m_data *v2m = irq_data_get_irq_chip_data(data); @@ -122,7 +110,7 @@ static struct irq_chip gicv2m_irq_chip = { .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = irq_chip_eoi_parent, - .irq_set_affinity = gicv2m_set_affinity, + .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = gicv2m_compose_msi_msg, }; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 911758c056c1..e14f2f2a7263 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -319,7 +319,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, writel_relaxed(val | bit, reg); raw_spin_unlock_irqrestore(&irq_controller_lock, flags); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #endif diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index 0ae98b24836f..b4d367868dbb 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -47,18 +47,6 @@ static unsigned int odmis_count; /* Protects odmis_bm */ static DEFINE_SPINLOCK(odmis_bm_lock); -static int odmi_set_affinity(struct irq_data *d, - const struct cpumask *mask, bool force) -{ - int ret; - - ret = irq_chip_set_affinity_parent(d, mask, force); - if (ret == IRQ_SET_MASK_OK) - ret = IRQ_SET_MASK_OK_DONE; - - return ret; -} - static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct odmi_data *odmi; @@ -83,7 +71,7 @@ static struct irq_chip odmi_irq_chip = { .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = irq_chip_eoi_parent, - .irq_set_affinity = odmi_set_affinity, + .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = odmi_compose_msi_msg, }; -- cgit v1.2.1 From b176862fca8625b0a8bee207bca9b611413e5e24 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Feb 2016 21:00:41 +0100 Subject: x86/mm/ptdump: Remove paravirt_enabled() is_hypervisor_range() can simply check if the PGD index is within ffff800000000000 - ffff87ffffffffff which is the range reserved for a hypervisor. That range is practically an ABI, see Documentation/x86/x86_64/mm.txt. Tested-by: Boris Ostrovsky # Under Xen, as PV guest Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455825641-19585-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9b5106..99bfb192803f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. */ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) -- cgit v1.2.1 From c25323c07345a843a56a294047b130dfd9250fad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Feb 2016 20:53:43 +0100 Subject: x86/tsc: Use topology functions It's simpler to look at the topology mask than iterating over all online cpus to find a cpu on the same package. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- arch/x86/kernel/tsc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..5a6cb4684e0f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1246,14 +1246,14 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif -- cgit v1.2.1 From c637fa5294cefeda8be73cce20ba6693d22262dc Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 23 Feb 2016 01:29:59 +0000 Subject: x86/platform/intel/quark: Drop IMR lock bit support Isolated Memory Regions support a lock bit. The lock bit in an IMR prevents modification of the IMR until the core goes through a warm or cold reset. The lock bit feature is not useful in the context of the kernel API and is not really necessary since modification of IMRs is possible only from ring-zero anyway. This patch drops support for IMR locks bits, it simplifies the kernel API and removes an unnecessary and needlessly complex feature. 
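A usage sketch of the simplified API (hypothetical caller in kernel context; the masks and error handling follow the selftest changes in the diff below):

#include <linux/types.h>
#include <asm/imr.h>

/*
 * Protect one properly aligned region so that only the CPU may read
 * or write it, then tear the protection down again. Both calls
 * return a negative value on error; there is no longer a lock
 * argument to pass.
 */
static int imr_protect_example(phys_addr_t base, size_t size)
{
        int ret;

        ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
        if (ret < 0)
                return ret;

        /* ... the region is now CPU-only ... */

        return imr_remove_range(base, size);
}
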
Suggested-by: Ingo Molnar Signed-off-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Cc: boon.leong.ong@intel.com Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1456190999-12685-3-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/imr.h | 2 +- arch/x86/platform/intel-quark/imr.c | 24 ++++++------------------ arch/x86/platform/intel-quark/imr_selftest.c | 15 +++++++-------- 3 files changed, 14 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h index cd2ce4068441..ebea2c9d2cdc 100644 --- a/arch/x86/include/asm/imr.h +++ b/arch/x86/include/asm/imr.h @@ -53,7 +53,7 @@ #define IMR_MASK (IMR_ALIGN - 1) int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock); + unsigned int rmask, unsigned int wmask); int imr_remove_range(phys_addr_t base, size_t size); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 740445a53363..17d6d2296e4d 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -134,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) * @idev: pointer to imr_device structure. * @imr_id: IMR entry to write. * @imr: IMR structure representing address and access masks. - * @lock: indicates if the IMR lock bit should be applied. * @return: 0 on success or error code passed from mbi_iosf on failure. */ -static int imr_write(struct imr_device *idev, u32 imr_id, - struct imr_regs *imr, bool lock) +static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) { unsigned long flags; u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; @@ -162,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id, if (ret) goto failed; - /* Lock bit must be set separately to addr_lo address bits. */ - if (lock) { - imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); - if (ret) - goto failed; - } - local_irq_restore(flags); return 0; failed: @@ -322,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) * @size: physical size of region in bytes must be aligned to 1KiB. * @read_mask: read access mask. * @write_mask: write access mask. - * @lock: indicates whether or not to permanently lock this region. * @return: zero on success or negative value indicating error. 
*/ int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock) + unsigned int rmask, unsigned int wmask) { phys_addr_t end; unsigned int i; @@ -399,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.rmask = rmask; imr.wmask = wmask; - ret = imr_write(idev, reg, &imr, lock); + ret = imr_write(idev, reg, &imr); if (ret < 0) { /* * In the highly unlikely event iosf_mbi_write failed @@ -410,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.addr_hi = 0; imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - imr_write(idev, reg, &imr, false); + imr_write(idev, reg, &imr); } failed: mutex_unlock(&idev->lock); @@ -506,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size) imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - ret = imr_write(idev, reg, &imr, false); + ret = imr_write(idev, reg, &imr); failed: mutex_unlock(&idev->lock); @@ -587,7 +575,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 0381343a0d3a..f5bad40936ac 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -60,30 +60,30 @@ static void __init imr_self_test(void) int ret; /* Test zero zero. */ - ret = imr_add_range(0, 0, 0, 0, false); + ret = imr_add_range(0, 0, 0, 0); imr_self_test_result(ret < 0, "zero sized IMR\n"); /* Test exact overlap. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with base inside of existing. */ base += size - IMR_ALIGN; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with end inside of existing. */ base -= size + IMR_ALIGN * 2; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); /* Test that a 1 KiB IMR @ zero with CPU only will work. */ - ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU); imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); if (ret >= 0) { ret = imr_remove_range(0, IMR_ALIGN); @@ -92,8 +92,7 @@ static void __init imr_self_test(void) /* Test 2 KiB works. 
*/ size = IMR_ALIGN * 2; - ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); if (ret >= 0) { ret = imr_remove_range(0, size); -- cgit v1.2.1 From bb53e416e02cd5406d5e1dbdd89432ff5875e982 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:30:12 -0800 Subject: rcu: Assign false instead of 0 for ->core_needs_qs A zero seems to have escaped earlier true/false substitution efforts, so this commit changes 0 to false for the ->core_needs_qs boolean field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..b8bbf3b27241 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2390,7 +2390,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our -- cgit v1.2.1 From 8994515cf0030e5020d67bb837a9d9a92441d0f7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:59:43 -0800 Subject: rcu: Update rcu_report_qs_rsp() comment The header comment for rcu_report_qs_rsp() was obsolete, dating well before the advent of RCU grace-period kthreads. This commit therefore brings this comment back into alignment with current reality. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b8bbf3b27241..a91836868ade 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2234,11 +2234,13 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) -- cgit v1.2.1 From 4914950aaa12de8a2004b8031b45480526caf42b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Dec 2015 13:48:43 -0800 Subject: rcu: Stop treating in-kernel CPU-bound workloads as errors Commit 4a81e8328d379 ("Reduce overhead of cond_resched() checks for RCU") handles the error case where a nohz_full loops indefinitely in the kernel with the scheduling-clock interrupt disabled. However, this handling includes IPIing the CPU running the offending loop, which is not what we want for real-time workloads. And there are starting to be real-time CPU-bound in-kernel workloads, and these must be handled without IPIing the CPU, at least not in the common case. 
Therefore, this situation can no longer be dismissed as an error case. This commit therefore splits the handling out, so that the setting of bits in the per-CPU rcu_sched_qs_mask variable is done relatively early, but if the problem persists, resched_cpu() is eventually used to IPI the CPU containing the offending loop. Assuming that in-kernel CPU-bound loops used by real-time tasks contain frequent calls cond_resched_rcu_qs() (as in more than once per few tens of milliseconds), the real-time tasks will never be IPIed. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- kernel/rcu/tree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a91836868ade..68f4bee3ecc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1173,15 +1173,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } -- cgit v1.2.1 From 23a9bacd357ab8388c7e07101b186a4282874992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 13 Dec 2015 08:57:10 -0800 Subject: rcu: Set rdp->gpwrap when CPU is idle Commit #e3663b1024d1 ("rcu: Handle gpnum/completed wrap while dyntick idle") sets rdp->gpwrap on the wrong side of the "if" statement in dyntick_save_progress_counter(), that is, it sets it when the CPU is not idle instead of when it is idle. Of course, if the CPU is not idle, its rdp->gpnum won't be lagging beind the global rsp->gpnum, which means that rdp->gpwrap will never be set. This commit therefore moves this code to the proper leg of that "if" statement. This change means that the "else" cause is just "return 0" and the "then" clause ends with "return 1", so also move the "return 0" to follow the "if", dropping the "else" clause. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 68f4bee3ecc3..976a166f3fa3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1083,13 +1083,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* -- cgit v1.2.1 From aa5a8988760142fe33e38790a578bd84ddf3e339 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 16:27:06 -0800 Subject: rcutorture: Correct no-expedite console messages The "Disabled dynamic grace-period expediting" console message is currently printed unconditionally. 
This commit causes it to be output only when it is impossible to switch between normal and expedited grace periods, which was the original intent. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..65ae0e5c35da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -932,12 +932,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) -- cgit v1.2.1 From 1914aab54319ed70608078df4f18ac4767753977 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Dec 2015 21:41:44 +0800 Subject: rcu: Remove useless rcu_data_p when !PREEMPT_RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The related warning from gcc 6.0: In file included from kernel/rcu/tree.c:4630:0: kernel/rcu/tree_plugin.h:810:40: warning: ‘rcu_data_p’ defined but not used [-Wunused-const-variable] static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; ^~~~~~~~~~ Also remove the always-redundant rcu_data_p in tree.c. Signed-off-by: Chen Gang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree_plugin.h | 1 - 2 files changed, 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 976a166f3fa3..1e44fce73839 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..e6b88ad51959 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -807,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. -- cgit v1.2.1 From ad315455d396a1cbcb2f9fdd687b7e1b26b789e7 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:46 +0800 Subject: sparse: Add __private to privatize members of structs In the C programming language, we don't have an easy way to privatize a member of a structure. However, in the kernel there is sometimes a need to privatize a member to guard against potential bugs or misuse. Fortunately, the noderef attribute of sparse is a way to privatize a member: by defining a member as noderef, the address-of operator on the member will produce a noderef pointer to that member, and if anyone tries to dereference such a pointer to read or modify the member, sparse will yell.
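As an illustration (hypothetical struct; the __private modifier and the ACCESS_PRIVATE() helper are the ones introduced below), the intended usage pattern looks like this:

	struct foo {
		int __private bar;	/* noderef under sparse */
	};

	static inline int foo_read_bar(struct foo *f)
	{
		return ACCESS_PRIVATE(f, bar);	/* sanctioned access */
	}

Passing &f->bar around directly would instead draw a "different modifiers" warning from sparse.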
Based on this, the __private modifier and the related operation ACCESS_PRIVATE() are introduced, which can help detect unintended public uses of private members of structs. Here is an example of sparse's output when it detects an unintended public use: | kernel/rcu/tree.c:4453:25: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree.c:4453:25: expected struct raw_spinlock [usertype] *lock | kernel/rcu/tree.c:4453:25: got struct raw_spinlock [noderef] * Also, this patch improves compiler.h a little bit by adding comments for "#else" and "#endif". Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/compiler.h | 12 ++++++++---- scripts/checkpatch.pl | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 00b042c49ccd..c845356952bb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -20,12 +20,14 @@ # define __pmem __attribute__((noderef, address_space(5))) #ifdef CONFIG_SPARSE_RCU_POINTER # define __rcu __attribute__((noderef, address_space(4))) -#else +#else /* CONFIG_SPARSE_RCU_POINTER */ # define __rcu -#endif +#endif /* CONFIG_SPARSE_RCU_POINTER */ +# define __private __attribute__((noderef)) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); -#else +# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +#else /* __CHECKER__ */ # define __user # define __kernel # define __safe @@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __percpu # define __rcu # define __pmem -#endif +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ #define ___PASTE(a,b) a##b diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0147c91fa549..874132b26d23 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -269,7 +269,8 @@ our $Sparse = qr{ __init_refok| __kprobes| __ref| - __rcu + __rcu| + __private }x; our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)}; our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)}; -- cgit v1.2.1 From 67c583a7de3433a971983490b37ad2bff3c55463 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:47 +0800 Subject: RCU: Privatize rcu_node::lock In the patch "rcu: Add transitivity to remaining rcu_node ->lock acquisitions", all locking operations on rcu_node::lock were replaced with the wrappers because of the need for transitivity, which indicates that we should never write code using LOCK primitives alone (i.e., without a proper barrier following) on rcu_node::lock outside those wrappers. We can detect this kind of misuse of rcu_node::lock in the future by adding the __private modifier to rcu_node::lock. To privatize rcu_node::lock, unlock wrappers are also needed. Replacing spinlock unlocks with these wrappers not only privatizes rcu_node::lock but also makes it easier to figure out the critical sections of rcu_node. This patch adds the __private modifier to rcu_node::lock and makes every access to it wrapped by ACCESS_PRIVATE(). In addition, unlock wrappers are added, and raw_spin_unlock(&rnp->lock) and its friends are replaced with those wrappers. Signed-off-by: Boqun Feng Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 103 ++++++++++++++++++++++++----------------------- kernel/rcu/tree.h | 42 ++++++++++++++----- kernel/rcu/tree_plugin.h | 26 ++++++------ 3 files changed, 96 insertions(+), 75 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1e44fce73839..d5a6d2f0cbf4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1245,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1265,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1291,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1356,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. @@ -1594,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1814,7 +1814,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1839,7 +1839,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1849,7 +1849,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1858,7 +1858,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1872,7 +1872,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. 
*/ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1906,7 +1906,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1937,7 +1937,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1995,7 +1995,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2024,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2045,7 +2045,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); @@ -2067,7 +2067,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2246,7 +2246,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); rcu_gp_kthread_wake(rsp); } @@ -2276,7 +2276,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2288,7 +2288,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2298,7 +2298,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2330,7 +2330,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2347,7 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. 
*/ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2384,12 +2384,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; @@ -2600,10 +2600,11 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } @@ -2626,7 +2627,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -2860,7 +2861,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2896,11 +2897,11 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(rsp); } @@ -2926,7 +2927,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3017,7 +3018,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3437,14 +3438,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. 
*/ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3459,7 +3460,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3482,7 +3483,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3523,11 +3524,11 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ wake_up(&rsp->expedited_wq); @@ -3535,7 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3570,7 +3571,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3731,7 +3732,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ mask = 1; @@ -3748,7 +3749,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3757,7 +3758,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -4164,7 +4165,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. 
*/ } } @@ -4188,7 +4189,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4216,7 +4217,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4237,7 +4238,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4359,7 +4360,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4450,8 +4451,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..4886d6a03353 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -149,8 +149,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -680,7 +681,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. + * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -689,29 +690,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. 
*/ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6b88ad51959..43e9b4a4bcd9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -990,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own. */ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1027,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. 
*/ @@ -1087,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1097,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1171,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1307,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1558,7 +1558,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2058,7 +2058,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); -- cgit v1.2.1 From b354286effa52da6cb1b1f16604d41ff81b8c445 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:48 +0800 Subject: irq: Privatize irq_common_data::state_use_accessors irq_common_data::state_use_accessors is not designed for public use. Therefore make it private so that people who write code accessing it directly will get blamed by sparse. Also #undef the macro __irqd_to_state after it is used in header files, so that the macro can't be misused. Signed-off-by: Boqun Feng Reviewed-by: Thomas Gleixner Signed-off-by: Paul E.
McKenney --- include/linux/irq.h | 6 ++++-- kernel/irq/internals.h | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..cd14cd4a22b4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,7 +137,7 @@ struct irq_domain; * @msi_desc: MSI descriptor */ struct irq_common_data { - unsigned int state_use_accessors; + unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif @@ -208,7 +208,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), }; -#define __irqd_to_state(d) ((d)->common->state_use_accessors) +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { @@ -299,6 +299,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +#undef __irqd_to_state + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..3d182932d2d1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -160,6 +160,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +190,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); -- cgit v1.2.1 From 9fc9204ef9fdbe550fd81a554abce2b210e63e49 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2016 19:05:19 -0500 Subject: rcu: Make rcu/tiny_plugin.h explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TINY_RCU init/Kconfig: bool ...meaning that it currently is not being built as a module by anyone. Let's remove the modular code that is essentially orphaned, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall (subsys?) if desired. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny_plugin.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include -#include +#include #include #include @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E.
McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { -- cgit v1.2.1 From 9de630c4f264dec48e61edd871cb98b8e1b58250 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Jan 2016 07:43:50 -0800 Subject: rcu: Document unique-name limitation for DEFINE_STATIC_SRCU() SRCU uses per-CPU variables, and DEFINE_STATIC_SRCU() uses a static per-CPU variable. However, per-CPU variables have significant restrictions, for example, names of per-CPU variables must be globally unique, even if declared static. These restrictions carry over to DEFINE_STATIC_SRCU(), and this commit therefore documents these restrictions. Reported-by: Stephen Rothwell Reported-by: kbuild test robot Suggested-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Tejun Heo --- include/linux/srcu.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f5f80c5643ac..dc8eb63c6568 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work); } /* - * define and init a srcu struct at build time. - * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ -- cgit v1.2.1 From 4b455dc3e13064795ef2cd3415132df747e64063 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Jan 2016 22:44:45 -0800 Subject: rcu: Catch up rcu_report_qs_rdp() comment with reality Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5a6d2f0cbf4..39f9c73d33c5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2354,12 +2354,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. 
*/ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) -- cgit v1.2.1 From 3500efae4410454522697c94c23fc40323c0cee9 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 19 Oct 2015 14:45:02 -0700 Subject: rcu: Remove rcu_user_hooks_switch Because there are neither uses nor intended uses for the rcu_user_hooks_switch() function that was originally intended for nohz use, this commit removes it. Signed-off-by: Yang Shi Acked-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..b5d48bd56e3f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -360,8 +360,6 @@ void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.2.1 From 4f2a848c567c72f778352d65cc7c155d1a0977fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Jan 2016 13:38:12 -0800 Subject: rcu: Export rcu_gp_is_normal() This commit exports rcu_gp_is_normal() in order to allow it to be used by rcutorture and rcuperf. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 76b94e19430b..ca828b41c938 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit v1.2.1 From 405e1133d00e0271cedef75c17ecb773ff3e2732 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 10 Feb 2016 02:03:00 -0700 Subject: x86/mm: Avoid premature success when changing page attributes set_memory_nx() (and set_memory_x()) currently differ in behavior from all other set_memory_*() functions when encountering a virtual address space hole within the kernel address range: They stop processing at the hole, but nevertheless report success (making the caller believe the operation was carried out on the entire range). While observed to be a problem - triggering the CONFIG_DEBUG_WX warning - only with out-of-tree code, I suspect (but didn't check) that on x86-64 the CONFIG_DEBUG_PAGEALLOC logic in free_init_pages() would, when called from free_initmem(), have the same effect on the set_memory_nx() called from mark_rodata_ro(). This unexpected behavior is a result of change_page_attr_set_clr() special-casing changes to only the NX bit, in that it passes "false" as the "checkalias" argument to __change_page_attr_set_clr(). Since this flag becomes the "primary" argument of both __change_page_attr() and __cpa_process_fault(), the latter would so far return success without adjusting cpa->numpages. Success to the higher-level callers, however, means that whatever cpa->numpages currently holds is the count of successfully processed pages. The cases when __change_page_attr() calls __cpa_process_fault(), on the other hand, don't generally mean the entire range got processed (as can be seen from one of the two success return paths in __cpa_process_fault() already adjusting ->numpages).
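For context, a simplified sketch (not the exact kernel code; details elided) of the iteration in __change_page_attr_set_clr() shows why this matters; the loop trusts whatever the callee leaves in cpa->numpages:

	/* Simplified sketch of the CPA loop, not the verbatim kernel code */
	while (numpages) {
		cpa->numpages = numpages;	/* callee may shrink this */
		ret = __change_page_attr(cpa, checkalias);
		if (ret)
			return ret;
		numpages -= cpa->numpages;	/* skip what was handled */
	}

If the fault path returns success without touching cpa->numpages, the subtraction wipes out the whole remainder and the walk stops early; setting cpa->numpages = 1 instead makes it skip just the faulting page and continue.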
Signed-off-by: Jan Beulich Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/56BB0AD402000078000D05BF@prv-mh.provo.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814b0069..3dd6afd2c0e5 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1122,8 +1122,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. */ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected -- cgit v1.2.1 From 0abefbaab4edbcec637e00fefcdeccb52797fe4f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:12 +0000 Subject: genirq: Add new IPI irqdomain flags These flags will be used to identify an IPI domain. We have two flavours of IPI implementations: IRQ_DOMAIN_FLAG_IPI_PER_CPU: Each CPU has its own virq and hwirq IRQ_DOMAIN_FLAG_IPI_SINGLE : A single virq and hwirq for all CPUs Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-2-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 04579d9fbce4..9bb0a9cfc1c4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -172,6 +172,12 @@ enum { /* Core calls alloc/free recursive through the domain hierarchy. */ IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), + /* Irq domain is an IPI domain with virq per cpu */ + IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), + + /* Irq domain is an IPI domain with single virq */ + IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -400,6 +406,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return domain->flags & + (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void irq_domain_activate_irq(struct irq_data *data) { } static inline void irq_domain_deactivate_irq(struct irq_data *data) { } @@ -413,6 +435,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return false; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ -- cgit v1.2.1 From 29d5c8db26ad54592436508819ac617119306f96 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:13 +0000 Subject: genirq: Add DOMAIN_BUS_IPI We need a way to search and match IPI domains. Using the new enum we can use irq_find_matching_host() to do that. 
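A hypothetical lookup using the new token (of_node is an assumed device-tree handle) might read:

	struct irq_domain *ipi_domain;

	ipi_domain = irq_find_matching_host(of_node, DOMAIN_BUS_IPI);
	if (!ipi_domain)
		pr_err("no IPI irq domain found\n");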
Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-3-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9bb0a9cfc1c4..130e1c3117c3 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,6 +74,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, + DOMAIN_BUS_IPI, }; /** -- cgit v1.2.1 From 379b656446a36c7a80b27cbdc34eec8e57b5f290 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:14 +0000 Subject: genirq: Add GENERIC_IRQ_IPI Kconfig symbol Select this to enable the generic IPI domain support. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-4-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool -- cgit v1.2.1 From 955bfe5912e7839abcc83694f06867535487404b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:17 +0000 Subject: genirq: Add an extra comment about the use of affinity in irq_common_data Affinity will have a dual meaning, depending on the type of the irq. If it is a normal irq, it'll have the standard affinity meaning. If it is an IPI, it will hold the mask of the cpus to which an IPI can be sent. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-7-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..0817afd0d719 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -133,7 +133,9 @@ struct irq_domain; * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods - * @affinity: IRQ affinity on SMP + * @affinity: IRQ affinity on SMP. If this is an IPI + * related irq, then this is the mask of the + * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor */ struct irq_common_data { -- cgit v1.2.1 From f256c9a0c54820ffef21b126f8226be2bece3dd7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:16 +0000 Subject: genirq: Add ipi_offset to irq_common_data IPIs are always assumed to be consecutively allocated, hence virqs and hwirqs can be inferred by using the CPU id as an offset. But the first cpu doesn't always have to start at offset 0. ipi_offset stores the position of the first cpu so that we can easily calculate the virq or hwirq of an IPI associated with a specific cpu.
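To make the arithmetic concrete (assumed numbers, not from this patch): if a per-cpu IPI range serves CPUs 4-7, ipi_offset is 4, and the virq for a given cpu can be recovered as in this sketch:

	/* Sketch: map a target cpu to its per-cpu IPI virq */
	static unsigned int ipi_virq_for_cpu(struct irq_data *data,
					     unsigned int cpu)
	{
		return data->irq + cpu - data->common->ipi_offset;
	}

so CPU 6 maps to data->irq + 6 - 4, the third virq of the range.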
Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-6-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 0817afd0d719..a32b47fbf874 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,6 +137,7 @@ struct irq_domain; * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor + * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int state_use_accessors; @@ -146,6 +147,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_IPI + unsigned int ipi_offset; +#endif }; /** -- cgit v1.2.1 From ac0a0cd266d1f21236d5975ca6bced9b377a2a6a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:18 +0000 Subject: genirq: Make irq_domain_alloc_descs() non static We will need to use this function to implement irq_reserve_ipi() later. So make it non-static and move the prototype to irqdomain.h to allow using it outside irqdomain.c. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-8-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 2 ++ kernel/irq/irqdomain.c | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 130e1c3117c3..c466cc17b8e9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -213,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); +extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f03e24..86811541f073 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; -- cgit v1.2.1 From d17bf24e695290d3fe7943aca52ab48098a10653 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:19 +0000 Subject: genirq: Add a new generic IPI reservation code to irq core Add a generic mechanism to dynamically allocate an IPI. Depending on the underlying implementation this creates either a single Linux irq or a consecutive range of Linux irqs. The Linux irq is used later to send IPIs to other CPUs.
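A minimal usage sketch from hypothetical arch code (assuming ipi_domain has already been located; error handling elided):

	static int setup_smp_ipis(struct irq_domain *ipi_domain)
	{
		unsigned int virq;

		virq = irq_reserve_ipi(ipi_domain, cpu_possible_mask);
		if (!virq)
			return -ENODEV;	/* reservation failed */
		/* ... request per-cpu handlers and send IPIs via virq ... */
		irq_destroy_ipi(virq);	/* release when no longer needed */
		return 0;
	}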
[ tglx: Massaged the code and removed the 'consecutive mask' restriction for the single IRQ case ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-9-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 + include/linux/irqdomain.h | 5 ++ kernel/irq/Makefile | 1 + kernel/irq/ipi.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 kernel/irq/ipi.c diff --git a/include/linux/irq.h b/include/linux/irq.h index a32b47fbf874..95f4f66f95f3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -940,4 +940,7 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ +#define INVALID_HWIRQ (~0UL) + #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c466cc17b8e9..ed48594e96d2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -344,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); +/* IPI functions */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest); +void irq_destroy_ipi(unsigned int irq); + /* V2 interfaces to support hierarchy IRQ domains. */ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..340af273429c --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,137 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include +#include + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. 
+ */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} -- cgit v1.2.1 From f9bce791ae2a1a10a965b30427f5507c1a77669f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:20 +0000 Subject: genirq: Add a new function to get IPI reverse mapping When dealing with coprocessors we need to find out the actual hwirqs values to pass on to the firmware so that it knows what it needs to use to receive IPIs from and send IPIs to Linux cpus. [ tglx: Fixed the single hwirq IPI case. 
The hardware irq number does not change due to the cpu number ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-10-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/ipi.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 95f4f66f95f3..10273dce058a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -942,5 +942,6 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 340af273429c..6f34f2930bc0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -135,3 +135,37 @@ void irq_destroy_ipi(unsigned int irq) irq_domain_free_irqs(irq, nr_irqs); } + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of this. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); -- cgit v1.2.1 From 34dc1ae101018dbb50e1d04e88aa89052802a7db Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:21 +0000 Subject: genirq: Add send_ipi callbacks to irq_chip Introduce the new callbacks which can be used by the core code to implement a generic IPI send mechanism. 
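For illustration only (hypothetical driver; foo_doorbell_addr() is an assumed helper returning the target CPU's doorbell register as void __iomem *), a chip might wire up the single-CPU callback like this:

	static void foo_ipi_send_single(struct irq_data *d, unsigned int cpu)
	{
		/* ring the target CPU's doorbell with this IPI's hwirq */
		writel((u32)irqd_to_hwirq(d), foo_doorbell_addr(cpu));
	}

	static struct irq_chip foo_ipi_chip = {
		.name			= "foo-ipi",
		.ipi_send_single	= foo_ipi_send_single,
	};

The core code can then fall back from a mask send to repeated single sends (or vice versa), as the following patch implements.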
Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-11-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 10273dce058a..3b3a5b817469 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -347,6 +347,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine + * @ipi_send_single: send a single IPI to destination cpus + * @ipi_send_mask: send an IPI to destination cpus in cpumask * @flags: chip specific flags */ struct irq_chip { @@ -391,6 +393,9 @@ struct irq_chip { int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); + void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); + unsigned long flags; }; -- cgit v1.2.1 From 3b8e29a82dd16c1f2061e0b955a71cd36eeb061b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:22 +0000 Subject: genirq: Implement ipi_send_mask/single() Add APIs to send IPIs from driver and arch code. We have different functions because we allow architecture code to cache the irq descriptor to avoid lookups. Driver code has to use the irq number and is subject to more restrictive checks. [ tglx: Polish the implementation ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-12-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++ kernel/irq/ipi.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 160 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 3b3a5b817469..d5ebd94822d2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -948,5 +948,9 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); +int ipi_send_single(unsigned int virq, unsigned int cpu); +int ipi_send_mask(unsigned int virq, const struct cpumask *dest); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 6f34f2930bc0..c37f34b00a11 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -161,7 +161,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) * Get the real hardware irq number if the underlying implementation * uses a seperate irq per cpu. If the underlying implementation uses * a single hardware irq for all cpus then the IPI send mechanism - * needs to take care of this. + * needs to take care of the cpu destinations. */ if (irq_domain_is_ipi_per_cpu(data->domain)) data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); @@ -169,3 +169,158 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) return data ? 
irqd_to_hwirq(data) : INVALID_HWIRQ; } EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. 
+ */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); -- cgit v1.2.1 From 2af70a962070fd2e6f3b1a259b652faa3fd1a122 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:23 +0000 Subject: irqchip/mips-gic: Add a IPI hierarchy domain Add a new ipi domain on top of the normal domain. MIPS GIC now supports dynamic allocation of an IPI. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-13-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/Kconfig | 1 + drivers/irqchip/irq-mips-gic.c | 184 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 180 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 0ffbdfdc4170..71e648adc3fd 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -209,6 +209,7 @@ config KEYSTONE_IRQ config MIPS_GIC bool + select IRQ_DOMAIN_HIERARCHY select MIPS_CM config INGENIC_IRQ diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9e17ef27a183..99f01ca1869a 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -29,16 +29,31 @@ struct gic_pcpu_mask { DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS); }; +struct gic_irq_spec { + enum { + GIC_DEVICE, + GIC_IPI + } type; + + union { + struct cpumask *ipimask; + unsigned int hwirq; + }; +}; + static unsigned long __gic_base_addr; + static void __iomem *gic_base; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; +static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; static int gic_vpes; static unsigned int gic_cpu_pin; static unsigned int timer_cpu_pin; static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; +DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); static void __gic_irq_dispatch(void); @@ -753,7 +768,7 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, } static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) + irq_hw_number_t hw, unsigned int vpe) { int intr = GIC_HWIRQ_TO_SHARED(hw); unsigned long flags; @@ -763,9 +778,8 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); gic_map_to_pin(intr, gic_cpu_pin); - /* Map to VPE 0 by default */ - gic_map_to_vpe(intr, 0); - set_bit(intr, pcpu_masks[0].pcpu_mask); + 
gic_map_to_vpe(intr, vpe); + set_bit(intr, pcpu_masks[vpe].pcpu_mask); spin_unlock_irqrestore(&gic_lock, flags); return 0; @@ -776,7 +790,7 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, { if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS) return gic_local_irq_domain_map(d, virq, hw); - return gic_shared_irq_domain_map(d, virq, hw); + return gic_shared_irq_domain_map(d, virq, hw, 0); } static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, @@ -798,9 +812,157 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, return 0; } +static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct gic_irq_spec *spec = arg; + irq_hw_number_t hwirq, base_hwirq; + int cpu, ret, i; + + if (spec->type == GIC_DEVICE) { + /* verify that it doesn't conflict with an IPI irq */ + if (test_bit(spec->hwirq, ipi_resrv)) + return -EBUSY; + } else { + base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs); + if (base_hwirq == gic_shared_intrs) { + return -ENOMEM; + } + + /* check that we have enough space */ + for (i = base_hwirq; i < nr_irqs; i++) { + if (!test_bit(i, ipi_resrv)) + return -EBUSY; + } + bitmap_clear(ipi_resrv, base_hwirq, nr_irqs); + + /* map the hwirq for each cpu consecutively */ + i = 0; + for_each_cpu(cpu, spec->ipimask) { + hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq, + &gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu); + if (ret) + goto error; + + i++; + } + + /* + * tell the parent about the base hwirq we allocated so it can + * set its own domain data + */ + spec->hwirq = base_hwirq; + } + + return 0; +error: + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); + return ret; +} + +void gic_irq_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_hw_number_t base_hwirq; + struct irq_data *data; + + data = irq_get_irq_data(virq); + if (!data) + return; + + base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data)); + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); +} + static const struct irq_domain_ops gic_irq_domain_ops = { .map = gic_irq_domain_map, .xlate = gic_irq_domain_xlate, + .alloc = gic_irq_domain_alloc, + .free = gic_irq_domain_free, +}; + +static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_type) +{ + /* + * There's nothing to translate here. hwirq is dynamically allocated and + * the irq type is always edge triggered. 
+ * */ + *out_hwirq = 0; + *out_type = IRQ_TYPE_EDGE_RISING; + + return 0; +} + +static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct cpumask *ipimask = arg; + struct gic_irq_spec spec = { + .type = GIC_IPI, + .ipimask = ipimask + }; + int ret, i; + + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + + /* the parent should have set spec.hwirq to the base_hwirq it allocated */ + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + GIC_SHARED_TO_HWIRQ(spec.hwirq + i), + &gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING); + if (ret) + goto error; + } + + return 0; +error: + irq_domain_free_irqs_parent(d, virq, nr_irqs); + return ret; +} + +void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_domain_free_irqs_parent(d, virq, nr_irqs); +} + +int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + bool is_ipi; + + switch (bus_token) { + case DOMAIN_BUS_IPI: + is_ipi = d->bus_token == bus_token; + return to_of_node(d->fwnode) == node && is_ipi; + break; + default: + return 0; + } +} + +static struct irq_domain_ops gic_ipi_domain_ops = { + .xlate = gic_ipi_domain_xlate, + .alloc = gic_ipi_domain_alloc, + .free = gic_ipi_domain_free, + .match = gic_ipi_domain_match, }; static void __init __gic_init(unsigned long gic_base_addr, @@ -864,6 +1026,18 @@ static void __init __gic_init(unsigned long gic_base_addr, if (!gic_irq_domain) panic("Failed to add GIC IRQ domain"); + gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain, + IRQ_DOMAIN_FLAG_IPI_PER_CPU, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_ipi_domain_ops, NULL); + if (!gic_ipi_domain) + panic("Failed to add GIC IPI domain"); + + gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; + + /* Make the last 2 * NR_CPUS available for IPIs */ + bitmap_set(ipi_resrv, gic_shared_intrs - 2 * NR_CPUS, 2 * NR_CPUS); + gic_basic_init(); gic_ipi_init(); -- cgit v1.2.1 From c98c1822ee13e4501bf48a9e3184fb9a84c149c0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:24 +0000 Subject: irqchip/mips-gic: Add device hierarchy domain Now the root gic_irq_domain is split into device and IPI domains. This form provides a better representation of how the root domain is split into 2. One for devices and one for IPIs. 
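The child domains introduced here follow the standard hierarchical-domain shape: delegate the actual hwirq reservation to the parent, then install the chip and translation at the child level. A generic sketch of that pattern (child_base_hwirq() and child_irq_chip are illustrative names, not taken from the patch):

static int child_domain_alloc(struct irq_domain *d, unsigned int virq,
                              unsigned int nr_irqs, void *arg)
{
        int ret, i;

        /* Let the parent (root) domain reserve the hardware resources */
        ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
        if (ret)
                return ret;

        /* Then bind this level's chip to each descriptor */
        for (i = 0; i < nr_irqs; i++) {
                ret = irq_domain_set_hwirq_and_chip(d, virq + i,
                                                    child_base_hwirq(arg) + i,
                                                    &child_irq_chip, NULL);
                if (ret)
                        return ret;
        }
        return 0;
}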
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-14-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mips-gic.c | 103 +++++++++++++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 20 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 99f01ca1869a..794fc5949c9f 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -47,6 +47,7 @@ static void __iomem *gic_base; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; +static struct irq_domain *gic_dev_domain; static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; static int gic_vpes; @@ -793,25 +794,6 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, return gic_shared_irq_domain_map(d, virq, hw, 0); } -static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, - unsigned int *out_type) -{ - if (intsize != 3) - return -EINVAL; - - if (intspec[0] == GIC_SHARED) - *out_hwirq = GIC_SHARED_TO_HWIRQ(intspec[1]); - else if (intspec[0] == GIC_LOCAL) - *out_hwirq = GIC_LOCAL_TO_HWIRQ(intspec[1]); - else - return -EINVAL; - *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; - - return 0; -} - static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, void *arg) { @@ -881,11 +863,86 @@ void gic_irq_domain_free(struct irq_domain *d, unsigned int virq, bitmap_set(ipi_resrv, base_hwirq, nr_irqs); } +int gic_irq_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + /* this domain should'nt be accessed directly */ + return 0; +} + static const struct irq_domain_ops gic_irq_domain_ops = { .map = gic_irq_domain_map, - .xlate = gic_irq_domain_xlate, .alloc = gic_irq_domain_alloc, .free = gic_irq_domain_free, + .match = gic_irq_domain_match, +}; + +static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_type) +{ + if (intsize != 3) + return -EINVAL; + + if (intspec[0] == GIC_SHARED) + *out_hwirq = GIC_SHARED_TO_HWIRQ(intspec[1]); + else if (intspec[0] == GIC_LOCAL) + *out_hwirq = GIC_LOCAL_TO_HWIRQ(intspec[1]); + else + return -EINVAL; + *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; + + return 0; +} + +static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + struct gic_irq_spec spec = { + .type = GIC_DEVICE, + .hwirq = fwspec->param[1], + }; + int i, ret; + bool is_shared = fwspec->param[0] == GIC_SHARED; + + if (is_shared) { + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + } + + for (i = 0; i < nr_irqs; i++) { + irq_hw_number_t hwirq; + + if (is_shared) + hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i); + else + hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + hwirq, + &gic_level_irq_controller, + NULL); + if (ret) + return ret; + } + + return 0; +} + +void gic_dev_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + /* no real allocation is done for dev irqs, so no need to free anything */ + return; +} + +static struct irq_domain_ops 
gic_dev_domain_ops = { + .xlate = gic_dev_domain_xlate, + .alloc = gic_dev_domain_alloc, + .free = gic_dev_domain_free, }; static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, @@ -1026,6 +1083,12 @@ static void __init __gic_init(unsigned long gic_base_addr, if (!gic_irq_domain) panic("Failed to add GIC IRQ domain"); + gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_dev_domain_ops, NULL); + if (!gic_dev_domain) + panic("Failed to add GIC DEV domain"); + gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain, IRQ_DOMAIN_FLAG_IPI_PER_CPU, GIC_NUM_LOCAL_INTRS + gic_shared_intrs, -- cgit v1.2.1 From 2a07870511829977d02609dac6450017b0419ea9 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:25 +0000 Subject: irqchip/mips-gic: Use gic_vpes instead of NR_CPUS NR_CPUS is set by Kconfig and can be much higher than the number of CPUs actually present in the system. gic_vpes is a true representation of the number of CPUs in the system, so use it instead. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-15-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 794fc5949c9f..1fe73a191120 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -465,7 +465,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp))); /* Update the pcpu_masks */ - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < gic_vpes; i++) clear_bit(irq, pcpu_masks[i].pcpu_mask); set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask); @@ -1098,8 +1098,8 @@ static void __init __gic_init(unsigned long gic_base_addr, gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; - /* Make the last 2 * NR_CPUS available for IPIs */ - bitmap_set(ipi_resrv, gic_shared_intrs - 2 * NR_CPUS, 2 * NR_CPUS); + /* Make the last 2 * gic_vpes available for IPIs */ + bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); gic_basic_init(); -- cgit v1.2.1 From 78930f09b9406db029c69dcebad10cd7b6e06fae Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:26 +0000 Subject: irqchip/mips-gic: Clear percpu_masks correctly when mapping When setting the mapping for a hwirq, make sure we clear the percpu_masks for all other CPUs in case the bit was set previously.
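The bug being fixed is a stale-mask leak: if an interrupt was previously routed to another VPE, its bit stayed set in that VPE's mask. The hunk below restores the invariant that a shared interrupt appears in exactly one per-cpu mask, following the usual clear-all-then-set pattern:

        /* Route intr to vpe and make that VPE the only receiver */
        for (i = 0; i < gic_vpes; i++)
                clear_bit(intr, pcpu_masks[i].pcpu_mask);
        set_bit(intr, pcpu_masks[vpe].pcpu_mask);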
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-16-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mips-gic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 1fe73a191120..83395bf834c8 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -773,6 +773,7 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, { int intr = GIC_HWIRQ_TO_SHARED(hw); unsigned long flags; + int i; irq_set_chip_and_handler(virq, &gic_level_irq_controller, handle_level_irq); @@ -780,6 +781,8 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); gic_map_to_pin(intr, gic_cpu_pin); gic_map_to_vpe(intr, vpe); + for (i = 0; i < gic_vpes; i++) + clear_bit(intr, pcpu_masks[i].pcpu_mask); set_bit(intr, pcpu_masks[vpe].pcpu_mask); spin_unlock_irqrestore(&gic_lock, flags); -- cgit v1.2.1 From fbde2d7d8290d8c642d591a471356abda2446874 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:27 +0000 Subject: MIPS: Add generic SMP IPI support Use the new generic IPI layer to provide generic SMP IPI support if the irqchip supports it. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-17-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- arch/mips/kernel/smp.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index bd4385a8e6e8..c012e1903f6b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -33,12 +33,16 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -79,6 +83,11 @@ static cpumask_t cpu_core_setup_map; cpumask_t cpu_coherent_mask; +#ifdef CONFIG_GENERIC_IRQ_IPI +static struct irq_desc *call_desc; +static struct irq_desc *sched_desc; +#endif + static inline void set_cpu_sibling_map(int cpu) { int i; @@ -145,6 +154,133 @@ void register_smp_ops(struct plat_smp_ops *ops) mp_ops = ops; } +#ifdef CONFIG_GENERIC_IRQ_IPI +void mips_smp_send_ipi_single(int cpu, unsigned int action) +{ + mips_smp_send_ipi_mask(cpumask_of(cpu), action); +} + +void mips_smp_send_ipi_mask(const struct cpumask *mask, unsigned int action) +{ + unsigned long flags; + unsigned int core; + int cpu; + + local_irq_save(flags); + + switch (action) { + case SMP_CALL_FUNCTION: + __ipi_send_mask(call_desc, mask); + break; + + case SMP_RESCHEDULE_YOURSELF: + __ipi_send_mask(sched_desc, mask); + break; + + default: + BUG(); + } + + if (mips_cpc_present()) { + for_each_cpu(cpu, mask) { + core = cpu_data[cpu].core; + + if (core == current_cpu_data.core) + continue; + + while (!cpumask_test_cpu(cpu, &cpu_coherent_mask)) { + mips_cpc_lock_other(core); + write_cpc_co_cmd(CPC_Cx_CMD_PWRUP); + mips_cpc_unlock_other(); + } + } + } + + local_irq_restore(flags); +} + + +static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) +{ + scheduler_ipi(); + + return IRQ_HANDLED; +} + +static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) +{ + generic_smp_call_function_interrupt(); + + return IRQ_HANDLED; +} + +static struct irqaction irq_resched = { + .handler = ipi_resched_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI 
resched" +}; + +static struct irqaction irq_call = { + .handler = ipi_call_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI call" +}; + +static __init void smp_ipi_init_one(unsigned int virq, + struct irqaction *action) +{ + int ret; + + irq_set_handler(virq, handle_percpu_irq); + ret = setup_irq(virq, action); + BUG_ON(ret); +} + +static int __init mips_smp_ipi_init(void) +{ + unsigned int call_virq, sched_virq; + struct irq_domain *ipidomain; + struct device_node *node; + + node = of_irq_find_parent(of_root); + ipidomain = irq_find_matching_host(node, DOMAIN_BUS_IPI); + + /* + * Some platforms have half DT setup. So if we found irq node but + * didn't find an ipidomain, try to search for one that is not in the + * DT. + */ + if (node && !ipidomain) + ipidomain = irq_find_matching_host(NULL, DOMAIN_BUS_IPI); + + BUG_ON(!ipidomain); + + call_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!call_virq); + + sched_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!sched_virq); + + if (irq_domain_is_ipi_per_cpu(ipidomain)) { + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) { + smp_ipi_init_one(call_virq + cpu, &irq_call); + smp_ipi_init_one(sched_virq + cpu, &irq_resched); + } + } else { + smp_ipi_init_one(call_virq, &irq_call); + smp_ipi_init_one(sched_virq, &irq_resched); + } + + call_desc = irq_to_desc(call_virq); + sched_desc = irq_to_desc(sched_virq); + + return 0; +} +early_initcall(mips_smp_ipi_init); +#endif + /* * First C code run on the secondary CPUs after being started up by * the master. -- cgit v1.2.1 From bb11cff327e54179c13446c4022ed4ed7d4871c7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:28 +0000 Subject: MIPS: Make smp CMP, CPS and MT use the new generic IPI functions This commit does several things to avoid breaking bisectability. 1- Remove IPI init code from irqchip/mips-gic 2- Implement the new irqchip->send_ipi() in irqchip/mips-gic 3- Select GENERIC_IRQ_IPI Kconfig symbol for MIPS_GIC 4- Change MIPS SMP to use the generic IPI implementation Only the SMP variants that use GIC were converted as it's the only irqchip that will have the support for generic IPI for now. 
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-18-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- arch/mips/include/asm/smp-ops.h | 5 ++- arch/mips/kernel/smp-cmp.c | 4 +- arch/mips/kernel/smp-cps.c | 4 +- arch/mips/kernel/smp-mt.c | 2 +- drivers/irqchip/Kconfig | 1 + drivers/irqchip/irq-mips-gic.c | 86 +++------------------------------------- include/linux/irqchip/mips-gic.h | 3 -- 7 files changed, 14 insertions(+), 91 deletions(-) diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h index 6ba1fb8b11e2..db7c322f057f 100644 --- a/arch/mips/include/asm/smp-ops.h +++ b/arch/mips/include/asm/smp-ops.h @@ -44,8 +44,9 @@ static inline void plat_smp_setup(void) mp_ops->smp_setup(); } -extern void gic_send_ipi_single(int cpu, unsigned int action); -extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action); +extern void mips_smp_send_ipi_single(int cpu, unsigned int action); +extern void mips_smp_send_ipi_mask(const struct cpumask *mask, + unsigned int action); #else /* !CONFIG_SMP */ diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index d5e0f949dc48..76923349b4fe 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus) } struct plat_smp_ops cmp_smp_ops = { - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, .init_secondary = cmp_init_secondary, .smp_finish = cmp_smp_finish, .boot_secondary = cmp_boot_secondary, diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index 2ad4e4c96d61..253e1409338c 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = { .boot_secondary = cps_boot_secondary, .init_secondary = cps_init_secondary, .smp_finish = cps_smp_finish, - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = cps_cpu_disable, .cpu_die = cps_cpu_die, diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 86311a164ef1..4f9570a57e8d 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action) #ifdef CONFIG_MIPS_GIC if (gic_present) { - gic_send_ipi_single(cpu, action); + mips_smp_send_ipi_single(cpu, action); return; } #endif diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 71e648adc3fd..00bbec6eca0b 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -209,6 +209,7 @@ config KEYSTONE_IRQ config MIPS_GIC bool + select GENERIC_IRQ_IPI select IRQ_DOMAIN_HIERARCHY select MIPS_CM diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 83395bf834c8..37831a557bcb 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -280,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set) GIC_VPE_EIC_SS(irq), set); } -void gic_send_ipi(unsigned int intr) +static void gic_send_ipi(struct irq_data *d, unsigned int cpu) { - gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr)); + irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d)); + + gic_write(GIC_REG(SHARED, 
GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq)); } int gic_get_c0_compare_int(void) @@ -495,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = { #ifdef CONFIG_SMP .irq_set_affinity = gic_set_affinity, #endif + .ipi_send_single = gic_send_ipi, }; static void gic_handle_local_int(bool chained) @@ -588,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc) gic_handle_shared_int(true); } -#ifdef CONFIG_MIPS_GIC_IPI -static int gic_resched_int_base; -static int gic_call_int_base; - -unsigned int plat_ipi_resched_int_xlate(unsigned int cpu) -{ - return gic_resched_int_base + cpu; -} - -unsigned int plat_ipi_call_int_xlate(unsigned int cpu) -{ - return gic_call_int_base + cpu; -} - -static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) -{ - scheduler_ipi(); - - return IRQ_HANDLED; -} - -static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) -{ - generic_smp_call_function_interrupt(); - - return IRQ_HANDLED; -} - -static struct irqaction irq_resched = { - .handler = ipi_resched_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI resched" -}; - -static struct irqaction irq_call = { - .handler = ipi_call_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI call" -}; - -static __init void gic_ipi_init_one(unsigned int intr, int cpu, - struct irqaction *action) -{ - int virq = irq_create_mapping(gic_irq_domain, - GIC_SHARED_TO_HWIRQ(intr)); - int i; - - gic_map_to_vpe(intr, mips_cm_vp_id(cpu)); - for (i = 0; i < NR_CPUS; i++) - clear_bit(intr, pcpu_masks[i].pcpu_mask); - set_bit(intr, pcpu_masks[cpu].pcpu_mask); - - irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING); - - irq_set_handler(virq, handle_percpu_irq); - setup_irq(virq, action); -} - -static __init void gic_ipi_init(void) -{ - int i; - - /* Use last 2 * NR_CPUS interrupts as IPIs */ - gic_resched_int_base = gic_shared_intrs - nr_cpu_ids; - gic_call_int_base = gic_resched_int_base - nr_cpu_ids; - - for (i = 0; i < nr_cpu_ids; i++) { - gic_ipi_init_one(gic_call_int_base + i, i, &irq_call); - gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched); - } -} -#else -static inline void gic_ipi_init(void) -{ -} -#endif - static void __init gic_basic_init(void) { unsigned int i; @@ -1105,8 +1031,6 @@ static void __init __gic_init(unsigned long gic_base_addr, bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); gic_basic_init(); - - gic_ipi_init(); } void __init gic_init(unsigned long gic_base_addr, diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index ce824db48d64..80f89e4a29ac 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt); extern void gic_write_cpu_compare(cycle_t cnt, int cpu); extern void gic_start_count(void); extern void gic_stop_count(void); -extern void gic_send_ipi(unsigned int intr); -extern unsigned int plat_ipi_call_int_xlate(unsigned int); -extern unsigned int plat_ipi_resched_int_xlate(unsigned int); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -- cgit v1.2.1 From 7eb8c99db26cc6499bfb1eba72dffc4730570752 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:29 +0000 Subject: MIPS: Delete smp-gic.c We now have a generic IPI layer that will use GIC automatically if it's compiled in. 
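With the generic layer in place, platform code no longer talks to the GIC directly to raise an IPI; it reserves a virq once and sends through the core APIs introduced earlier in this series (illustrative usage):

        /* At boot: reserve one IPI across all possible CPUs */
        unsigned int virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);

        /* In a hot path: send by cached descriptor ... */
        __ipi_send_single(irq_to_desc(virq), cpu);

        /* ... or, from driver code, by virq with full checking */
        ipi_send_single(virq, cpu);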
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-19-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- arch/mips/Kconfig | 6 ------ arch/mips/kernel/Makefile | 1 - 2 files changed, 7 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index eb079ac36dc8..ddcbeda501ae 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2170,7 +2170,6 @@ config MIPS_MT_SMP select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI select SYNC_R4K - select MIPS_GIC_IPI select MIPS_MT select SMP select SMP_UP @@ -2268,7 +2267,6 @@ config MIPS_VPE_APSP_API_MT config MIPS_CMP bool "MIPS CMP framework support (DEPRECATED)" depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6 - select MIPS_GIC_IPI select SMP select SYNC_R4K select SYS_SUPPORTS_SMP @@ -2288,7 +2286,6 @@ config MIPS_CPS select MIPS_CM select MIPS_CPC select MIPS_CPS_PM if HOTPLUG_CPU - select MIPS_GIC_IPI select SMP select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU @@ -2306,9 +2303,6 @@ config MIPS_CPS_PM select MIPS_CPC bool -config MIPS_GIC_IPI - bool - config MIPS_CM bool diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 68e2b7db9348..b0988fd62fcc 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -52,7 +52,6 @@ obj-$(CONFIG_MIPS_MT_SMP) += smp-mt.o obj-$(CONFIG_MIPS_CMP) += smp-cmp.o obj-$(CONFIG_MIPS_CPS) += smp-cps.o cps-vec.o obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o -obj-$(CONFIG_MIPS_GIC_IPI) += smp-gic.o obj-$(CONFIG_MIPS_SPRAM) += spram.o obj-$(CONFIG_MIPS_VPE_LOADER) += vpe.o -- cgit v1.2.1 From 16a8083cedbe628228dbb08fc1469c70e6208619 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:30 +0000 Subject: irqchip/mips-gic: Add new DT property to reserve IPIs The new property allows specifying the range of GIC hwirqs to use for IPIs. This is an optional property. We preserve the previous behaviour of allocating the last 2 * gic_vpes interrupts if it is not specified or DT is not supported. Signed-off-by: Qais Yousef Acked-by: Rob Herring Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-20-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- .../devicetree/bindings/interrupt-controller/mips-gic.txt | 7 +++++++ drivers/irqchip/irq-mips-gic.c | 12 ++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt index aae4c384ee1f..173595305e26 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt @@ -23,6 +23,12 @@ Optional properties: - mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors to which the GIC may not route interrupts. Valid values are 2 - 7. This property is ignored if the CPU is started in EIC mode. +- mti,reserved-ipi-vectors : Specifies the range of GIC interrupts that are + reserved for IPIs. + It accepts 2 values, the 1st is the starting interrupt and the 2nd is the size + of the reserved range. + If not specified, the driver will allocate the last 2 * number of VPEs in the + system. Required properties for timer sub-node: - compatible : Should be "mti,gic-timer".
@@ -44,6 +50,7 @@ Example: #interrupt-cells = <3>; mti,reserved-cpu-vectors = <7>; + mti,reserved-ipi-vectors = <40 8>; timer { compatible = "mti,gic-timer"; diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 37831a557bcb..94a30da0cfac 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -957,6 +957,7 @@ static void __init __gic_init(unsigned long gic_base_addr, struct device_node *node) { unsigned int gicconfig; + unsigned int v[2]; __gic_base_addr = gic_base_addr; @@ -1027,8 +1028,15 @@ static void __init __gic_init(unsigned long gic_base_addr, gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; - /* Make the last 2 * gic_vpes available for IPIs */ - bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); + if (node && + !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) { + bitmap_set(ipi_resrv, v[0], v[1]); + } else { + /* Make the last 2 * gic_vpes available for IPIs */ + bitmap_set(ipi_resrv, + gic_shared_intrs - 2 * gic_vpes, + 2 * gic_vpes); + } gic_basic_init(); } -- cgit v1.2.1 From e392d603f61767cb2de4d82bb55a035918f8342c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 1 Feb 2016 12:00:48 +0000 Subject: clocksource/drivers/arm_arch_timer: Enable and verify MMIO access So far, we have been blindly assuming that having access to a memory-mapped timer frame implies that the individual elements of that frame are already enabled. Whilst it's the firmware's job to give us non-secure access to frames in the first place, we should not rely on implementations always being generous enough to also configure CNTACR for those non-secure frames (e.g. [1]). Explicitly enable feature-level access per-frame, and verify that the access we want is really implemented before trying to make use of it. [1]: https://github.com/ARM-software/tf-issues/issues/170 Acked-by: Mark Rutland Reviewed-by: Stephen Boyd Tested-by: Stephen Boyd Acked-by: Marc Zyngier Signed-off-by: Robin Murphy Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index c64d543d64bf..7c567f0c87f5 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -32,6 +32,14 @@ #define CNTTIDR 0x08 #define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + #define CNTVCT_LO 0x08 #define CNTVCT_HI 0x0c #define CNTFRQ 0x10 @@ -757,7 +765,6 @@ static void __init arch_timer_mem_init(struct device_node *np) } cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - iounmap(cntctlbase); /* * Try to find a virtual capable frame.
Otherwise fall back to a @@ -765,20 +772,31 @@ static void __init arch_timer_mem_init(struct device_node *np) */ for_each_available_child_of_node(np, frame) { int n; + u32 cntacr; if (of_property_read_u32(frame, "frame-number", &n)) { pr_err("arch_timer: Missing frame-number\n"); - of_node_put(best_frame); of_node_put(frame); - return; + goto out; } - if (cnttidr & CNTTIDR_VIRT(n)) { + /* Try enabling everything, and see what sticks */ + cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | + CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; + writel_relaxed(cntacr, cntctlbase + CNTACR(n)); + cntacr = readl_relaxed(cntctlbase + CNTACR(n)); + + if ((cnttidr & CNTTIDR_VIRT(n)) && + !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { of_node_put(best_frame); best_frame = frame; arch_timer_mem_use_virtual = true; break; } + + if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) + continue; + of_node_put(best_frame); best_frame = of_node_get(frame); } @@ -786,24 +804,26 @@ static void __init arch_timer_mem_init(struct device_node *np) base = arch_counter_base = of_iomap(best_frame, 0); if (!base) { pr_err("arch_timer: Can't map frame's registers\n"); - of_node_put(best_frame); - return; + goto out; } if (arch_timer_mem_use_virtual) irq = irq_of_parse_and_map(best_frame, 1); else irq = irq_of_parse_and_map(best_frame, 0); - of_node_put(best_frame); + if (!irq) { pr_err("arch_timer: Frame missing %s irq", arch_timer_mem_use_virtual ? "virt" : "phys"); - return; + goto out; } arch_timer_detect_rate(base, np); arch_timer_mem_register(base, irq); arch_timer_common_init(); +out: + iounmap(cntctlbase); + of_node_put(best_frame); } CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_init); -- cgit v1.2.1 From 522ed95c26cd03b768018e441bbc0ff656e30fe4 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 15 Feb 2016 09:02:09 +0800 Subject: clocksource/drivers/rockchip: Add err handle for rk_timer_init Currently the rockchip timer does not perform any cleanup when it fails to initialize. Add an error handling routine that unwinds correctly in all the error cases.
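The fix applies the kernel's usual unwind idiom: acquire resources in order, release them in reverse order through cascading labels. A generic sketch of the pattern (the DEMO_PHYS_BASE address and resource names are illustrative):

static int __init demo_init(void)
{
        void __iomem *base;
        struct clk *clk;

        base = ioremap(DEMO_PHYS_BASE, SZ_4K); /* hypothetical address */
        if (!base)
                return -ENXIO;

        clk = clk_get(NULL, "demo");
        if (IS_ERR(clk))
                goto out_unmap;

        if (clk_prepare_enable(clk))
                goto out_put;

        return 0;

out_put:
        clk_put(clk);           /* undo in reverse order of acquisition */
out_unmap:
        iounmap(base);
        return -ENODEV;
}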
Signed-off-by: Shawn Lin Signed-off-by: Daniel Lezcano --- drivers/clocksource/rockchip_timer.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c index 8c77a529d0d4..b991b288c803 100644 --- a/drivers/clocksource/rockchip_timer.c +++ b/drivers/clocksource/rockchip_timer.c @@ -122,23 +122,23 @@ static void __init rk_timer_init(struct device_node *np) pclk = of_clk_get_by_name(np, "pclk"); if (IS_ERR(pclk)) { pr_err("Failed to get pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } if (clk_prepare_enable(pclk)) { pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } timer_clk = of_clk_get_by_name(np, "timer"); if (IS_ERR(timer_clk)) { pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME); - return; + goto out_timer_clk; } if (clk_prepare_enable(timer_clk)) { pr_err("Failed to enable timer clock\n"); - return; + goto out_timer_clk; } bc_timer.freq = clk_get_rate(timer_clk); @@ -146,7 +146,7 @@ static void __init rk_timer_init(struct device_node *np) irq = irq_of_parse_and_map(np, 0); if (!irq) { pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME); - return; + goto out_irq; } ce->name = TIMER_NAME; @@ -164,10 +164,19 @@ static void __init rk_timer_init(struct device_node *np) ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce); if (ret) { pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret); - return; + goto out_irq; } clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX); + + return; + +out_irq: + clk_disable_unprepare(timer_clk); +out_timer_clk: + clk_disable_unprepare(pclk); +out_unmap: + iounmap(bc_timer.base); } CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init); -- cgit v1.2.1 From 751db1a6eaec3d266ccfe3f0d11323b7c82486bf Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:25 -0300 Subject: clocksource/drivers/lpc32xx: Don't use the prescaler counter for clockevents This commit switches the current clockevents one-shot implementation away from using the prescaler counter. The clockevents timer currently uses MR0=1, PR=ticks; after this commit it uses MR0=ticks, PR=0. While using the prescaler with PR=1 works fine in one-shot mode, it seems it doesn't work as expected in periodic mode. By using only the match channel 0 register (MR0) for the timer, we make the periodic mode introduction easier and consistent with one-shot mode.
+ * After setup the timer is released from reset and enabled. */ writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); - writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); + writel_relaxed(delta, ddata->base + LPC32XX_TIMER_MR0); writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); return 0; @@ -210,13 +209,13 @@ static int __init lpc32xx_clockevent_init(struct device_node *np) /* * Disable timer and clear any pending interrupt (IR) on match - * channel 0 (MR0). Configure a compare match value of 1 on MR0 - * and enable interrupt, reset on match and stop on match (MCR). + * channel 0 (MR0). Clear the prescaler as it's not used. + * Enable interrupt, reset on match and stop on match (MCR). */ writel_relaxed(0, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_PR); writel_relaxed(0, base + LPC32XX_TIMER_CTCR); writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); - writel_relaxed(1, base + LPC32XX_TIMER_MR0); writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); -- cgit v1.2.1 From 32f32d982f655dd191858406b11e50219a0ee01e Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:26 -0300 Subject: clocksource/drivers/lpc32xx: Support periodic mode This commit adds the support for periodic mode. This is done by not setting the MR0S (Stop on TnMR0) bit on MCR, thus allowing interrupts to be periodically generated on MR0 matches. In order to do this, move the initial configuration that is specific to the one-shot mode to set_state_oneshot(). Signed-off-by: Ezequiel Garcia Signed-off-by: Daniel Lezcano Reviewed-by: Joachim Eastwood Tested-by: Joachim Eastwood --- drivers/clocksource/time-lpc32xx.c | 39 +++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 50d1a63cbe1e..5694eddade15 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -43,6 +43,7 @@ struct lpc32xx_clock_event_ddata { struct clock_event_device evtdev; void __iomem *base; + u32 ticks_per_jiffy; }; /* Needed for the sched clock */ @@ -85,11 +86,39 @@ static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev) static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) { + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + /* * When using oneshot, we must also disable the timer * to wait for the first call to set_next_event(). */ - return lpc32xx_clkevt_shutdown(evtdev); + writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR); + + /* Enable interrupt, reset on match and stop on match (MCR). */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | + LPC32XX_TIMER_MCR_MR0S, ddata->base + LPC32XX_TIMER_MCR); + return 0; +} + +static int lpc32xx_clkevt_periodic(struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* Enable interrupt and reset on match. */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R, + ddata->base + LPC32XX_TIMER_MCR); + + /* + * Place timer in reset and program the delta in the match + * channel 0 (MR0). 
+ */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); + writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); + + return 0; } static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) @@ -107,11 +136,13 @@ static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { .evtdev = { .name = "lpc3220 clockevent", - .features = CLOCK_EVT_FEAT_ONESHOT, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERIODIC, .rating = 300, .set_next_event = lpc32xx_clkevt_next_event, .set_state_shutdown = lpc32xx_clkevt_shutdown, .set_state_oneshot = lpc32xx_clkevt_oneshot, + .set_state_periodic = lpc32xx_clkevt_periodic, }, }; @@ -210,17 +241,15 @@ static int __init lpc32xx_clockevent_init(struct device_node *np) /* * Disable timer and clear any pending interrupt (IR) on match * channel 0 (MR0). Clear the prescaler as it's not used. - * Enable interrupt, reset on match and stop on match (MCR). */ writel_relaxed(0, base + LPC32XX_TIMER_TCR); writel_relaxed(0, base + LPC32XX_TIMER_PR); writel_relaxed(0, base + LPC32XX_TIMER_CTCR); writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); - writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | - LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); rate = clk_get_rate(clk); lpc32xx_clk_event_ddata.base = base; + lpc32xx_clk_event_ddata.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ); clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, rate, 1, -1); -- cgit v1.2.1 From 1b18fd2023acf7679a79d94ee85d488664ef0553 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:27 -0300 Subject: clocksource/drivers/lpc32xx: Support timer-based ARM delay This commit implements the ARM timer-based delay timer for the LPC32xx, LPC18xx, LPC43xx family of SoCs. Also, add a dependency to restrict compiling this driver for the ARM architecture. 
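Registering a timer-based delay source is a small fixed pattern: expose a read hook over a free-running upward counter and hand its rate to the delay core. A sketch with illustrative names (demo_base and DEMO_TIMER_COUNT are hypothetical):

static void __iomem *demo_base;

static unsigned long demo_delay_read(void)
{
        /* Must be a free-running counter incrementing at a fixed rate */
        return readl(demo_base + DEMO_TIMER_COUNT);
}

static struct delay_timer demo_delay_timer = {
        .read_current_timer = demo_delay_read,
};

/* ... at init time, once the clock rate is known: */
        demo_delay_timer.freq = rate;
        register_current_timer_delay(&demo_delay_timer);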
Signed-off-by: Ezequiel Garcia Signed-off-by: Daniel Lezcano Reviewed-by: Joachim Eastwood Tested-by: Joachim Eastwood --- drivers/clocksource/Kconfig | 1 + drivers/clocksource/time-lpc32xx.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 56777f04d2d9..c9c4db6c2797 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -153,6 +153,7 @@ config CLKSRC_EFM32 config CLKSRC_LPC32XX bool "Clocksource for LPC32XX" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on ARM select CLKSRC_MMIO select CLKSRC_OF help diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 5694eddade15..daae61e8c820 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/delay.h> #include #include #include @@ -54,6 +55,15 @@ static u64 notrace lpc32xx_read_sched_clock(void) return readl(clocksource_timer_counter); } +static unsigned long lpc32xx_delay_timer_read(void) +{ + return readl(clocksource_timer_counter); +} + +static struct delay_timer lpc32xx_delay_timer = { + .read_current_timer = lpc32xx_delay_timer_read, +}; + static int lpc32xx_clkevt_next_event(unsigned long delta, struct clock_event_device *evtdev) { @@ -192,6 +202,8 @@ static int __init lpc32xx_clocksource_init(struct device_node *np) } clocksource_timer_counter = base + LPC32XX_TIMER_TC; + lpc32xx_delay_timer.freq = rate; + register_current_timer_delay(&lpc32xx_delay_timer); sched_clock_register(lpc32xx_read_sched_clock, 32, rate); return 0; -- cgit v1.2.1 From bbaa06702719818913ed686612d0db477b2b53b0 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 19 Aug 2015 15:43:38 +0200 Subject: clocksource/drivers/arm_global_timer: Register delay timer Provide a delay timer using the lower 32-bits of the global timer so that we can use it instead of having to calibrate delays. Signed-off-by: Rabin Vincent Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_global_timer.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index d189d8cb69f7..36998fa18845 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/delay.h> #include #include #include @@ -221,6 +222,21 @@ static u64 notrace gt_sched_clock_read(void) } #endif +static unsigned long gt_read_long(void) +{ + return readl_relaxed(gt_base + GT_COUNTER0); +} + +static struct delay_timer gt_delay_timer = { + .read_current_timer = gt_read_long, +}; + +static void __init gt_delay_timer_init(void) +{ + gt_delay_timer.freq = gt_clk_rate; + register_current_timer_delay(&gt_delay_timer); +} + static void __init gt_clocksource_init(void) { writel(0, gt_base + GT_CONTROL); @@ -317,6 +333,7 @@ static void __init global_timer_of_register(struct device_node *np) /* Immediately configure the timer on the boot CPU */ gt_clocksource_init(); gt_clockevents_init(this_cpu_ptr(gt_evt)); + gt_delay_timer_init(); return; -- cgit v1.2.1 From cf8c5009ee37d25cf4da0a250f3044ec6d1144a0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:12 +0530 Subject: clockevents/drivers/arm_arch_timer: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'.
This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. This patch makes the clockevent device stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 7c567f0c87f5..f0dd9d42bc7b 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -274,10 +274,12 @@ static void __arch_timer_setup(unsigned type, if (arch_timer_use_virtual) { clk->irq = arch_timer_ppi[VIRT_PPI]; clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; clk->set_next_event = arch_timer_set_next_event_virt; } else { clk->irq = arch_timer_ppi[PHYS_SECURE_PPI]; clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; clk->set_next_event = arch_timer_set_next_event_phys; } } else { @@ -287,10 +289,12 @@ static void __arch_timer_setup(unsigned type, clk->cpumask = cpu_all_mask; if (arch_timer_mem_use_virtual) { clk->set_state_shutdown = arch_timer_shutdown_virt_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; clk->set_next_event = arch_timer_set_next_event_virt_mem; } else { clk->set_state_shutdown = arch_timer_shutdown_phys_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; clk->set_next_event = arch_timer_set_next_event_phys_mem; } -- cgit v1.2.1 From 3effa3ceea2da9f67f042a694e24ba0d8143cff8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:13 +0530 Subject: clockevents/drivers/arm_global_timer: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. This patch makes the clockevent device stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_global_timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 36998fa18845..9df0d1699d22 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -175,6 +175,7 @@ static int gt_clockevents_init(struct clock_event_device *clk) clk->set_state_shutdown = gt_clockevent_shutdown; clk->set_state_periodic = gt_clockevent_set_periodic; clk->set_state_oneshot = gt_clockevent_shutdown; + clk->set_state_oneshot_stopped = gt_clockevent_shutdown; clk->set_next_event = gt_clockevent_set_next_event; clk->cpumask = cpumask_of(cpu); clk->rating = 300; -- cgit v1.2.1 From 07f101d33def572c7018626735475f88dabecebf Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:14 +0530 Subject: clockevents/drivers/exynos_mct: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes.
This patch makes the clockevent device stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index ff44082a0827..be09bc0b5e26 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -313,6 +313,7 @@ static struct clock_event_device mct_comp_device = { .set_state_periodic = mct_set_state_periodic, .set_state_shutdown = mct_set_state_shutdown, .set_state_oneshot = mct_set_state_shutdown, + .set_state_oneshot_stopped = mct_set_state_shutdown, .tick_resume = mct_set_state_shutdown, }; @@ -452,6 +453,7 @@ static int exynos4_local_timer_setup(struct mct_clock_event_device *mevt) evt->set_state_periodic = set_state_periodic; evt->set_state_shutdown = set_state_shutdown; evt->set_state_oneshot = set_state_shutdown; + evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; evt->rating = 450; -- cgit v1.2.1 From 7aca0c07207385cca76025cc85231519935722b9 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 26 Feb 2016 19:14:13 -0800 Subject: clocksource: Introduce clocksource_freq2mult() The clocksource_khz2mult() and clocksource_hz2mult() share similar code which calculates a mult from the given frequency. The two implementations differ only in the value of the frequency. This patch introduces the clocksource_freq2mult() helper with a generic implementation of the mult calculation to prevent code duplication. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 6013021a3b39..a307bf62974f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -118,6 +118,23 @@ struct clocksource { /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) +static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) +{ + /* freq = cyc/from + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = from/freq * 2^shift + * mult = from * 2^shift / freq + * mult = (from<<shift_constant) / freq + */ + u64 tmp = ((u64)from) << shift_constant; + + tmp += freq/2; /* round for do_div */ + do_div(tmp, freq); + + return (u32)tmp; +} Date: Fri, 26 Feb 2016 19:14:14 -0800 Subject: jiffies: Use CLOCKSOURCE_MASK instead of constant The CLOCKSOURCE_MASK(32) macro expands to the same value, but makes code more readable.
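To make the conversion arithmetic above easy to check, here is a small self-contained userspace sketch of the two helpers these patches touch. The macro/function names and the main() driver are illustrative only, not kernel code:

#include <stdio.h>
#include <stdint.h>

/* mirrors CLOCKSOURCE_MASK() above */
#define MASK(bits) ((uint64_t)((bits) < 64 ? ((1ULL << (bits)) - 1) : -1))

/* mirrors the clocksource_freq2mult() derivation: mult = (from << shift) / freq */
static uint32_t freq2mult(uint32_t freq, uint32_t shift, uint64_t from)
{
	uint64_t tmp = from << shift;

	tmp += freq / 2;	/* round the division, as the do_div() users do */
	return (uint32_t)(tmp / freq);
}

int main(void)
{
	/* MASK(32) == 0xffffffff, the constant the jiffies patch replaces */
	printf("%#llx\n", (unsigned long long)MASK(32));
	/* a 24 MHz clocksource with shift 24 and from = NSEC_PER_SEC */
	printf("%u\n", freq2mult(24000000, 24, 1000000000ULL));
	return 0;
}

For a 24 MHz counter this prints a mult of roughly 699050667, i.e. (10^9 << 24) / (24 * 10^6), which is exactly the fixed-point cycles-to-nanoseconds factor the kernel stores.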
Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, -- cgit v1.2.1 From b009b096c473669311612a05b1dc2057e0ef256c Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 24 Feb 2016 16:24:54 +0100 Subject: dt-bindings: interrupt-controller: Add SoC-specific compatible string to Marvell ODMI As requested by Rob Herring, this commit adds a SoC-specific compatible string to the Marvell ODMI DT binding. Signed-off-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1456327494-31358-1-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper --- .../bindings/interrupt-controller/marvell,odmi-controller.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt index 252d5c9c31f9..8af0a8e613ab 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -6,7 +6,9 @@ which can be used by on-board peripheral for MSI interrupts. Required properties: -- compatible : The value here should contain "marvell,odmi-controller". +- compatible : The value here should contain: + + "marvell,ap806-odmi-controller", "marvell,odmi-controller". - interrupt,controller : Identifies the node as an interrupt controller. @@ -29,7 +31,8 @@ Required properties: Example: odmi: odmi@300000 { - compatible = "marvell,odmi-controller"; + compatible = "marvell,ap806-odm-controller", + "marvell,odmi-controller"; interrupt-controller; msi-controller; marvell,odmi-frames = <4>; -- cgit v1.2.1 From 869ae76147ffdf21ad24f0e599303cd58a2bb39f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 27 Feb 2016 23:11:28 +0100 Subject: uprobes: __create_xol_area() must nullify xol_mapping.fault As Jiri pointed out, this recent commit: f872f5400cc0 ("mm: Add a vm_special_mapping.fault() method") breaks uprobes: __create_xol_area() doesn't initialize the new ->fault() method and this obviously leads to a kernel crash when the application tries to execute the probed insn after bp hit. We probably want to add uprobes_special_mapping_fault(), which would allow us to turn xol_area->xol_mapping into a single instance of vm_special_mapping. But we need a simple fix, so let's change __create_xol_area() to nullify the new member as Jiri suggests.
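One possible shape of that future uprobes_special_mapping_fault(), given purely as a hypothetical sketch and not what this patch does, would hand back the single xol page directly:

static int uprobes_special_mapping_fault(const struct vm_special_mapping *sm,
					 struct vm_area_struct *vma,
					 struct vm_fault *vmf)
{
	struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;

	/* the xol area is backed by exactly one page */
	if (!area || vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;

	get_page(area->pages[0]);
	vmf->page = area->pages[0];
	return 0;
}

With such a handler installed, xol_mapping could become one shared const instance instead of a per-mm copy, which is the cleanup alluded to above.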
Suggested-by: Jiri Olsa Reported-by: Jiri Olsa Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160227221128.GA29565@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..5f6ce931f1ea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) -- cgit v1.2.1 From 090e77c391dd983c8945b8e2e16d09f378d2e334 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:23 +0000 Subject: cpu/hotplug: Restructure FROZEN state handling There are only a few callbacks which really care about FROZEN vs. !FROZEN. No need to have extra states for this. Publish the frozen state in an extra variable which is updated under the hotplug lock and let the interested users deal with it without imposing those extra state checks on everyone. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.334912357@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 69 ++++++++++++++++++++++------------------------------- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c38f9c4..f2fb54938ee6 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -118,6 +118,7 @@ enum { #ifdef CONFIG_SMP +extern bool cpuhp_tasks_frozen; /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) #define cpu_notifier(fn, pri) { \ @@ -177,6 +178,7 @@ extern void cpu_maps_update_done(void); #define cpu_notifier_register_done cpu_maps_update_done #else /* CONFIG_SMP */ +#define cpuhp_tasks_frozen 0 #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d39633ce9..41a6cb85c0af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -29,6 +29,8 @@ #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,27 +209,30 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? 
CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } #ifdef CONFIG_HOTPLUG_CPU -static void cpu_notify_nofail(unsigned long val, void *v) +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) { - BUG_ON(cpu_notify(val, v)); + BUG_ON(cpu_notify(val, cpu)); } EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -311,27 +316,21 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; - /* Take this CPU down. */ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + cpu_notify(CPU_DYING, cpu); /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } @@ -339,12 +338,6 @@ static int take_cpu_down(void *_param) static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; if (num_online_cpus() == 1) return -EBUSY; @@ -354,10 +347,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpu_hotplug_begin(); - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); if (err) { nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); pr_warn("%s: attempt to take down CPU %u failed\n", __func__, cpu); goto out_release; @@ -389,10 +384,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); goto out_release; } @@ -419,14 +414,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); out_release: cpu_hotplug_done(); if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_POST_DEAD, cpu); return err; } @@ -485,10 +480,8 @@ void smpboot_thread_init(void) /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0; struct task_struct *idle; + int ret, nr_calls = 0; cpu_hotplug_begin(); @@ -507,7 +500,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) if (ret) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); if (ret) { nr_calls--; pr_warn("%s: attempt to bring up CPU %u failed\n", @@ -523,11 +518,11 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE, cpu); out_notify: if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); out: cpu_hotplug_done(); @@ -719,13 +714,7 @@ core_initcall(cpu_hotplug_pm_sync_init); */ void notify_cpu_starting(unsigned int cpu) { - unsigned long val = CPU_STARTING; - -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + cpu_notify(CPU_STARTING, cpu); } #endif /* CONFIG_SMP */ -- cgit v1.2.1 From ba997462435f48ad1501320e9da8770fd40c59b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:24 +0000 Subject: cpu/hotplug: Restructure cpu_up code Split out into separate functions, so we can convert it to a state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.429389195@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 69 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 41a6cb85c0af..15a413639abc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,6 +228,43 @@ static int cpu_notify(unsigned long val, unsigned int cpu) return __cpu_notify(val, cpu, -1, NULL); } +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. 
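__cpu_up() hands the prepared idle thread to the architecture and does not return until the new CPU is marked online.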
*/ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + BUG_ON(!cpu_online(cpu)); + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU static void cpu_notify_nofail(unsigned long val, unsigned int cpu) @@ -481,7 +518,7 @@ void smpboot_thread_init(void) static int _cpu_up(unsigned int cpu, int tasks_frozen) { struct task_struct *idle; - int ret, nr_calls = 0; + int ret; cpu_hotplug_begin(); @@ -496,33 +533,21 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + cpuhp_tasks_frozen = tasks_frozen; + ret = smpboot_create_threads(cpu); if (ret) goto out; - cpuhp_tasks_frozen = tasks_frozen; - - ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; - } - - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + ret = notify_prepare(cpu); + if (ret) + goto out; - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE, cpu); + ret = bringup_cpu(cpu); + if (ret) + goto out; -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + notify_online(cpu); out: cpu_hotplug_done(); -- cgit v1.2.1 From 984581728eb4b2e10baed3d606f85a119795b207 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:25 +0000 Subject: cpu/hotplug: Split out cpu down functions Split cpu_down into separate functions in preparation for state machine conversion. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.511796562@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 83 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 15a413639abc..0b5d2596f3ec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -266,11 +266,6 @@ static int bringup_cpu(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU - -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -353,6 +348,25 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + /* Take this CPU down.
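Runs on the outgoing CPU in stop_machine() context with interrupts disabled.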
*/ static int take_cpu_down(void *_param) { @@ -371,29 +385,9 @@ static int take_cpu_down(void *_param) return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - cpuhp_tasks_frozen = tasks_frozen; - - err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -426,7 +420,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -449,11 +443,40 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD, cpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +{ + int err; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_online(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + err = notify_down_prepare(cpu); + if (err) + goto out_release; + err = takedown_cpu(cpu); + if (err) + goto out_release; + + notify_dead(cpu); out_release: cpu_hotplug_done(); -- cgit v1.2.1 From 5ba9ac8e2c45ab165e5b4a246f4821d319656e9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:27 +0000 Subject: cpu/hotplug: Add tracepoints We want to trace the hotplug machinery. Add tracepoints to track the invocation of callbacks and their result. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.593563875@linutronix.de Signed-off-by: Thomas Gleixner --- include/trace/events/cpuhp.h | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 include/trace/events/cpuhp.h diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h new file mode 100644 index 000000000000..a72bd93ec7e5 --- /dev/null +++ b/include/trace/events/cpuhp.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuhp + +#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUHP_H + +#include + +TRACE_EVENT(cpuhp_enter, + + TP_PROTO(unsigned int cpu, + int target, + int idx, + int (*fun)(unsigned int)), + + TP_ARGS(cpu, target, idx, fun), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, target ) + __field( int, idx ) + __field( void *, fun ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target = target; + __entry->idx = idx; + __entry->fun = fun; + ), + + TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + __entry->cpu, __entry->target, __entry->idx, __entry->fun) +); + +TRACE_EVENT(cpuhp_exit, + + TP_PROTO(unsigned int cpu, + int state, + int idx, + int ret), + + TP_ARGS(cpu, state, idx, ret), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, state ) + __field( int, idx ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->state = state; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk(" cpu: %04u state: %3d step: %3d ret: %d", + __entry->cpu, __entry->state, __entry->idx, __entry->ret) +); + +#endif + +/* This part must be outside protection */ +#include -- cgit v1.2.1 From cff7d378d3fdbb53db9b6e2578b14855f401cd41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:28 +0000 Subject: cpu/hotplug: Convert to a state machine for the control processor Move the split out steps into a callback array and let the cpu_up/down code iterate through the array functions. For now most of the callbacks are asymmetric to resemble the current hotplug maze. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.671816690@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 9 +- include/linux/cpuhotplug.h | 13 +++ init/main.c | 15 +--- kernel/cpu.c | 202 +++++++++++++++++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 45 deletions(-) create mode 100644 include/linux/cpuhotplug.h diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f2fb54938ee6..78989f20420f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,6 +16,7 @@ #include #include #include +#include struct device; struct device_node; @@ -27,6 +28,9 @@ struct cpu { struct device dev; }; +extern void boot_cpu_init(void); +extern void boot_cpu_state_init(void); + extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); @@ -267,11 +271,6 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ -enum cpuhp_state { - CPUHP_OFFLINE, - CPUHP_ONLINE, -}; - void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h new file mode 100644 index 000000000000..d55c9e64acd7 --- /dev/null +++ b/include/linux/cpuhotplug.h @@ -0,0 +1,13 @@ +#ifndef __CPUHOTPLUG_H +#define __CPUHOTPLUG_H + +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_CREATE_THREADS, + CPUHP_NOTIFY_PREPARE, + CPUHP_BRINGUP_CPU, + CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE, +}; + +#endif diff --git a/init/main.c b/init/main.c index 58c9e374704b..c2ea72362ee3 100644 --- a/init/main.c +++ b/init/main.c @@ -452,20 +452,6 @@ void __init parse_early_param(void) done = 1; } -/* - * Activate the first processor. 
- */ - -static void __init boot_cpu_init(void) -{ - int cpu = smp_processor_id(); - /* Mark the boot cpu "present", "online" etc for SMP and UP case */ - set_cpu_online(cpu, true); - set_cpu_active(cpu, true); - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); -} - void __init __weak smp_setup_processor_id(void) { } @@ -530,6 +516,7 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0b5d2596f3ec..301851974b8d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,10 +22,64 @@ #include #include #include + #include +#define CREATE_TRACE_POINTS +#include #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; +}; + +static struct cpuhp_step cpuhp_bp_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -454,10 +508,29 @@ static int notify_dead(unsigned int cpu) return 0; } +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; if (num_online_cpus() == 1) return -EBUSY; @@ -469,20 +542,25 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - err = notify_down_prepare(cpu); - if (err) - goto out_release; - err = takedown_cpu(cpu); - if (err) - goto out_release; + prev_state = st->state; + st->target = CPUHP_OFFLINE; + for (; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; - notify_dead(cpu); + ret = cpuhp_invoke_callback(cpu, st->state, 
step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; -out_release: cpu_hotplug_done(); - if (!err) + /* This post dead nonsense must die */ + if (!ret && hasdied) cpu_notify_nofail(CPU_POST_DEAD, cpu); - return err; + return ret; } int cpu_down(unsigned int cpu) @@ -537,11 +615,22 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int ret; + int prev_state, ret = 0; cpu_hotplug_begin(); @@ -550,6 +639,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); @@ -558,22 +648,22 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - ret = smpboot_create_threads(cpu); - if (ret) - goto out; - - ret = notify_prepare(cpu); - if (ret) - goto out; - - ret = bringup_cpu(cpu); - if (ret) - goto out; - - notify_online(cpu); + prev_state = st->state; + st->target = CPUHP_ONLINE; + while (st->state < st->target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_bp_states + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st); + break; + } + } out: cpu_hotplug_done(); - return ret; } @@ -767,6 +857,44 @@ void notify_cpu_starting(unsigned int cpu) #endif /* CONFIG_SMP */ +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + }, + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + }, + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = takedown_cpu, + .skip_onerr = true, + }, + [CPUHP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. */ +void __init boot_cpu_state_init(void) +{ + per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; +} -- cgit v1.2.1 From 4baa0afc6719cbf36a1e08551484a641926b3fd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:29 +0000 Subject: cpu/hotplug: Convert the hotplugged cpu work to a state machine Move the functions which need to run on the hotplugged processor into a state machine array and let the code iterate through these functions. In a later state, this will grow synchronization points between the control processor and the hotplugged processor, so we can move the various architecture implementations of the synchronizations to the core.
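Distilled to its core, the mechanism these conversions build can be modelled in a few lines of plain C. This is a toy userspace sketch, not kernel code: step an index through a table of callbacks and unwind the already completed steps on failure:

#include <stdio.h>

struct step {
	const char *name;
	int (*startup)(unsigned int cpu);
	void (*teardown)(unsigned int cpu);
};

static int prepare(unsigned int cpu) { printf("prepare cpu%u\n", cpu); return 0; }
static void unprepare(unsigned int cpu) { printf("unprepare cpu%u\n", cpu); }
static int bringup(unsigned int cpu) { printf("bringup cpu%u\n", cpu); return -1; /* simulated failure */ }

static struct step steps[] = {
	{ "prepare", prepare, unprepare },
	{ "bringup", bringup, NULL },
};

int main(void)
{
	unsigned int cpu = 1;
	int state;

	for (state = 0; state < 2; state++) {
		if (steps[state].startup(cpu)) {
			/* roll back the steps that already succeeded */
			while (--state >= 0)
				if (steps[state].teardown)
					steps[state].teardown(cpu);
			return 1;
		}
	}
	return 0;
}

The real implementation keeps the current state persistently per CPU and splits the table in two: the control processor walks one array while the hotplugged processor walks the other, which is exactly the split this patch introduces.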
Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.770651526@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 4 +++ kernel/cpu.c | 81 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d55c9e64acd7..d9303cca83d3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,10 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_OFFLINE, + CPUHP_AP_NOTIFY_STARTING, + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 301851974b8d..797723e81756 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -57,6 +57,7 @@ struct cpuhp_step { }; static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state @@ -304,6 +305,12 @@ static int notify_online(unsigned int cpu) return 0; } +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -421,9 +428,17 @@ static int notify_down_prepare(unsigned int cpu) return err; } +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} + /* Take this CPU down. */ static int take_cpu_down(void *_param) { + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ @@ -431,7 +446,12 @@ static int take_cpu_down(void *_param) if (err < 0) return err; - cpu_notify(CPU_DYING, cpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -512,6 +532,7 @@ static int notify_dead(unsigned int cpu) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL +#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -615,6 +636,28 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). 
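+ * With this change it steps through the cpuhp_ap_states[] callbacks up to + * CPUHP_AP_ONLINE.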
+ */ +void notify_cpu_starting(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -842,19 +885,6 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). - */ -void notify_cpu_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); -} - #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -879,8 +909,12 @@ static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, + .teardown = NULL, + }, + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, .teardown = takedown_cpu, - .skip_onerr = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", @@ -895,6 +929,23 @@ static struct cpuhp_step cpuhp_bp_states[] = { }, }; +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Fri, 26 Feb 2016 18:43:30 +0000 Subject: cpu/hotplug: Hand in target state to _cpu_up/down We want to be able to bringup/teardown the cpu to a particular state. Add a target argument to _cpu_up/down. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.862113133@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 797723e81756..a00f8f67e4ad 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -547,7 +547,8 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; @@ -564,7 +565,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_OFFLINE; + st->target = target; for (; st->state > st->target; st->state--) { struct cpuhp_step *step = cpuhp_bp_states + st->state; @@ -584,7 +585,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -595,12 +596,16 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ @@ -669,7 +674,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; @@ -692,7 +697,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_ONLINE; + st->target = target; while (st->state < st->target) { struct cpuhp_step *step; @@ -710,7 +715,7 @@ out: return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -734,12 +739,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -762,7 +771,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -812,7 +821,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); -- cgit v1.2.1 From 98f8cdce1db580b99fce823a48eea2cb2bdb261e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:31 +0000 Subject: cpu/hotplug: Add sysfs 
state interface Add a sysfs interface so we can actually see which state the cpus are in. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.942257522@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index a00f8f67e4ad..1979b8927b86 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -56,6 +56,7 @@ struct cpuhp_step { bool skip_onerr; }; +static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; @@ -955,6 +956,105 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL }; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL }; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = 0; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL }; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL }; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<<nr. Date: Fri, 26 Feb 2016 18:43:32 +0000 Subject: cpu/hotplug: Make target state writeable Make it possible to write a target state to the per
cpu state file, so we can switch between states. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.022814799@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++------ lib/Kconfig.debug | 13 ++++++++++ 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1979b8927b86..be9335da82f1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,12 +48,14 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @teardown: Teardown function of the step * @skip_onerr: Do not invoke the functions on error rollback * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { const char *name; int (*startup)(unsigned int cpu); int (*teardown)(unsigned int cpu); bool skip_onerr; + bool cant_stop; }; static DEFINE_MUTEX(cpuhp_state_mutex); @@ -558,7 +560,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (num_online_cpus() == 1) return -EBUSY; - if (!cpu_online(cpu)) + if (!cpu_present(cpu)) return -EINVAL; cpu_hotplug_begin(); @@ -683,16 +685,25 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - /* Let it fail before we try to bring the cpu up */ - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. 
+ */ + if (st->state >= target) goto out; + + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } cpuhp_tasks_frozen = tasks_frozen; @@ -909,27 +920,32 @@ static struct cpuhp_step cpuhp_bp_states[] = { .name = "threads:create", .startup = smpboot_create_threads, .teardown = NULL, + .cant_stop = true, }, [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, .teardown = notify_dead, .skip_onerr = true, + .cant_stop = true, }, [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, + .cant_stop = true, }, [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, .teardown = takedown_cpu, + .cant_stop = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -947,6 +963,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = notify_starting, .teardown = notify_dying, .skip_onerr = true, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -979,6 +996,46 @@ static ssize_t show_cpuhp_state(struct device *dev, } static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + static ssize_t show_cpuhp_target(struct device *dev, struct device_attribute *attr, char *buf) { @@ -986,7 +1043,7 @@ static ssize_t show_cpuhp_target(struct device *dev, return sprintf(buf, "%d\n", st->target); } -static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, @@ -1007,7 +1064,7 @@ static ssize_t show_cpuhp_states(struct device *dev, int i; mutex_lock(&cpuhp_state_mutex); - for (i = 0; i <= CPUHP_ONLINE; i++) { + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8bfd1aca7a3d..f28f7fad452f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1442,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config CPU_HOTPLUG_STATE_CONTROL + bool "Enable CPU hotplug state control" + depends on DEBUG_KERNEL + depends on HOTPLUG_CPU + default n + help + Allows to write steps between "offline" and "online" to the CPUs + sysfs target file so states can be stepped granular. This is a debug + option for now as the hotplug machinery cannot be stopped and + restarted at arbitrary points yet. + + Say N if your are unsure. 
+ config NOTIFIER_ERROR_INJECTION tristate "Notifier error injection" depends on DEBUG_KERNEL -- cgit v1.2.1 From 5b7aa87e0482be768486e0c2277aa4122487eb9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:33 +0000 Subject: cpu/hotplug: Implement setup/removal interface Implement functions which allow the setup and removal of hotplug state callbacks. The default behaviour for setup is to call the startup function for this state for (or on) all cpus which have a hotplug state >= the installed state. The default behaviour for removal is to call the teardown function for this state for (or on) all cpus which have a hotplug state >= the installed state. This includes rollback to the previous state in case of failure. A special state is CPUHP_ONLINE_DYN. It's for dynamically registering a hotplug callback pair. This is for drivers which have no dependencies, to avoid that we need to allocate CPUHP states for each of them. For both setup and remove, helper functions are provided which prevent the core from issuing the callbacks. This simplifies the conversion of existing hotplug notifiers. [ Dynamic registering implemented by Sebastian Siewior ] Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.103464877@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 67 ++++++++++++++ kernel/cpu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d9303cca83d3..29935261b26d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,7 +11,74 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE_DYN, + CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, CPUHP_ONLINE, }; +int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)); + +/** + * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function + * @teardown: teardown callback function + * + * Installs the callback functions and invokes the startup callback on + * the present cpus which have already reached the @state. + */ +static inline int cpuhp_setup_state(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, true, startup, teardown); +} + +/** + * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Same as @cpuhp_setup_state except that no calls are executed are invoked + * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
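+ * The callbacks are only installed; already-online CPUs are not brought to + * @state by this variant.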
+ */ +static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, false, startup, teardown); +} + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); + +/** + * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown + * @state: The state for which the calls are removed + * + * Removes the callback functions and invokes the teardown callback on + * the present cpus which have already reached the @state. + */ +static inline void cpuhp_remove_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, true); +} + +/** + * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking + * teardown + * @state: The state for which the calls are removed + */ +static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index be9335da82f1..b5eacb9587af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -973,6 +973,14 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + static bool cpuhp_is_ap_state(enum cpuhp_state state) { return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); @@ -986,6 +994,222 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) return sp + state; } +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* Helper function to run callback on the target cpu */ +static void cpuhp_on_cpu_cb(void *__cb) +{ + int (*cb)(unsigned int cpu) = __cb; + + BUG_ON(cb(smp_processor_id())); +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + + /* + * This invokes the callback directly for now. In a later step we + * convert that to use cpuhp_invoke_callback(). + */ + if (cpuhp_is_ap_state(state)) { + /* + * Note, that a function called on the AP is not + * allowed to fail. + */ + if (cpu_online(cpu)) + smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); + return 0; + } + + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ + ret = cb(cpu); + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! 
+ */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { + if (cpuhp_bp_states[i].name) + continue; + + cpuhp_bp_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + +/** + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function + * + * Returns 0 if successful, otherwise a proper error code + */ +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + int cpu, ret = 0; + int dyn_state = 0; + + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; +} +EXPORT_SYMBOL(__cpuhp_setup_state); + +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! 
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) -- cgit v1.2.1 From 949338e35131c551f7bf54f48a2e3a227af6721b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:35 +0000 Subject: cpu/hotplug: Move scheduler cpu_online notifier to hotplug core Move the scheduler cpu online notifier part to the hotplug core. This is anyway the highest priority callback and we need that functionality right now for the next changes. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.200791046@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + kernel/cpu.c | 18 ++++++++++++++++++ kernel/sched/core.c | 10 ---------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 29935261b26d..2f2e5d9711c4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,6 +10,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, + CPUHP_CPU_SET_ACTIVE, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/kernel/cpu.c b/kernel/cpu.c index b5eacb9587af..65e34d34ca93 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -666,6 +666,19 @@ void notify_cpu_starting(unsigned int cpu) } } +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread. + */ +static int cpuhp_set_cpu_active(unsigned int cpu) +{ + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread */ + stop_machine_unpark(cpu); + return 0; +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -941,6 +954,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, + [CPUHP_CPU_SET_ACTIVE] = { + .name = "cpu:active", + .startup = cpuhp_set_cpu_active, + .teardown = NULL, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..626646396ca0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5692,16 +5692,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. 
- */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; -- cgit v1.2.1 From 931ef163309ee955611f287dc65248b39a65fc9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:36 +0000 Subject: cpu/hotplug: Unpark smpboot threads from the state machine Handle the smpboot threads in the state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.295777684@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 7 +------ include/linux/cpuhotplug.h | 1 + init/main.c | 1 - kernel/cpu.c | 39 +++++---------------------------------- kernel/smpboot.c | 6 ++++-- kernel/smpboot.h | 4 ++-- 6 files changed, 13 insertions(+), 45 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 78989f20420f..83f35767016d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,7 +78,7 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - CPU_PRI_SMPBOOT = 9, + /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, CPU_PRI_WORKQUEUE_DOWN = -5, @@ -172,7 +172,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb) } #endif -void smpboot_thread_init(void); int cpu_up(unsigned int cpu); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); @@ -221,10 +220,6 @@ static inline void cpu_notifier_register_done(void) { } -static inline void smpboot_thread_init(void) -{ -} - #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2f2e5d9711c4..38679106fddd 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,6 +11,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, + CPUHP_SMPBOOT_THREADS, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/init/main.c b/init/main.c index c2ea72362ee3..55563fd36be3 100644 --- a/init/main.c +++ b/init/main.c @@ -388,7 +388,6 @@ static noinline void __init_refok rest_init(void) int pid; rcu_scheduler_starting(); - smpboot_thread_init(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if diff --git a/kernel/cpu.c b/kernel/cpu.c index 65e34d34ca93..3ec86bc414b7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -481,8 +481,6 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - smpboot_park_threads(cpu); - /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -612,38 +610,6 @@ int cpu_down(unsigned int cpu) EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. 
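For context, a client of the smpboot infrastructure whose threads are parked and unparked here registers itself roughly as follows (a minimal sketch with made-up names; the real struct also offers .create, .park, .unpark and .selfparking hooks):

	static DEFINE_PER_CPU(struct task_struct *, demo_task);

	static int demo_should_run(unsigned int cpu)
	{
		return 0;	/* real users test a per-cpu work flag here */
	}

	static void demo_thread_fn(unsigned int cpu)
	{
		/* process one batch of per-cpu work, then return */
	}

	static struct smp_hotplug_thread demo_threads = {
		.store			= &demo_task,
		.thread_should_run	= demo_should_run,
		.thread_fn		= demo_thread_fn,
		.thread_comm		= "demo/%u",
	};

A single smpboot_register_percpu_thread(&demo_threads) call spawns one such thread per cpu; the new CPUHP_SMPBOOT_THREADS state then parks and unparks all of them on cpu down/up.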
- */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) -{ - register_cpu_notifier(&smpboot_thread_notifier); -} - /** * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers * @cpu: cpu that just started @@ -959,6 +925,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, + [CPUHP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..6b5f02017be3 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,7 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); #endif -- cgit v1.2.1 From 2e1a3483ce74d197876e58281925dd4e378a7f28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:37 +0000 Subject: cpu/hotplug: Split out the state walk into functions We need that for running callbacks on the AP and the BP. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.374946234@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 111 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 43 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ec86bc414b7..9572ca043727 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,10 +329,74 @@ static int bringup_cpu(unsigned int cpu) return 0; } +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -537,15 +601,6 @@ static int notify_dead(unsigned int cpu) #endif #ifdef CONFIG_HOTPLUG_CPU -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); - } -} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -567,16 +622,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; - for (; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; cpu_hotplug_done(); @@ -645,22 +692,12 @@ static int cpuhp_set_cpu_active(unsigned int cpu) return 0; } -static void undo_cpu_up(unsigned int cpu, struct 
cpuhp_cpu_state *st) -{ - for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } -} - /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int prev_state, ret = 0; + int ret = 0; cpu_hotplug_begin(); @@ -687,20 +724,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; st->target = target; - while (st->state < st->target) { - struct cpuhp_step *step; - - st->state++; - step = cpuhp_bp_states + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); - if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); - break; - } - } + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); return ret; -- cgit v1.2.1 From 4cb28ced23c4f222ff4e3f39898017e52161a9c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:38 +0000 Subject: cpu/hotplug: Create hotplug threads In order to let the hotplugged cpu take care of the setup/teardown, we need a separate hotplug thread. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.454541272@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/smp.c | 1 + kernel/smpboot.h | 2 + 3 files changed, 147 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9572ca043727..9048c33689ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -33,10 +34,24 @@ * cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_state: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif }; static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); @@ -394,6 +409,134 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, return ret; } +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks.
Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. 
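Condensing the two paths above, the ordering protocol around should_run pairs like this (a sketch of the code in this hunk, not additional kernel code):

	/* issuer: cpuhp_invoke_ap_callback() / cpuhp_kick_ap_work() */
	st->cb_state = state;
	st->cb = cb;
	smp_mb();			/* publish cb/cb_state first ... */
	st->should_run = true;		/* ... then the "go" flag */
	wake_up_process(st->thread);
	wait_for_completion(&st->done);

	/* AP hotplug thread: cpuhp_thread_fun() */
	smp_mb();			/* pairs with the issuer's smp_mb() */
	if (!st->should_run)
		return;
	st->should_run = false;
	ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
	st->result = ret;
	complete(&st->done);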
Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -997,7 +1140,7 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); + return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..822ffb1ada3f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -569,6 +569,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6b5f02017be3..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -17,4 +17,6 @@ int smpboot_create_threads(unsigned int cpu); int smpboot_park_threads(unsigned int cpu); int smpboot_unpark_threads(unsigned int cpu); +void __init cpuhp_threads_init(void); + #endif -- cgit v1.2.1 From 1cf4f629d9d246519a1e76c021806f2a51ddba4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:39 +0000 Subject: cpu/hotplug: Move online calls to hotplugged cpu Let the hotplugged cpu invoke the setup/teardown callbacks (CPU_ONLINE/CPU_DOWN_PREPARE) itself. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.536364371@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 10 ++-- kernel/cpu.c | 144 ++++++++++++++++++++++++++++++--------------- 2 files changed, 102 insertions(+), 52 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 38679106fddd..8a715bb1e192 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,10 +11,12 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, - CPUHP_SMPBOOT_THREADS, - CPUHP_NOTIFY_ONLINE, - CPUHP_ONLINE_DYN, - CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, + CPUHP_KICK_AP_THREAD, + CPUHP_BP_ONLINE, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_NOTIFY_ONLINE, + CPUHP_AP_ONLINE_DYN, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 9048c33689ac..e220e565ea98 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -429,7 +429,7 @@ static int cpuhp_should_run(unsigned int cpu) /* Execute the teardown callbacks. 
Used to be CPU_DOWN_PREPARE */ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { - enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); } @@ -469,6 +469,9 @@ static void cpuhp_thread_fun(unsigned int cpu) ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); } } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + /* Regular hotplug work */ if (st->state < st->target) ret = cpuhp_ap_online(cpu, st); @@ -502,12 +505,8 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, } /* Regular hotplug invocation of the AP hotplug thread */ -static int cpuhp_kick_ap_work(unsigned int cpu) +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; - - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); st->result = 0; st->cb = NULL; /* @@ -517,6 +516,15 @@ static int cpuhp_kick_ap_work(unsigned int cpu) smp_mb(); st->should_run = true; wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); return st->result; @@ -688,6 +696,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); + /* Park the hotplug thread */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -765,10 +776,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) + goto out; + } + /* + * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * to do the further cleanups. + */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; - +out: cpu_hotplug_done(); /* This post dead nonsense must die */ if (!ret && hasdied) @@ -828,10 +863,13 @@ void notify_cpu_starting(unsigned int cpu) */ static int cpuhp_set_cpu_active(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread */ + /* Unpark the stopper thread and the hotplug thread */ stop_machine_unpark(cpu); + kthread_unpark(st->thread); return 0; } @@ -868,6 +906,26 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. 
+ */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } + + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * responsible for bringing it up to the target state. + */ + target = min((int)target, CPUHP_KICK_AP_THREAD); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1093,19 +1151,13 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, - [CPUHP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, - }, - [CPUHP_NOTIFY_ONLINE] = { - .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, - .cant_stop = true, + [CPUHP_KICK_AP_THREAD] = { + .name = "cpuhp:kickthread", + .startup = cpuhp_kick_ap_work, + .teardown = cpuhp_kick_ap_work, }, #endif - [CPUHP_ONLINE] = { + [CPUHP_BP_ONLINE] = { .name = "online", .startup = NULL, .teardown = NULL, @@ -1122,6 +1174,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { .skip_onerr = true, .cant_stop = true, }, + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, #endif [CPUHP_ONLINE] = { .name = "online", @@ -1140,7 +1202,9 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); + if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) + return true; + return state > CPUHP_BP_ONLINE; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) @@ -1172,14 +1236,6 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) return cpuhp_get_step(state)->teardown; } -/* Helper function to run callback on the target cpu */ -static void cpuhp_on_cpu_cb(void *__cb) -{ - int (*cb)(unsigned int cpu) = __cb; - - BUG_ON(cb(smp_processor_id())); -} - /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. @@ -1191,26 +1247,18 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, if (!cb) return 0; - - /* - * This invokes the callback directly for now. In a later step we - * convert that to use cpuhp_invoke_callback(). - */ - if (cpuhp_is_ap_state(state)) { - /* - * Note, that a function called on the AP is not - * allowed to fail. - */ - if (cpu_online(cpu)) - smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); - return 0; - } - /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. 
*/ - ret = cb(cpu); +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif BUG_ON(ret && !bringup); return ret; } @@ -1252,11 +1300,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) enum cpuhp_state i; mutex_lock(&cpuhp_state_mutex); - for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { - if (cpuhp_bp_states[i].name) + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) continue; - cpuhp_bp_states[i].name = "Reserved"; + cpuhp_ap_states[i].name = "Reserved"; mutex_unlock(&cpuhp_state_mutex); return i; } @@ -1289,7 +1337,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, get_online_cpus(); /* currently assignments for the ONLINE state are possible */ - if (state == CPUHP_ONLINE_DYN) { + if (state == CPUHP_AP_ONLINE_DYN) { dyn_state = 1; ret = cpuhp_reserve_state(state); if (ret < 0) -- cgit v1.2.1 From fc6d73d67436e7784758a831227bd019547a3f73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:40 +0000 Subject: arch/hotplug: Call into idle with a proper state Let the non boot cpus call into idle with the corresponding hotplug state, so the hotplug core can handle the further bringup. That's a first step to convert the boot side of the hotplugged cpus to do all the synchronization with the other side through the state machine. For now it'll only start the hotplug thread and kick the full bringup of the cpu. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.614102639@linutronix.de Signed-off-by: Thomas Gleixner --- arch/alpha/kernel/smp.c | 2 +- arch/arc/kernel/smp.c | 2 +- arch/arm/kernel/smp.c | 2 +- arch/arm64/kernel/smp.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/hexagon/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/m32r/kernel/smpboot.c | 2 +- arch/metag/kernel/smp.c | 2 +- arch/mips/kernel/smp.c | 2 +- arch/mn10300/kernel/smp.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/s390/kernel/smp.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sparc/kernel/smp_32.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/tile/kernel/smpboot.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/xen/smp.c | 2 +- arch/xtensa/kernel/smp.c | 2 +- include/linux/cpuhotplug.h | 1 + 22 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 2f24447fef92..46bf263c3153 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -168,7 +168,7 @@ smp_callin(void) cpuid, current, current->active_mm)); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Wait until hwrpb->txrdy is clear for cpu. Return -1 on timeout. 
*/ diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 424e937da5c8..4cb3add77c75 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -142,7 +142,7 @@ void start_kernel_secondary(void) local_irq_enable(); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 37312f6749f3..baee70267f29 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..460765799c64 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 0030e21cfceb..23c4ef5f8bdc 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -333,7 +333,7 @@ void secondary_start_kernel(void) /* We are done with local CPU inits, unblock the boot CPU. */ set_cpu_online(cpu, true); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_prepare_boot_cpu(void) diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index ff759f26b96a..983bae7d2665 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -180,7 +180,7 @@ void start_secondary(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 0e76fad27975..74fe317477e6 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -454,7 +454,7 @@ start_secondary (void *unused) preempt_disable(); smp_callin(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index a468467542f4..f98d2f6519d6 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -432,7 +432,7 @@ int __init start_secondary(void *unused) */ local_flush_tlb_all(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index c3c6f0864881..bad13232de51 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index bd4385a8e6e8..f2112a8ddf15 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -191,7 +191,7 @@ asmlinkage void start_secondary(void) WARN_ON_ONCE(!irqs_disabled()); mp_ops->smp_finish(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void stop_this_cpu(void *dummy) diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index f984193718b1..426173c4b0b9 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ 
-675,7 +675,7 @@ int __init start_secondary(void *unused) #ifdef CONFIG_GENERIC_CLOCKEVENTS init_clockevents(); #endif - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 52e85973a283..c2a9cc55a62f 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -305,7 +305,7 @@ void __init smp_callin(void) local_irq_enable(); /* Interrupts have been off until now */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* NOTREACHED */ panic("smp_callin() AAAAaaaaahhhh....\n"); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec2058d2d..cc13d4c83291 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -727,7 +727,7 @@ void start_secondary(void *unused) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); BUG(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 3c65a8eae34d..40a6b4f9c36c 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -798,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid) set_cpu_online(smp_processor_id(), true); inc_irq_stat(CPU_RST); local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Upping and downing of CPUs */ diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index de6be008fc01..13f633add29a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -203,7 +203,7 @@ asmlinkage void start_secondary(void) set_cpu_online(cpu, true); per_cpu(cpu_state, cpu) = CPU_ONLINE; - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } extern struct { diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index b3a5d81b20f0..fb30e7c6a5b1 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg) local_irq_enable(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* We should never reach here! 
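The conversion is mechanical and identical across architectures; a hypothetical new port would end its secondary-CPU startup path the same way:

	void secondary_start_kernel(void)
	{
		/* ... arch specific bringup, set_cpu_online(), irqs on ... */

		/*
		 * Enter the idle loop in CPUHP_AP_ONLINE_IDLE state so
		 * the hotplug core can take over and finish the bringup.
		 */
		cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
	}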
 */ BUG(); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 19cd08d18672..8a6151a628ce 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -134,7 +134,7 @@ void smp_callin(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void cpu_panic(void) diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 20d52a98e171..6c0abaacec33 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -208,7 +208,7 @@ void online_secondary(void) /* Set up tile-timer clock-event device on this cpu */ setup_tile_timer(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f77b3c1..293b22a7ab02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -248,7 +248,7 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0261f2..3c6d17fd423a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void xen_smp_intr_free(unsigned int cpu) diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 4d02e38514f5..fc4ad21a5ed4 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -157,7 +157,7 @@ void secondary_start_kernel(void) complete(&cpu_running); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void mx_cpu_start(void *p) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8a715bb1e192..4aa263adc536 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -13,6 +13,7 @@ enum cpuhp_state { CPUHP_CPU_SET_ACTIVE, CPUHP_KICK_AP_THREAD, CPUHP_BP_ONLINE, + CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.1 From 8df3e07e7f21f2ed8d001e6fabf9505946b438aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:41 +0000 Subject: cpu/hotplug: Let upcoming cpu bring itself fully up Let the upcoming cpu kick the hotplug thread and complete the bringup itself. That way the control side can just wait for the completion, or, once the hotplug machinery is made asynchronous, not care at all. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S.
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.697655464@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 9 ++++--- kernel/cpu.c | 66 ++++++++++++++++++++++++++-------------------- kernel/sched/idle.c | 2 ++ 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4aa263adc536..ad5d7fcb0130 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,9 +10,6 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, - CPUHP_CPU_SET_ACTIVE, - CPUHP_KICK_AP_THREAD, - CPUHP_BP_ONLINE, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, @@ -86,4 +83,10 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +#ifdef CONFIG_SMP +void cpuhp_online_idle(enum cpuhp_state state); +#else +static inline void cpuhp_online_idle(enum cpuhp_state state) { } +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index e220e565ea98..f1f880fac832 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,6 +329,14 @@ static int notify_starting(unsigned int cpu) return 0; } +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -340,8 +348,9 @@ static int bringup_cpu(unsigned int cpu) cpu_notify(CPU_UP_CANCELED, cpu); return ret; } + ret = bringup_wait_for_ap(cpu); BUG_ON(!cpu_online(cpu)); - return 0; + return ret; } /* @@ -470,7 +479,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } } else { /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); /* Regular hotplug work */ if (st->state < st->target) @@ -780,7 +789,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_TEARDOWN_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -793,11 +802,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ - if (st->state >= CPUHP_KICK_AP_THREAD) + if (st->state > CPUHP_TEARDOWN_CPU) goto out; } /* - * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); @@ -859,18 +868,32 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread. + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. 
*/ -static int cpuhp_set_cpu_active(unsigned int cpu) +void cpuhp_online_idle(enum cpuhp_state state) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread and the hotplug thread */ + /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); - return 0; + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ @@ -910,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -922,10 +945,10 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) /* * Try to reach the target state. We max out on the BP at - * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. */ - target = min((int)target, CPUHP_KICK_AP_THREAD); + target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1146,22 +1169,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, - [CPUHP_CPU_SET_ACTIVE] = { - .name = "cpu:active", - .startup = cpuhp_set_cpu_active, - .teardown = NULL, - }, - [CPUHP_KICK_AP_THREAD] = { - .name = "cpuhp:kickthread", - .startup = cpuhp_kick_ap_work, - .teardown = cpuhp_kick_ap_work, - }, #endif - [CPUHP_BP_ONLINE] = { - .name = "online", - .startup = NULL, - .teardown = NULL, - }, }; /* Application processor state steps */ @@ -1204,7 +1212,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) { if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) return true; - return state > CPUHP_BP_ONLINE; + return state > CPUHP_BRINGUP_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a7133cbd1..a4b9813afc96 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -291,5 +292,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } -- cgit v1.2.1 From e69aab13117efc1987620090e539b4ebeb33a04c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:43 +0000 Subject: cpu/hotplug: Make wait for dead cpu completion based Kill the busy spinning on the control side and just wait for the hotplugged cpu to tell that it reached the dead state. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.776157858@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 5 +++-- include/linux/cpuhotplug.h | 1 + kernel/cpu.c | 16 ++++++++++++---- kernel/sched/idle.c | 5 +---- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 83f35767016d..91a48d1b4ca0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -276,14 +276,15 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); -DECLARE_PER_CPU(bool, cpu_dead_idle); - int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); +void cpuhp_report_idle_dead(void); +#else +static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index ad5d7fcb0130..5d68e15e46b7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,7 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, diff --git a/kernel/cpu.c b/kernel/cpu.c index f1f880fac832..0e8c07f2566e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -688,6 +688,7 @@ static int take_cpu_down(void *_param) static int takedown_cpu(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* @@ -733,10 +734,8 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -756,6 +755,15 @@ static int notify_dead(unsigned int cpu) return 0; } +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + st->state = CPUHP_AP_IDLE_DEAD; + complete(&st->done); +} + #else #define notify_down_prepare NULL #define takedown_cpu NULL diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a4b9813afc96..8abbe89e9114 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -194,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -224,8 +222,7 @@ static void cpu_idle_loop(void) if (cpu_is_offline(smp_processor_id())) { rcu_cpu_notify(NULL, CPU_DYING_IDLE, (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v1.2.1 From 27d50c7eeb0f03c3d3ca72aac4d2dd487ca1f3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:44 +0000 Subject: rcu: Make CPU_DYING_IDLE an explicit call Make the RCU CPU_DYING_IDLE callback an explicit function call, so it gets invoked at the proper place. 
Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.870167933@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 4 +-- include/linux/notifier.h | 2 ++ include/linux/rcupdate.h | 4 +-- kernel/cpu.c | 1 + kernel/rcu/tree.c | 70 +++++++++++++++++++++++++----------------------- kernel/sched/idle.c | 2 -- 6 files changed, 42 insertions(+), 41 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 91a48d1b4ca0..f9b1fab4388a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -101,9 +101,7 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. */ -#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, +#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c362465..4149868de4e6 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,6 +47,8 @@ * runtime initialization. */ +struct notifier_block; + typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..fc46fe3ea259 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -332,9 +332,7 @@ void rcu_init(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); -struct notifier_block; -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +void rcu_report_dead(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e8c07f2566e..ff8059b76a85 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -762,6 +762,7 @@ void cpuhp_report_idle_dead(void) BUG_ON(st->state != CPUHP_AP_OFFLINE); st->state = CPUHP_AP_IDLE_DEAD; complete(&st->done); + rcu_report_dead(smp_processor_id()); } #else diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..85b41341272e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2606,28 +2606,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } -/* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. 
Do the remainder of the cleanup, @@ -4247,6 +4225,43 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4278,17 +4293,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8abbe89e9114..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,8 +220,6 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v1.2.1 From 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:19 -0800 Subject: time: Add cycles to nanoseconds translation The timekeeping code does not currently provide a way to translate externally provided clocksource cycles to system time. The cycle count is always provided by the result clocksource read() method internal to the timekeeping code. The added function timekeeping_cycles_to_ns() calculated a nanosecond value from a cycle count that can be added to tk_read_base.base value yielding the current system time. This allows clocksource cycle values external to the timekeeping code to provide a cycle count that can be transformed to system time. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..4243d28177ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** -- cgit v1.2.1 From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:20 -0800 Subject: time: Add timekeeping snapshot code capturing system time and counter In the current timekeeping code there isn't any interface to atomically capture the current relationship between the system counter and system time. ktime_get_snapshot() returns this triple (counter, monotonic raw, realtime) in the system_time_snapshot struct. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Moved structure definitions around to clean things up, fixed cycles_t/cycle_t confusion.] 
Signed-off-by: John Stultz --- include/linux/timekeeping.h | 18 ++++++++++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..7817591af46f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,6 +266,24 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); +/* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; +}; + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + /* * Persistent clock related interfaces */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4243d28177ac..89b4695bd083 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -874,6 +874,36 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); #ifdef CONFIG_NTP_PPS -- cgit v1.2.1 From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:21 -0800 Subject: time: Remove duplicated code in ktime_get_raw_and_real() The code in ktime_get_snapshot() is a superset of the code in ktime_get_raw_and_real(). Further, ktime_get_raw_and_real() is called only by the PPS code, pps_get_ts(). Consolidate the pps_get_ts() code into a single function calling ktime_get_snapshot() and eliminate ktime_get_raw_and_real(). A side effect of this is that the raw and real results of pps_get_ts() correspond to exactly the same clock cycle. Previously these values represented separate reads of the system clock.
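A minimal, hypothetical in-kernel caller of the consolidated API might look like this (example_take_snapshot() is invented for illustration):

	#include <linux/printk.h>
	#include <linux/timekeeping.h>

	/* Hypothetical caller: both times derive from one counter read. */
	static void example_take_snapshot(void)
	{
		struct system_time_snapshot snap;

		ktime_get_snapshot(&snap);
		pr_info("cycles=%llu real=%lld raw=%lld\n",
			(unsigned long long)snap.cycles,
			(long long)ktime_to_ns(snap.real),
			(long long)ktime_to_ns(snap.raw));
	}

Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S.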
Hall Signed-off-by: John Stultz --- include/linux/pps_kernel.h | 17 ++++++----------- kernel/time/timekeeping.c | 40 ++-------------------------------------- 2 files changed, 8 insertions(+), 49 deletions(-) diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf1484d41f..35ac903956c7 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } -#ifdef CONFIG_NTP_PPS - static inline void pps_get_ts(struct pps_event_time *ts) { - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); -} + struct system_time_snapshot snap; -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); +#ifdef CONFIG_NTP_PPS + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 89b4695bd083..af19a49d5223 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -888,6 +888,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) s64 nsec_real; cycle_t now; + WARN_ON_ONCE(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -905,44 +907,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); -#ifdef CONFIG_NTP_PPS - -/** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. - */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; - - WARN_ON_ONCE(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; - - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); - -#endif /* CONFIG_NTP_PPS */ - /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.1 From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:22 -0800 Subject: time: Add driver cross timestamp interface for higher precision time synchronization ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner . It has changed considerably and any mistakes are mine. The precision with which events on multiple networked systems can be synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited by the precision of the cross timestamps between the system clock and the device (timestamp) clock. Precision here is the degree of simultaneity when capturing the cross timestamp. 
Currently the PTP cross timestamp is captured in software using the PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are interleaved with reads of the realtime clock. At best, the precision of this cross timestamp is on the order of several microseconds due to software latencies. Sub-microsecond precision is required for industrial control and some media applications. To achieve this level of precision hardware supported cross timestamping is needed. The function get_device_system_crosstimestamp() allows device drivers to return a cross timestamp with system time properly scaled to nanoseconds. The realtime value is needed to discipline that clock using PTP and the monotonic raw value is used for applications that don't require a "real" time, but need an unadjusted clock time. The get_device_system_crosstimestamp() code calls back into the driver to ensure that the system counter is within the current timekeeping update interval. Modern Intel hardware provides an Always Running Timer (ART) which is exactly related to TSC through a known frequency ratio. The ART is routed to devices on the system and is used to precisely and simultaneously capture the device clock with the ART. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Reworked to remove extra structures and simplify calling] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 35 ++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7817591af46f..4a2ca65fc778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -279,6 +279,41 @@ struct system_time_snapshot { ktime_t raw; }; +/* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. 
Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp); + /* * Simultaneously snapshot realtime and monotonic raw clocks */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af19a49d5223..dba595cdb200 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,6 +907,62 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @sync_devicetime: Callback to get simultaneous device time and + * system counter from the device driver + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.1 From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:23 -0800 Subject: time: Add history to cross timestamp interface supporting slower devices Another representative use case of time sync and the correlated clocksource (in addition to PTP noted above) is PTP synchronized audio. In a streaming application, as an example, samples will be sent and/or received by multiple devices with a presentation time that is in terms of the PTP master clock. Synchronizing the audio output on these devices requires correlating the audio clock with the PTP master clock. The more precise this correlation is, the better the audio quality (i.e. out of sync audio sounds bad). From an application standpoint, to correlate the PTP master clock with the audio device clock, the system clock is used as a intermediate timebase. 
The transforms such an application would perform are: System Clock <-> Audio clock System Clock <-> Network Device Clock [<-> PTP Master Clock] Modern Intel platforms can perform a more accurate cross timestamp in hardware (ART, audio device clock). The audio driver requires ART->system time transforms -- the same as required for the network driver. These platforms offload audio processing (including cross-timestamps) to a DSP which, to ensure uninterrupted audio processing, communicates with and responds to the host only once every millisecond. As a result, it takes up to a millisecond for the DSP to receive a request; the request is then processed by the DSP, the audio output hardware is polled for completion, the result is copied into shared memory, and the host is notified. All of these operations occur on a millisecond cadence. This transaction requires about 2 ms, but under heavier workloads it may take up to 4 ms. Adding a history allows these slow devices the option of providing an ART value outside of the current interval. In this case, the callback provided is an accessor function for the previously obtained counter value. If get_device_system_crosststamp() receives a counter value previous to cycle_last, it consults the history provided as an argument in history_ref and interpolates the realtime and monotonic raw system time using the provided counter value. If there are any clock discontinuities, e.g. from calling settimeofday(), the monotonic raw time is interpolated in the usual way, but the realtime clock time is adjusted by scaling the monotonic raw adjustment. When an accessor function is used, a history argument *must* be provided. The history is initialized using ktime_get_snapshot(), which must be called before the counter values are read.
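A sketch of the intended calling pattern for such a slow device (hypothetical driver code; struct example_ctx and both example_* functions are invented for illustration):

	#include <linux/timekeeping.h>

	/* Hypothetical driver context holding a previously captured pair. */
	struct example_ctx {
		ktime_t device_time;
		struct system_counterval_t counterval;
	};

	static int example_get_time(ktime_t *device_time,
				    struct system_counterval_t *system_counterval,
				    void *ctx)
	{
		struct example_ctx *e = ctx;

		*device_time = e->device_time;
		*system_counterval = e->counterval;
		return 0;
	}

	static int example_crosststamp(struct example_ctx *e,
				       struct system_device_crosststamp *xt)
	{
		struct system_time_snapshot history;

		/* Snapshot first, then start the (slow) device transaction. */
		ktime_get_snapshot(&history);
		/* ... DSP round trip fills e->device_time / e->counterval ... */
		return get_device_system_crosststamp(example_get_time, e,
						     &history, xt);
	}

Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S.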
Hall [jstultz: Fixed up cycles_t/cycle_t type confusion] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 + include/linux/timekeeping.h | 5 ++ kernel/time/timekeeping.c | 171 +++++++++++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 1 deletion(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 25247220b4b7..e88005459035 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4a2ca65fc778..96f37bee3bc1 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { cycle_t cycles; ktime_t real; ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /* @@ -312,6 +316,7 @@ extern int get_device_system_crosststamp( struct system_counterval_t *system_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dba595cdb200..931b0b1a71e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -894,6 +895,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) seq = read_seqcount_begin(&tk_core.seq); now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; @@ -907,10 +910,123 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot 
representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp - * @sync_devicetime: Callback to get simultaneous device time and + * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and 
correlates it to system time @@ -920,13 +1036,18 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t *sys_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; unsigned long seq; + bool do_interp; int ret; do { @@ -946,6 +1067,22 @@ int get_device_system_crosststamp(int (*get_time_fn) */ if (tk->tkr_mono.clock != system_counterval.cs) return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); @@ -959,6 +1096,38 @@ int get_device_system_crosststamp(int (*get_time_fn) xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -- cgit v1.2.1 From 71f87b2fc64c2e9b6d53cb817f28711b959d3dfe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 10:52:10 +0100 Subject: cpu/hotplug: Plug death reporting race Paul noticed that the conversion of the death reporting introduced a race where the outgoing cpu might be delayed after waking the control processor, so it might not be able to call rcu_report_dead() before being physically removed, leading to RCU stalls. We can't call complete() after rcu_report_dead(), so instead of going back to busy polling, simply issue a function call to do the completion.
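The delegation pattern, in isolation (an illustrative sketch, not the patch itself; the example_* names are invented):

	#include <linux/completion.h>
	#include <linux/cpumask.h>
	#include <linux/smp.h>

	static void example_complete_remote(void *arg)
	{
		complete(arg);
	}

	/* Called on the dying CPU: hand the completion to an online CPU. */
	static void example_delegate_completion(struct completion *done)
	{
		/* wait == 0: the dying CPU must not block here */
		smp_call_function_single(cpumask_first(cpu_online_mask),
					 example_complete_remote, done, 0);
	}

Fixes: 27d50c7eeb0f "rcu: Make CPU_DYING_IDLE an explicit call" Reported-by: Paul E.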
McKenney Link: http://lkml.kernel.org/r/20160302201127.GA23440@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/cpu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index ff8059b76a85..93e9d89bb0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -755,14 +755,26 @@ static int notify_dead(unsigned int cpu) return 0; } +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - st->state = CPUHP_AP_IDLE_DEAD; - complete(&st->done); rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. + */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); } #else -- cgit v1.2.1 From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 11:11:12 +0100 Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support Revert commits: a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW 9006a01829a5: hrtimer: Catch illegal clockids 9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW Marc found out that there are fundamental issues with that patch series because __hrtimer_get_next_event() and hrtimer_forward() need support for CLOCK_MONOTONIC_RAW. None of this is easily fixed, so revert the whole lot. Reported-by: Marc Zyngier Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 18 ++---------------- virt/kvm/arm/arch_timer.c | 4 ++-- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a6d64af5e73f..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,7 +151,6 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cb0fe70f3c51..435b8850dd80 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,30 +90,19 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - { - .index = HRTIMER_BASE_MONOTONIC_RAW, - .clockid = CLOCK_MONOTONIC_RAW, - .get_time = &ktime_get_raw, - }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ...
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - int base = hrtimer_clock_to_base_table[clock_id]; - BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); - return base; + return hrtimer_clock_to_base_table[clock_id]; } /* @@ -1279,10 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) - basenow = ktime_get_raw(); - else - basenow = ktime_add(now, base->offset); + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 97c58153f923..69bca185c471 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit v1.2.1 From f9677e0f83080bb4186865868c359e72e1fac1ea Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 29 Feb 2016 06:33:47 -0800 Subject: x86/tsc: Always Running Timer (ART) correlated clocksource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On modern Intel systems TSC is derived from the new Always Running Timer (ART). ART can be captured simultaneously with the capture of audio and network device clocks, allowing a correlation between timebases to be constructed. Upon capture, the driver converts the captured ART value to the appropriate system clock using the correlated clocksource mechanism. On systems that support ART a new CPUID leaf (0x15) returns parameters “m” and “n” such that: TSC_value = (ART_value * m) / n + k [n >= 1] [k is an offset that can be adjusted by a privileged agent. The IA32_TSC_ADJUST MSR is an example of an interface to adjust k. See 17.14.4 of the Intel SDM for more details]
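In C, with the quotient/remainder split that avoids overflowing the intermediate ART_value * m product (an illustrative sketch only; the real conversion is convert_art_to_tsc() in the diff below):

	/* Illustrative: TSC = (ART * m) / n + k, split so art * m
	 * is never formed directly. */
	static unsigned long long example_art_to_tsc(unsigned long long art,
						     unsigned int m,
						     unsigned int n,
						     unsigned long long k)
	{
		unsigned long long q = art / n;
		unsigned long long r = art % n;

		return q * m + (r * m) / n + k;
	}

Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S.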
Hall [jstultz: Tweaked to fix build issue, also reworked math for 64bit division on 32bit systems, as well as !CONFIG_CPU_FREQ build fixes] Signed-off-by: John Stultz --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/tsc.h | 2 ++ arch/x86/kernel/tsc.c | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c9464297..ff557b49ab77 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -85,7 +85,7 @@ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ART (3*32+10) /* Platform has always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 6d7c5479bcea..174c4212780a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void) return rdtsc(); } +extern struct system_counterval_t convert_art_to_tsc(cycle_t art); + extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da828d3..80d761e420c5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} 
+EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP -- cgit v1.2.1 From 719f1aa4a67199a3c4c68a03f94e5ec44d9d5f82 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:25 -0800 Subject: ptp: Add PTP_SYS_OFFSET_PRECISE for driver crosstimestamping Currently, network/system cross-timestamping is performed in the PTP_SYS_OFFSET ioctl. The PTP clock driver reads gettimeofday() and the gettime64() callback provided by the driver. The cross-timestamp is best effort where the latency between the capture of system time (getnstimeofday()) and the device time (driver callback) may be significant. The getcrosststamp() callback and corresponding PTP_SYS_OFFSET_PRECISE ioctl allow the driver to perform this device/system correlation when, for example, cross-timestamp hardware is available. Modern Intel systems can do this for onboard Ethernet controllers using the ART counter. There is virtually zero latency between captures of the ART and network device clock. The capabilities ioctl (PTP_CLOCK_GETCAPS) is augmented, allowing applications to query whether or not drivers implement the getcrosststamp callback, providing more precise cross timestamping.
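A minimal, hypothetical userspace consumer of the new ioctl, assuming a clock exposed at /dev/ptp0:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/ptp_clock.h>

	int main(void)
	{
		struct ptp_sys_offset_precise off = { 0 };
		int fd = open("/dev/ptp0", O_RDONLY);

		if (fd < 0 || ioctl(fd, PTP_SYS_OFFSET_PRECISE, &off) < 0) {
			perror("PTP_SYS_OFFSET_PRECISE");
			return 1;
		}
		printf("device   %lld.%09u\n",
		       (long long)off.device.sec, off.device.nsec);
		printf("realtime %lld.%09u\n",
		       (long long)off.sys_realtime.sec, off.sys_realtime.nsec);
		return 0;
	}

Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Acked-by: Richard Cochran Signed-off-by: Christopher S.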
Hall [jstultz: Commit subject tweaks] Signed-off-by: John Stultz --- Documentation/ptp/testptp.c | 6 ++++-- drivers/ptp/ptp_chardev.c | 27 +++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 8 ++++++++ include/uapi/linux/ptp_clock.h | 13 ++++++++++++- 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c index 6c6247aaa7b9..d99012f41602 100644 --- a/Documentation/ptp/testptp.c +++ b/Documentation/ptp/testptp.c @@ -277,13 +277,15 @@ int main(int argc, char *argv[]) " %d external time stamp channels\n" " %d programmable periodic signals\n" " %d pulse per second\n" - " %d programmable pins\n", + " %d programmable pins\n" + " %d cross timestamping\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, - caps.n_pins); + caps.n_pins, + caps.cross_timestamping); } } diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index da7bae991552..579fd65299a0 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ptp_private.h" @@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) struct ptp_clock_caps caps; struct ptp_clock_request req; struct ptp_sys_offset *sysoff = NULL; + struct ptp_sys_offset_precise precise_offset; struct ptp_pin_desc pd; struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); struct ptp_clock_info *ops = ptp->info; struct ptp_clock_time *pct; struct timespec64 ts; + struct system_device_crosststamp xtstamp; int enable, err = 0; unsigned int i, pin_index; @@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.n_per_out = ptp->info->n_per_out; caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; + caps.cross_timestamping = ptp->info->getcrosststamp != NULL; if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; @@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = ops->enable(ops, &req, enable); break; + case PTP_SYS_OFFSET_PRECISE: + if (!ptp->info->getcrosststamp) { + err = -EOPNOTSUPP; + break; + } + err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + if (err) + break; + + ts = ktime_to_timespec64(xtstamp.device); + precise_offset.device.sec = ts.tv_sec; + precise_offset.device.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_realtime); + precise_offset.sys_realtime.sec = ts.tv_sec; + precise_offset.sys_realtime.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_monoraw); + precise_offset.sys_monoraw.sec = ts.tv_sec; + precise_offset.sys_monoraw.nsec = ts.tv_nsec; + if (copy_to_user((void __user *)arg, &precise_offset, + sizeof(precise_offset))) + err = -EFAULT; + break; + case PTP_SYS_OFFSET: sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL); if (!sysoff) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b8b73066d137..6b15e168148a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -38,6 +38,7 @@ struct ptp_clock_request { }; }; +struct system_device_crosststamp; /** * struct ptp_clock_info - decribes a PTP hardware clock * @@ -67,6 +68,11 @@ struct ptp_clock_request { * @gettime64: Reads the current time from the hardware clock. * parameter ts: Holds the result. * + * @getcrosststamp: Reads the current time from the hardware clock and + * system clock simultaneously. 
+ * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. * @@ -105,6 +111,8 @@ struct ptp_clock_info { int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcrosststamp)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f0b7bfe5da92..ac6dded80ffa 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -51,7 +51,9 @@ struct ptp_clock_caps { int n_per_out; /* Number of programmable periodic signals. */ int pps; /* Whether the clock supports a PPS callback. */ int n_pins; /* Number of input/output pins. */ - int rsv[14]; /* Reserved for future use. */ + /* Whether the clock supports precise system-device cross timestamps */ + int cross_timestamping; + int rsv[13]; /* Reserved for future use. */ }; struct ptp_extts_request { @@ -81,6 +83,13 @@ struct ptp_sys_offset { struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1]; }; +struct ptp_sys_offset_precise { + struct ptp_clock_time device; + struct ptp_clock_time sys_realtime; + struct ptp_clock_time sys_monoraw; + unsigned int rsv[4]; /* Reserved for future use. */ +}; + enum ptp_pin_function { PTP_PF_NONE, PTP_PF_EXTTS, @@ -124,6 +133,8 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset) #define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc) #define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc) +#define PTP_SYS_OFFSET_PRECISE \ + _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ -- cgit v1.2.1 From 01d7ada57ee9c735bd71fbe44ec0bcb70847afd4 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:26 -0800 Subject: e1000e: Adds hardware supported cross timestamp on e1000e nic Modern Intel systems support cross timestamping of the network device clock and Always Running Timer (ART) in hardware. This allows the device time and system time to be precisely correlated. The timestamp pair is returned through e1000e_phc_get_syncdevicetime() used by get_device_system_crosststamp(). The hardware cross-timestamp result is made available to applications through the PTP_SYS_OFFSET_PRECISE ioctl which calls e1000e_phc_getcrosststamp().
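Applications can first check whether a clock offers the precise cross timestamp; a hypothetical probe, assuming an already-open PTP clock fd:

	#include <sys/ioctl.h>
	#include <linux/ptp_clock.h>

	/* Returns nonzero when the driver implements getcrosststamp() */
	static int example_has_precise_crosststamp(int fd)
	{
		struct ptp_clock_caps caps;

		if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps) < 0)
			return 0;
		return caps.cross_timestamping;
	}

Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Jeff Kirsher Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Acked-by: Jeff Kirsher Signed-off-by: Christopher S.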
Hall [jstultz: Reworked to use new interface, commit message tweaks] Signed-off-by: John Stultz --- drivers/net/ethernet/intel/Kconfig | 9 +++ drivers/net/ethernet/intel/e1000e/defines.h | 5 ++ drivers/net/ethernet/intel/e1000e/ptp.c | 85 +++++++++++++++++++++++++++++ drivers/net/ethernet/intel/e1000e/regs.h | 4 ++ 4 files changed, 103 insertions(+) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index fa593dd3efe1..3772f3ac956e 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -83,6 +83,15 @@ config E1000E To compile this driver as a module, choose M here. The module will be called e1000e. +config E1000E_HWTS + bool "Support HW cross-timestamp on PCH devices" + default y + depends on E1000E && X86 + ---help--- + Say Y to enable hardware supported cross-timestamping on PCH + devices. The cross-timestamp is available through the PTP clock + driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). + config IGB tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" depends on PCI diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index f7c7804d79e5..0641c0098738 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -528,6 +528,11 @@ #define E1000_RXCW_C 0x20000000 /* Receive config */ #define E1000_RXCW_SYNCH 0x40000000 /* Receive config synch */ +/* HH Time Sync */ +#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */ +#define E1000_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */ +#define E1000_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ + #define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ #define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 25a0ad5102d6..e2ff3ef75d5d 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -26,6 +26,12 @@ #include "e1000.h" +#ifdef CONFIG_E1000E_HWTS +#include +#include +#include +#endif + /** * e1000e_phc_adjfreq - adjust the frequency of the hardware clock * @ptp: ptp clock structure @@ -98,6 +104,78 @@ static int e1000e_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) return 0; } +#ifdef CONFIG_E1000E_HWTS +#define MAX_HW_WAIT_COUNT (3) + +/** + * e1000e_phc_get_syncdevicetime - Callback given to timekeeping code reads system/device registers + * @device: current device time + * @system: system counter value read synchronously with device time + * @ctx: context provided by timekeeping code + * + * Read device and system (ART) clock simultaneously and return the corrected + * clock values in ns. 
+ **/ +static int e1000e_phc_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct e1000_adapter *adapter = (struct e1000_adapter *)ctx; + struct e1000_hw *hw = &adapter->hw; + unsigned long flags; + int i; + u32 tsync_ctrl; + cycle_t dev_cycles; + cycle_t sys_cycles; + + tsync_ctrl = er32(TSYNCTXCTL); + tsync_ctrl |= E1000_TSYNCTXCTL_START_SYNC | + E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK; + ew32(TSYNCTXCTL, tsync_ctrl); + for (i = 0; i < MAX_HW_WAIT_COUNT; ++i) { + udelay(1); + tsync_ctrl = er32(TSYNCTXCTL); + if (tsync_ctrl & E1000_TSYNCTXCTL_SYNC_COMP) + break; + } + + if (i == MAX_HW_WAIT_COUNT) + return -ETIMEDOUT; + + dev_cycles = er32(SYSSTMPH); + dev_cycles <<= 32; + dev_cycles |= er32(SYSSTMPL); + spin_lock_irqsave(&adapter->systim_lock, flags); + *device = ns_to_ktime(timecounter_cyc2time(&adapter->tc, dev_cycles)); + spin_unlock_irqrestore(&adapter->systim_lock, flags); + + sys_cycles = er32(PLTSTMPH); + sys_cycles <<= 32; + sys_cycles |= er32(PLTSTMPL); + *system = convert_art_to_tsc(sys_cycles); + + return 0; +} + +/** + * e1000e_phc_getsynctime - Reads the current system/device cross timestamp + * @ptp: ptp clock structure + * @cts: structure containing timestamp + * + * Read device and system (ART) clock simultaneously and return the scaled + * clock values in ns. + **/ +static int e1000e_phc_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter, + ptp_clock_info); + + return get_device_system_crosststamp(e1000e_phc_get_syncdevicetime, + adapter, NULL, xtstamp); +} +#endif/*CONFIG_E1000E_HWTS*/ + /** * e1000e_phc_gettime - Reads the current time from the hardware clock * @ptp: ptp clock structure @@ -236,6 +314,13 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) break; } +#ifdef CONFIG_E1000E_HWTS + /* CPU must have ART and GBe must be from Sunrise Point or greater */ + if (hw->mac.type >= e1000_pch_spt && boot_cpu_has(X86_FEATURE_ART)) + adapter->ptp_clock_info.getcrosststamp = + e1000e_phc_getcrosststamp; +#endif/*CONFIG_E1000E_HWTS*/ + INIT_DELAYED_WORK(&adapter->systim_overflow_work, e1000e_systim_overflow_work); diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h index 1d5e0b77062a..0cb4d365e5ad 100644 --- a/drivers/net/ethernet/intel/e1000e/regs.h +++ b/drivers/net/ethernet/intel/e1000e/regs.h @@ -245,6 +245,10 @@ #define E1000_SYSTIML 0x0B600 /* System time register Low - RO */ #define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ #define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ +#define E1000_SYSSTMPL 0x0B648 /* HH Timesync system stamp low register */ +#define E1000_SYSSTMPH 0x0B64C /* HH Timesync system stamp hi register */ +#define E1000_PLTSTMPL 0x0B640 /* HH Timesync platform stamp low register */ +#define E1000_PLTSTMPH 0x0B644 /* HH Timesync platform stamp hi register */ #define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */ #define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */ -- cgit v1.2.1 From cfa52c0cfa4d727aa3e457bf29aeff296c528a08 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 3 Mar 2016 02:03:11 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for hugepages Because Linux might use bigger pages than the 4K pages to handle those mmio ioremaps, the kmmio code shouldn't rely on the page id as it currently does.
Using the memory address instead of the page id lets us look up how big the page is and what its base address is, so that we won't get a page fault within the same page twice anymore. Tested-by: Pierre Moreau Signed-off-by: Karol Herbst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Cc: linux-x86_64@vger.kernel.org Cc: nouveau@lists.freedesktop.org Cc: pq@iki.fi Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1456966991-6861-1-git-send-email-nouveau@karolherbst.de Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 88 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. 
*/ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. 
*/ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; } f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; -- cgit v1.2.1 From 5d8eb84253333f8f63ec704276e3f0a8ec8f3189 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 12:24:10 +0100 Subject: cpu/hotplug: Remove redundant state check The check for the AP range in cpuhp_is_ap_state() is redundant after commit 8df3e07e7f21 "cpu/hotplug: Let upcoming cpu bring itself fully up" because all states above CPUHP_BRINGUP_CPU are invoked on the hotplugged cpu. Remove it. Reported-by: Richard Cochran Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 93e9d89bb0ab..373e831e0faa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1231,8 +1231,6 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) - return true; return state > CPUHP_BRINGUP_CPU; } -- cgit v1.2.1 From bc864af13f34d19c911f5691d87bdacc9ce109f5 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Mon, 7 Mar 2016 11:10:00 +0100 Subject: x86/microcode/intel: Change checksum variables to u32 Microcode checksum verification should be done using unsigned 32-bit values; otherwise the calculation overflow results in undefined behaviour. This is also nicely documented in the SDM, section "Microcode Update Checksum": "To check for a corrupt microcode update, software must perform a unsigned DWORD (32-bit) checksum of the microcode update. 
Even though some fields are signed, the checksum procedure treats all DWORDs as unsigned. Microcode updates with a header version equal to 00000001H must sum all DWORDs that comprise the microcode update. A valid checksum check will yield a value of 00000000H." but for some reason the code has been using ints from the very beginning. In practice, this bug possibly manifested itself only when doing the microcode data checksum - apparently, currently shipped Intel microcode doesn't have an extended signature table for which we do checksum verification too. UBSAN: Undefined behaviour in arch/x86/kernel/cpu/microcode/intel_lib.c:105:12 signed integer overflow: -1500151068 + -2125470173 cannot be represented in type 'int' CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc5+ #495 ... Call Trace: dump_stack ? inotify_ioctl ubsan_epilogue handle_overflow __ubsan_handle_add_overflow microcode_sanity_check get_matching_model_microcode.isra.2.constprop.8 ? early_idt_handler_common ? strlcpy ? find_cpio_data load_ucode_intel_bsp load_ucode_bsp ? load_ucode_bsp x86_64_start_kernel [ Expand and massage commit message. ] Signed-off-by: Chris Bainbridge Signed-off-by: Borislav Petkov Cc: hmh@hmh.eng.br Link: http://lkml.kernel.org/r/1456834359-5132-1-git-send-email-chris.bainbridge@gmail.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896bcbdaf..99ca2c935777 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -85,8 +85,8 @@ int microcode_sanity_check(void *mc, int print_err) /* check extended table checksum */ if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + u32 ext_table_sum = 0; + u32 *ext_tablep = (u32 *)ext_header; i = ext_table_size / DWSIZE; while (i--) @@ -102,7 +102,7 @@ int microcode_sanity_check(void *mc, int print_err) orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / DWSIZE; while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; if (orig_sum) { if (print_err) pr_err("aborting, bad checksum\n"); -- cgit v1.2.1 From c0414622177ae4739a49ca7dad4306a681e2878b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:01 +0100 Subject: x86/microcode/intel: Get rid of DWSIZE sizeof(u32) is perfectly clear as it is. 
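[ Illustration: this patch and the previous one touch the same checksum loop. Its core reduces to the following userspace-style sketch (hypothetical helper, not the kernel function itself); with a 32-bit unsigned type the additions wrap modulo 2^32 by definition, whereas with plain int the overflow is undefined behaviour: ]

#include <stdint.h>
#include <stddef.h>

/* A microcode image is valid iff the u32 sum of all its DWORDs is 0. */
static int ucode_checksum_ok(const void *mc, size_t bytes)
{
	const uint32_t *p = mc;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < bytes / sizeof(uint32_t); i++)
		sum += p[i];	/* wraps mod 2^32: well defined for uint32_t */

	return sum == 0;
}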
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-3-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/microcode_intel.h | 1 - arch/x86/kernel/cpu/microcode/intel_lib.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8559b0102ea1..603417f8dd6c 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -40,7 +40,6 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? \ diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 99ca2c935777..2b2d1354dc70 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -88,7 +88,7 @@ int microcode_sanity_check(void *mc, int print_err) u32 ext_table_sum = 0; u32 *ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { @@ -100,7 +100,7 @@ int microcode_sanity_check(void *mc, int print_err) /* calculate the checksum */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; if (orig_sum) { -- cgit v1.2.1 From 7d0161569a3b66aaa01520002c3e5fd7126d071f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:02 +0100 Subject: x86/microcode/intel: Merge two consecutive if-statements Merge the two consecutive "if (ext_table_size)". No functional change. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-4-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2b2d1354dc70..ffb1bbf45db0 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -68,6 +68,9 @@ int microcode_sanity_check(void *mc, int print_err) } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) @@ -81,12 +84,9 @@ int microcode_sanity_check(void *mc, int print_err) return -EFAULT; } ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep = (u32 *)ext_header; + /* check extended table checksum */ + ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) -- cgit v1.2.1 From 5b46b5e003724547f0c83041cada15f9f496590d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:03 +0100 Subject: x86/microcode/intel: Improve microcode sanity-checking error messages Turn them into proper sentences. Add comments to microcode_sanity_check() to explain what it does. 
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-5-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 36 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ffb1bbf45db0..23b1d92342e3 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -57,15 +57,16 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { u32 ext_table_sum = 0; @@ -74,43 +75,58 @@ int microcode_sanity_check(void *mc, int print_err) if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - /* check extended table checksum */ + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; @@ -119,7 +135,7 @@ int microcode_sanity_check(void *mc, int print_err) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } -- cgit v1.2.1 From 4ace2e7a48ab426eaa9745ace4c50c6a7adb3992 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:04 +0100 Subject: x86/microcode/intel: Drop orig_sum from ext signature checksum It is 0 because for !0 values we would have exited already. 
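[ Illustration of the algebra (editor's sketch, hypothetical names): the full-image check above guarantees orig_sum == 0, so the extended-signature test can drop it. A compilable demonstration that the old and new tests agree: ]

#include <stdint.h>
#include <assert.h>

static void ext_sig_check_demo(uint32_t orig_sum,
			       uint32_t hdr_sig, uint32_t hdr_pf, uint32_t hdr_cksum,
			       uint32_t ext_sig, uint32_t ext_pf, uint32_t ext_cksum)
{
	/* We only get here once the full-image checksum has passed: */
	assert(orig_sum == 0);

	/* Old test: substitute the ext fields for the header fields. */
	uint32_t old_test = orig_sum - (hdr_sig + hdr_pf + hdr_cksum)
				     + (ext_sig + ext_pf + ext_cksum);

	/* New test from this patch: */
	uint32_t new_test = (hdr_sig + hdr_pf + hdr_cksum)
			  - (ext_sig + ext_pf + ext_cksum);

	/* With orig_sum == 0, old_test == -new_test (mod 2^32):
	 * they are zero in exactly the same cases. */
	assert((old_test == 0) == (new_test == 0));
}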
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-6-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/intel_lib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 23b1d92342e3..2ce1a7dc45b7 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -130,9 +130,9 @@ int microcode_sanity_check(void *mc, int print_err) for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) pr_err("Bad extended signature checksum, aborting.\n"); -- cgit v1.2.1 From 6436257b491cc0d456c39330dfc22126148d5ed7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 8 Mar 2016 11:09:53 +0100 Subject: time/timekeeping: Work around false positive GCC warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer GCC versions trigger the following warning: kernel/time/timekeeping.c: In function ‘get_device_system_crosststamp’: kernel/time/timekeeping.c:987:5: warning: ‘clock_was_set_seq’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (discontinuity) { ^ kernel/time/timekeeping.c:1045:15: note: ‘clock_was_set_seq’ was declared here unsigned int clock_was_set_seq; ^ GCC clearly is unable to recognize that the 'do_interp' boolean tracks the initialization status of 'clock_was_set_seq'. The GCC version used was: gcc version 5.3.1 20151207 (Red Hat 5.3.1-2) (GCC) Work it around by initializing clock_was_set_seq to 0. Compilers that are able to recognize the code flow will eliminate the unnecessary initialization. 
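[ For reference, a minimal reduction of the pattern GCC fails to track (editor's sketch, hypothetical names; 'seq' mirrors clock_was_set_seq, 'do_interp' guards both sites): ]

int demo(int need_interp)
{
	unsigned int seq;	/* flagged "may be used uninitialized" */
	int do_interp = 0;

	if (need_interp) {
		seq = 42;
		do_interp = 1;
	}

	if (do_interp)		/* only reached when seq was set above */
		return seq;

	return 0;
}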
Acked-by: Thomas Gleixner Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 931b0b1a71e9..9c629bbed572 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1042,7 +1042,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; cycle_t cycles, now, interval_start; - unsigned int clock_was_set_seq; + unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; u8 cs_was_changed_seq; -- cgit v1.2.1 From 1a8aa8acab4f3949e05ceb51e36f627b1651814c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:15 +0100 Subject: x86/apic: Deinline __default_send_IPI_*, save ~200 bytes __default_send_IPI_shortcut: 49 bytes, 2 callsites __default_send_IPI_dest_field: 108 bytes, 7 callsites text data bss dec hex filename 96184086 20860488 36122624 153167198 921255e vmlinux_before 96183823 20860520 36122624 153166967 9212477 vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ipi.h | 58 ++------------------------------------------ arch/x86/kernel/apic/ipi.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index cfc9a0d2d07c..a4fe16e42b7b 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void -__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) -{ - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - __xapic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void - __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - __xapic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. 
- */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9b6124..28bde88b0085 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include #include +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; -- cgit v1.2.1 From fe2f95468e4bdf4a526be4f86efaefe48ca63b83 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:16 +0100 Subject: x86/apic: Deinline _flat_send_IPI_mask, save ~150 bytes _flat_send_IPI_mask: 157 bytes, 3 callsites text data bss dec hex filename 96183823 20860520 36122624 153166967 9212477 vmlinux1_before 96183699 20860520 36122624 153166843 92123fb vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30cca3e..76f89e2b245a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; -- cgit v1.2.1 From 58a5aac5331388a175a42b6ed2154f0559cefb21 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:19 -0800 Subject: x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled x86_64 has very clean espfix handling on paravirt: espfix64 is set up in native_iret, so paravirt systems that override iret bypass espfix64 automatically. This is robust and straightforward. 
x86_32 is messier. espfix is set up before the IRET paravirt patch point, so it can't be directly conditionalized on whether we use native_iret. We also can't easily move it into native_iret without regressing performance due to a bizarre consideration. Specifically, on 64-bit kernels, the logic is: if (regs->ss & 0x4) setup_espfix; On 32-bit kernels, the logic is: if ((regs->ss & 0x4) && (regs->cs & 0x3) == 3 && (regs->flags & X86_EFLAGS_VM) == 0) setup_espfix; The performance of setup_espfix itself is essentially irrelevant, but the comparison happens on every IRET so its performance matters. On x86_64, there's no need for any registers except flags to implement the comparison, so we fold the whole thing into native_iret. On x86_32, we don't do that because we need a free register to implement the comparison efficiently. We therefore do espfix setup before restoring registers on x86_32. This patch gets rid of the explicit paravirt_enabled check by introducing X86_BUG_ESPFIX on 32-bit systems and using an ALTERNATIVE to skip espfix on paravirt systems where iret != native_iret. This is also messy, but it's at least in line with other things we do. This improves espfix performance by removing a branch, but no one cares. More importantly, it removes a paravirt_enabled user, which is good because paravirt_enabled is ill-defined and is going away. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 15 ++------------- arch/x86/include/asm/cpufeatures.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 25 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 66350e6a6ca5..e13027247c90 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -361,6 +361,8 @@ restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 + ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* * Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -387,19 +389,6 @@ ENTRY(iret_exc ) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - /* * Setup and switch to ESPFIX stack * diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6663fae71b12..d11a3aaafd96 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,4 +286,12 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. 
+ */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 079d83fc6488..d8337f34b5f4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -802,6 +802,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_NOPL); #else set_cpu_cap(c, X86_FEATURE_NOPL); +#endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif #endif } -- cgit v1.2.1 From 0dd0036f6e07f741a1356b424b84a3164b6e59cf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:20 -0800 Subject: x86/asm-offsets: Remove PARAVIRT_enabled It no longer has any users. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524b202c..5c042466f274 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); -- cgit v1.2.1 From db57d7460ea74de2204ddc303520753f256ea67d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:14 +0100 Subject: irqchip/gic-v3: Refactor gic_of_init() for GICv3 driver Isolate the hardware abstraction (FDT) code in gic_of_init(). The rest of the logic goes to gic_init_bases(), which expects well-defined data to initialize the GIC properly. The same approach is used in the GICv2 driver. This is needed for ACPI initialization later. 
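[ Schematic of the split (editor's sketch; the parsing bodies are elided, and the local variables stand in for the values the elided code would discover): ]

/* Firmware-agnostic core: takes only well-defined, already-mapped data. */
static int __init gic_init_bases(void __iomem *dist_base,
				 struct redist_region *rdist_regs,
				 u32 nr_redist_regions,
				 u64 redist_stride,
				 struct fwnode_handle *handle);

/* FDT glue: map and validate resources, then hand off to the core. */
static int __init gic_of_init(struct device_node *node,
			      struct device_node *parent)
{
	void __iomem *dist_base;
	struct redist_region *rdist_regs;
	u32 nr_redist_regions;
	u64 redist_stride;

	/* ... fill the above via of_iomap()/of_property_read_*() ... */

	return gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
			      redist_stride, &node->fwnode);
}

/* An ACPI path added later can ioremap() from MADT entries and reuse */
/* exactly the same core, passing its own fwnode/domain handle.       */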
Acked-by: Marc Zyngier Signed-off-by: Tomasz Nowicki Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 130 ++++++++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 52 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index d7be6ddc34f6..31205c7d03e3 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -811,59 +811,16 @@ static void gicv3_enable_quirks(void) #endif } -static int __init gic_of_init(struct device_node *node, struct device_node *parent) +static int __init gic_init_bases(void __iomem *dist_base, + struct redist_region *rdist_regs, + u32 nr_redist_regions, + u64 redist_stride, + struct fwnode_handle *handle) { - void __iomem *dist_base; - struct redist_region *rdist_regs; - u64 redist_stride; - u32 nr_redist_regions; + struct device_node *node; u32 typer; - u32 reg; int gic_irqs; int err; - int i; - - dist_base = of_iomap(node, 0); - if (!dist_base) { - pr_err("%s: unable to map gic dist registers\n", - node->full_name); - return -ENXIO; - } - - reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; - if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) { - pr_err("%s: no distributor detected, giving up\n", - node->full_name); - err = -ENODEV; - goto out_unmap_dist; - } - - if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) - nr_redist_regions = 1; - - rdist_regs = kzalloc(sizeof(*rdist_regs) * nr_redist_regions, GFP_KERNEL); - if (!rdist_regs) { - err = -ENOMEM; - goto out_unmap_dist; - } - - for (i = 0; i < nr_redist_regions; i++) { - struct resource res; - int ret; - - ret = of_address_to_resource(node, 1 + i, &res); - rdist_regs[i].redist_base = of_iomap(node, 1 + i); - if (ret || !rdist_regs[i].redist_base) { - pr_err("%s: couldn't map region %d\n", - node->full_name, i); - err = -ENODEV; - goto out_unmap_rdist; - } - rdist_regs[i].phys_base = res.start; - } - - if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) - redist_stride = 0; if (!is_hyp_mode_available()) static_key_slow_dec(&supports_deactivate); @@ -889,8 +846,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare gic_irqs = 1020; gic_data.irq_nr = gic_irqs; - gic_data.domain = irq_domain_add_tree(node, &gic_irq_domain_ops, - &gic_data); + gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, + &gic_data); gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { @@ -900,7 +857,9 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare set_handle_irq(gic_handle_irq); - if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) + node = to_of_node(handle); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() && + node) /* Temp hack to prevent ITS init for ACPI */ its_init(node, &gic_data.rdists, gic_data.domain); gic_smp_init(); @@ -914,6 +873,73 @@ out_free: if (gic_data.domain) irq_domain_remove(gic_data.domain); free_percpu(gic_data.rdists.rdist); + return err; +} + +static int __init gic_validate_dist_version(void __iomem *dist_base) +{ + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + + if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) + return -ENODEV; + + return 0; +} + +static int __init gic_of_init(struct device_node *node, struct device_node *parent) +{ + void __iomem *dist_base; + struct redist_region 
*rdist_regs; + u64 redist_stride; + u32 nr_redist_regions; + int err, i; + + dist_base = of_iomap(node, 0); + if (!dist_base) { + pr_err("%s: unable to map gic dist registers\n", + node->full_name); + return -ENXIO; + } + + err = gic_validate_dist_version(dist_base); + if (err) { + pr_err("%s: no distributor detected, giving up\n", + node->full_name); + goto out_unmap_dist; + } + + if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) + nr_redist_regions = 1; + + rdist_regs = kzalloc(sizeof(*rdist_regs) * nr_redist_regions, GFP_KERNEL); + if (!rdist_regs) { + err = -ENOMEM; + goto out_unmap_dist; + } + + for (i = 0; i < nr_redist_regions; i++) { + struct resource res; + int ret; + + ret = of_address_to_resource(node, 1 + i, &res); + rdist_regs[i].redist_base = of_iomap(node, 1 + i); + if (ret || !rdist_regs[i].redist_base) { + pr_err("%s: couldn't map region %d\n", + node->full_name, i); + err = -ENODEV; + goto out_unmap_rdist; + } + rdist_regs[i].phys_base = res.start; + } + + if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) + redist_stride = 0; + + err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, + redist_stride, &node->fwnode); + if (!err) + return 0; + out_unmap_rdist: + for (i = 0; i < nr_redist_regions; i++) + if (rdist_regs[i].redist_base) -- cgit v1.2.1 From ffa7d6166a9611ed9ad117b56f7f513901b31408 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:15 +0100 Subject: irqchip/gic-v3: Add ACPI support for GICv3/4 initialization With the refactoring of gic_of_init(), GICv3/4 can be initialized by gic_init_bases() with the GIC distributor base address and GIC redistributor region(s). So get the redistributor region base addresses from the MADT GIC redistributor subtable, and the distributor base address from the GICD subtable, to init the GICv3 irqchip in the ACPI way. Note: the GIC redistributor base address may also be provided in GICC structures on systems supporting GICv3 and above if the GIC Redistributors are not in the always-on power domain; this patch doesn't implement such a feature yet. Acked-by: Marc Zyngier Signed-off-by: Tomasz Nowicki Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 137 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 31205c7d03e3..2cada42022ee 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -15,10 +15,12 @@ * along with this program. If not, see . 
*/ +#include #include #include #include #include +#include #include #include #include @@ -764,6 +766,15 @@ static int gic_irq_domain_translate(struct irq_domain *d, return 0; } + if (is_fwnode_irqchip(fwspec->fwnode)) { + if(fwspec->param_count != 2) + return -EINVAL; + + *hwirq = fwspec->param[0]; + *type = fwspec->param[1]; + return 0; + } + return -EINVAL; } @@ -951,3 +962,129 @@ out_unmap_dist: } IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); + +#ifdef CONFIG_ACPI +static struct redist_region *redist_regs __initdata; +static u32 nr_redist_regions __initdata; + +static int __init +gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_redistributor *redist = + (struct acpi_madt_generic_redistributor *)header; + void __iomem *redist_base; + static int count = 0; + + redist_base = ioremap(redist->base_address, redist->length); + if (!redist_base) { + pr_err("Couldn't map GICR region @%llx\n", redist->base_address); + return -ENOMEM; + } + + redist_regs[count].phys_base = redist->base_address; + redist_regs[count].redist_base = redist_base; + count++; + return 0; +} + +static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, + const unsigned long end) +{ + /* Subtable presence means that redist exists, that's it */ + return 0; +} + +static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, + struct acpi_probe_entry *ape) +{ + struct acpi_madt_generic_distributor *dist; + int count; + + dist = (struct acpi_madt_generic_distributor *)header; + if (dist->version != ape->driver_data) + return false; + + /* We need to do that exercise anyway, the sooner the better */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_match_gicr, 0); + if (count <= 0) + return false; + + nr_redist_regions = count; + return true; +} + +#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) + +static int __init +gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_generic_distributor *dist; + struct fwnode_handle *domain_handle; + void __iomem *dist_base; + int i, err, count; + + /* Get distributor base address */ + dist = (struct acpi_madt_generic_distributor *)header; + dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); + if (!dist_base) { + pr_err("Unable to map GICD registers\n"); + return -ENOMEM; + } + + err = gic_validate_dist_version(dist_base); + if (err) { + pr_err("No distributor detected at @%p, giving up", dist_base); + goto out_dist_unmap; + } + + redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions, + GFP_KERNEL); + if (!redist_regs) { + err = -ENOMEM; + goto out_dist_unmap; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_parse_madt_redist, 0); + if (count <= 0) { + err = -ENODEV; + goto out_redist_unmap; + } + + domain_handle = irq_domain_alloc_fwnode(dist_base); + if (!domain_handle) { + err = -ENOMEM; + goto out_redist_unmap; + } + + err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0, + domain_handle); + if (err) + goto out_fwhandle_free; + + acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + return 0; + +out_fwhandle_free: + irq_domain_free_fwnode(domain_handle); +out_redist_unmap: + for (i = 0; i < nr_redist_regions; i++) + if (redist_regs[i].redist_base) + iounmap(redist_regs[i].redist_base); + kfree(redist_regs); +out_dist_unmap: + iounmap(dist_base); + return err; +} +IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + 
acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE, + gic_acpi_init); +#endif -- cgit v1.2.1 From b70fb7af67158250bf16db467926e6e105d8bc49 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:16 +0100 Subject: irqchip/gic-v3: ACPI: Add redistributor support via GICC structures Following the ACPI spec: On systems supporting GICv3 and above, GICR Base Address in MADT GICC structure holds the 64-bit physical address of the associated Redistributor. If all of the GIC Redistributors are in the always-on power domain, GICR structures should be used to describe the Redistributors instead, and this field must be set to 0. It means that we have two ways to initialize the redistributors map. 1. via the GICD structure, which can accommodate many redistributors as a region 2. via GICC, which is able to describe a single redistributor This patch is going to add support for the second option. Considering redistributors, the GICD and GICC subtables have to be mutually exclusive. While discovering and mapping a redistributor, we need to know its size in advance. For the GICC case, the redistributor can be in a power domain that is off, thus we cannot rely on the GICR TYPER register. Therefore, we get the GIC version from the distributor register and map 2xSZ_64K for GICv3 and 4xSZ_64K for GICv4. Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 114 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 2cada42022ee..dd16a608a988 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -40,6 +40,7 @@ struct redist_region { void __iomem *redist_base; phys_addr_t phys_base; + bool single_redist; }; struct gic_chip_data { @@ -436,6 +437,9 @@ static int gic_populate_rdist(void) return 0; } + if (gic_data.redist_regions[i].single_redist) + break; + if (gic_data.redist_stride) { ptr += gic_data.redist_stride; } else { @@ -964,8 +968,21 @@ out_unmap_dist: IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI +static void __iomem *dist_base; static struct redist_region *redist_regs __initdata; static u32 nr_redist_regions __initdata; +static bool single_redist; + +static void __init +gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) +{ + static int count = 0; + + redist_regs[count].phys_base = phys_base; + redist_regs[count].redist_base = redist_base; + redist_regs[count].single_redist = single_redist; + count++; +} static int __init gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, @@ -974,7 +991,6 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, struct acpi_madt_generic_redistributor *redist = (struct acpi_madt_generic_redistributor *)header; void __iomem *redist_base; - static int count = 0; redist_base = ioremap(redist->base_address, redist->length); if (!redist_base) { @@ -982,12 +998,49 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, return -ENOMEM; } - redist_regs[count].phys_base = redist->base_address; - redist_regs[count].redist_base = redist_base; - count++; + gic_acpi_register_redist(redist->base_address, redist_base); return 
0; } +static int __init +gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; + void __iomem *redist_base; + + redist_base = ioremap(gicc->gicr_base_address, size); + if (!redist_base) + return -ENOMEM; + + gic_acpi_register_redist(gicc->gicr_base_address, redist_base); + return 0; +} + +static int __init gic_acpi_collect_gicr_base(void) +{ + acpi_tbl_entry_handler redist_parser; + enum acpi_madt_type type; + + if (single_redist) { + type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; + redist_parser = gic_acpi_parse_madt_gicc; + } else { + type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR; + redist_parser = gic_acpi_parse_madt_redist; + } + + /* Collect redistributor base addresses in GICR entries */ + if (acpi_table_parse_madt(type, redist_parser, 0) > 0) + return 0; + + pr_info("No valid GICR entries exist\n"); + return -ENODEV; +} + static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, const unsigned long end) { @@ -995,6 +1048,46 @@ static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, return 0; } +static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + + /* + * If GICC is enabled and has valid gicr base address, then it means + * GICR base is presented via GICC + */ + if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) + return 0; + + return -ENODEV; +} + +static int __init gic_acpi_count_gicr_regions(void) +{ + int count; + + /* + * Count how many redistributor regions we have. It is not allowed + * to mix redistributor description, GICR and GICC subtables have to be + * mutually exclusive. 
+ */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_match_gicr, 0); + if (count > 0) { + single_redist = false; + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_match_gicc, 0); + if (count > 0) + single_redist = true; + + return count; +} + static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, struct acpi_probe_entry *ape) { @@ -1006,8 +1099,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return false; /* We need to do that exercise anyway, the sooner the better */ - count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, - gic_acpi_match_gicr, 0); + count = gic_acpi_count_gicr_regions(); if (count <= 0) return false; @@ -1022,8 +1114,7 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; struct fwnode_handle *domain_handle; - void __iomem *dist_base; - int i, err, count; + int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; @@ -1046,12 +1137,9 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) goto out_dist_unmap; } - count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, - gic_acpi_parse_madt_redist, 0); - if (count <= 0) { - err = -ENODEV; + err = gic_acpi_collect_gicr_base(); + if (err) goto out_redist_unmap; - } domain_handle = irq_domain_alloc_fwnode(dist_base); if (!domain_handle) { -- cgit v1.2.1 From f6ae5085d37b2eaf6cac30ccf4d425e95c7d4b63 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 19 Jan 2016 14:11:17 +0100 Subject: irqchip/gic-v3: Remove gic_root_node variable from the ITS code The gic_root_node variable defined in the ITS driver is not actually used, so just remove it. Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 43dfd15c1dd2..1bd710555bda 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -103,7 +103,6 @@ struct its_device { static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); -static struct device_node *gic_root_node; static struct rdists *gic_rdists; #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) @@ -1607,8 +1606,6 @@ int its_init(struct device_node *node, struct rdists *rdists, } gic_rdists = rdists; - gic_root_node = node; - its_alloc_lpi_tables(); its_lpi_init(rdists->id_bits); -- cgit v1.2.1 From 04a0e4dee85642138dc7bd78f50ebee397e057a8 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:18 +0100 Subject: irqchip/gic-v3-its: Mark its_init() and its children as __init gic_init_bases() is the only caller of its_init(), and it is itself an __init function, so mark its_init() as __init too, then recursively mark the functions it calls as __init. This will help to introduce ITS initialization using ACPI tables, as we will use the acpi_table_parse_entries() family of functions there, which belong to the __init section as well. 
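[ Illustration of the section rules at play (editor's sketch, hypothetical functions): ]

/* Both caller and callee live in .init.text and are freed after boot. */
static int __init its_helper_demo(void)
{
	return 0;
}

static int __init its_init_demo(void)
{
	return its_helper_demo();
}

/*
 * If its_helper_demo() lacked __init, it would needlessly stay resident
 * after boot; conversely, a non-__init function calling an __init one
 * triggers a modpost section-mismatch warning, because the callee's
 * memory is discarded once initialization finishes.
 */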
Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 1bd710555bda..39261798c59f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -670,7 +670,7 @@ static int its_chunk_to_lpi(int chunk) return (chunk << IRQS_PER_CHUNK_SHIFT) + 8192; } -static int its_lpi_init(u32 id_bits) +static int __init its_lpi_init(u32 id_bits) { lpi_chunks = its_lpi_to_chunk(1UL << id_bits); @@ -1429,7 +1429,8 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } -static int its_probe(struct device_node *node, struct irq_domain *parent) +static int __init its_probe(struct device_node *node, + struct irq_domain *parent) { struct resource res; struct its_node *its; @@ -1590,7 +1591,7 @@ static struct of_device_id its_device_id[] = { {}, }; -int its_init(struct device_node *node, struct rdists *rdists, +int __init its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *parent_domain) { struct device_node *np; -- cgit v1.2.1 From 0fc6fa2924d0dd54aa5c780a964c2812acf55ded Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:43 +0100 Subject: irqchip/gic-v3: Always return IRQ_SET_MASK_OK_DONE in gic_set_affinity Always return IRQ_SET_MASK_OK_DONE instead of IRQ_SET_MASK_OK when the affinity has been updated. When using stacked irqchips, returning IRQ_SET_MASK_OK_DONE means skipping all descendant irqchips. Signed-off-by: Antoine Tenart Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dd16a608a988..5b7d3c2129d8 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -640,7 +640,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, else gic_dist_wait_for_rwp(); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #else #define gic_set_affinity NULL -- cgit v1.2.1 From e6b78f2c3e14a9e3a909be3e6ec305d9f1cbabbd Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:44 +0100 Subject: irqchip: Add the Alpine MSIX interrupt controller This patch adds the Alpine MSIX interrupt controller driver. 
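[ Editor's illustration: the driver below stacks a PCI/MSI chip on a "middle" SPI-allocating domain on top of the GIC. A minimal sketch of the forwarding idiom it relies on, with hypothetical names; note how the IRQ_SET_MASK_OK_DONE change above fits in: ]

static void demo_mask_msi_irq(struct irq_data *d)
{
	pci_msi_mask_irq(d);		/* mask at the PCI/MSI level...      */
	irq_chip_mask_parent(d);	/* ...then in the parent chip (GIC)  */
}

static int demo_set_affinity(struct irq_data *d,
			     const struct cpumask *mask, bool force)
{
	/*
	 * Forward to the parent; IRQ_SET_MASK_OK_DONE coming back means
	 * the parent handled everything and descendant chips skip any
	 * affinity processing of their own.
	 */
	return irq_chip_set_affinity_parent(d, mask, force);
}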
Signed-off-by: Antoine Tenart Signed-off-by: Tsahee Zidenberg Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier --- drivers/irqchip/Kconfig | 6 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-alpine-msi.c | 293 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 300 insertions(+) create mode 100644 drivers/irqchip/irq-alpine-msi.c diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 00bbec6eca0b..7e8c441ff2de 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -65,6 +65,12 @@ config ARMADA_370_XP_IRQ select GENERIC_IRQ_CHIP select PCI_MSI_IRQ_DOMAIN if PCI_MSI +config ALPINE_MSI + bool + depends on PCI && PCI_MSI + select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN + config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 8698710fda37..b03cfcbbac6b 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ALPINE_MSI) += irq-alpine-msi.o obj-$(CONFIG_ATH79) += irq-ath79-cpu.o obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c new file mode 100644 index 000000000000..f8712722a606 --- /dev/null +++ b/drivers/irqchip/irq-alpine-msi.c @@ -0,0 +1,293 @@ +/* + * Annapurna Labs MSIX support services + * + * Copyright (C) 2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Antoine Tenart + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* MSIX message address format: local GIC target */ +#define ALPINE_MSIX_SPI_TARGET_CLUSTER0 BIT(16) + +struct alpine_msix_data { + spinlock_t msi_map_lock; + phys_addr_t addr; + u32 spi_first; /* The SGI number that MSIs start */ + u32 num_spis; /* The number of SGIs for MSIs */ + unsigned long *msi_map; +}; + +static void alpine_msix_mask_msi_irq(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void alpine_msix_unmask_msi_irq(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip alpine_msix_irq_chip = { + .name = "MSIx", + .irq_mask = alpine_msix_mask_msi_irq, + .irq_unmask = alpine_msix_unmask_msi_irq, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, +}; + +static int alpine_msix_allocate_sgi(struct alpine_msix_data *priv, int num_req) +{ + int first; + + spin_lock(&priv->msi_map_lock); + + first = bitmap_find_next_zero_area(priv->msi_map, priv->num_spis, 0, + num_req, 0); + if (first >= priv->num_spis) { + spin_unlock(&priv->msi_map_lock); + return -ENOSPC; + } + + bitmap_set(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); + + return priv->spi_first + first; +} + +static void alpine_msix_free_sgi(struct alpine_msix_data *priv, unsigned sgi, + int num_req) +{ + int first = sgi - priv->spi_first; + + spin_lock(&priv->msi_map_lock); + + bitmap_clear(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); +} + +static void alpine_msix_compose_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(data); + phys_addr_t 
msg_addr = priv->addr; + + msg_addr |= (data->hwirq << 3); + + msg->address_hi = upper_32_bits(msg_addr); + msg->address_lo = lower_32_bits(msg_addr); + msg->data = 0; +} + +static struct msi_domain_info alpine_msix_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &alpine_msix_irq_chip, +}; + +static struct irq_chip middle_irq_chip = { + .name = "alpine_msix_middle", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = alpine_msix_compose_msi_msg, +}; + +static int alpine_msix_gic_domain_alloc(struct irq_domain *domain, + unsigned int virq, int sgi) +{ + struct irq_fwspec fwspec; + struct irq_data *d; + int ret; + + if (!is_of_node(domain->parent->fwnode)) + return -EINVAL; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = 0; + fwspec.param[1] = sgi; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) + return ret; + + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + return 0; +} + +static int alpine_msix_middle_domain_alloc(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct alpine_msix_data *priv = domain->host_data; + int sgi, err, i; + + sgi = alpine_msix_allocate_sgi(priv, nr_irqs); + if (sgi < 0) + return sgi; + + for (i = 0; i < nr_irqs; i++) { + err = alpine_msix_gic_domain_alloc(domain, virq + i, sgi + i); + if (err) + goto err_sgi; + + irq_domain_set_hwirq_and_chip(domain, virq + i, sgi + i, + &middle_irq_chip, priv); + } + + return 0; + +err_sgi: + while (--i >= 0) + irq_domain_free_irqs_parent(domain, virq, i); + alpine_msix_free_sgi(priv, sgi, nr_irqs); + return err; +} + +static void alpine_msix_middle_domain_free(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(d); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + alpine_msix_free_sgi(priv, d->hwirq, nr_irqs); +} + +static const struct irq_domain_ops alpine_msix_middle_domain_ops = { + .alloc = alpine_msix_middle_domain_alloc, + .free = alpine_msix_middle_domain_free, +}; + +static int alpine_msix_init_domains(struct alpine_msix_data *priv, + struct device_node *node) +{ + struct irq_domain *middle_domain, *msi_domain, *gic_domain; + struct device_node *gic_node; + + gic_node = of_irq_find_parent(node); + if (!gic_node) { + pr_err("Failed to find the GIC node\n"); + return -ENODEV; + } + + gic_domain = irq_find_host(gic_node); + if (!gic_domain) { + pr_err("Failed to find the GIC domain\n"); + return -ENXIO; + } + + middle_domain = irq_domain_add_tree(NULL, + &alpine_msix_middle_domain_ops, + priv); + if (!middle_domain) { + pr_err("Failed to create the MSIX middle domain\n"); + return -ENOMEM; + } + + middle_domain->parent = gic_domain; + + msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node), + &alpine_msix_domain_info, + middle_domain); + if (!msi_domain) { + pr_err("Failed to create MSI domain\n"); + irq_domain_remove(msi_domain); + return -ENOMEM; + } + + return 0; +} + +static int alpine_msix_init(struct device_node *node, + struct device_node *parent) +{ + struct alpine_msix_data *priv; + struct resource res; + int ret; + + priv = kzalloc(sizeof(*priv), 
GFP_KERNEL); + if (!priv) + return -ENOMEM; + + spin_lock_init(&priv->msi_map_lock); + + ret = of_address_to_resource(node, 0, &res); + if (ret) { + pr_err("Failed to allocate resource\n"); + goto err_priv; + } + + /* + * The 20 least significant bits of addr provide direct information + * regarding the interrupt destination. + * + * To select the primary GIC as the target GIC, bits [18:17] must be set + * to 0x0. In this case, bit 16 (SPI_TARGET_CLUSTER0) must be set. + */ + priv->addr = res.start & GENMASK_ULL(63,20); + priv->addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0; + + if (of_property_read_u32(node, "al,msi-base-spi", &priv->spi_first)) { + pr_err("Unable to parse MSI base\n"); + ret = -EINVAL; + goto err_priv; + } + + if (of_property_read_u32(node, "al,msi-num-spis", &priv->num_spis)) { + pr_err("Unable to parse MSI numbers\n"); + ret = -EINVAL; + goto err_priv; + } + + priv->msi_map = kzalloc(sizeof(*priv->msi_map) * BITS_TO_LONGS(priv->num_spis), + GFP_KERNEL); + if (!priv->msi_map) { + ret = -ENOMEM; + goto err_priv; + } + + pr_debug("Registering %d msixs, starting at %d\n", + priv->num_spis, priv->spi_first); + + ret = alpine_msix_init_domains(priv, node); + if (ret) + goto err_map; + + return 0; + +err_map: + kfree(priv->msi_map); +err_priv: + kfree(priv); + return ret; +} +IRQCHIP_DECLARE(alpine_msix, "al,alpine-msix", alpine_msix_init); -- cgit v1.2.1 From a13690297ce49262ae44e41b25a954124609eea8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:45 +0100 Subject: Documentation/bindings: Document the Alpine MSIX driver Following the addition of the Alpine MSIX driver, this patch adds the corresponding bindings documentation. Acked-by: Marc Zyngier Signed-off-by: Antoine Tenart Signed-off-by: Tsahee Zidenberg Signed-off-by: Marc Zyngier --- .../interrupt-controller/al,alpine-msix.txt | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt diff --git a/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt new file mode 100644 index 000000000000..f6f1c14bf99b --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt @@ -0,0 +1,26 @@ +Alpine MSIX controller + +See arm,gic-v3.txt for SPI and MSI definitions. + +Required properties: + +- compatible: should be "al,alpine-msix" +- reg: physical base address and size of the registers +- interrupt-parent: specifies the parent interrupt controller. +- interrupt-controller: identifies the node as an interrupt controller +- msi-controller: identifies the node as a PCI Message Signaled Interrupt + controller +- al,msi-base-spi: SPI base of the MSI frame +- al,msi-num-spis: number of SPIs assigned to the MSI frame, relative to SPI0 + +Example: + +msix: msix { + compatible = "al,alpine-msix"; + reg = <0x0 0xfbe00000 0x0 0x100000>; + interrupt-parent = <&gic>; + interrupt-controller; + msi-controller; + al,msi-base-spi = <160>; + al,msi-num-spis = <160>; +}; -- cgit v1.2.1 From 82b0a434b436f5da69ddd24bd6a6fa5dc4484310 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Feb 2016 15:01:01 +0100 Subject: irqchip/gic/realview: Support more RealView DCC variants In the add-on file for the GIC dealing with the RealView family we currently only handle the PB11MPCore; let's extend this to manage the RealView EB ARM11MPCore as well. 
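[ The per-variant handling in the diff below rests on stashing the register offset in the OF match table's .data field; roughly (editor's sketch, hypothetical names): ]

static const struct of_device_id demo_syscon_match[] = {
	{ .compatible = "vendor,board-revb-syscon", .data = (void *)0xD8 },
	{ .compatible = "vendor,board-syscon",      .data = (void *)0x74 },
	{ /* sentinel */ },
};

static u32 demo_pld_ctrl1_offset(void)
{
	const struct of_device_id *id;
	struct device_node *np;

	np = of_find_matching_node_and_match(NULL, demo_syscon_match, &id);
	if (!np)
		return 0;	/* no matching node in this DT */

	return (u32)(unsigned long)id->data;	/* this variant's offset */
}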
The Revision B of the ARM11MPCore core tile is a bit special and needs special handling as it moves a system control register around at random. Cc: Arnd Bergmann Cc: devicetree@vger.kernel.org Acked-by: Marc Zyngier Signed-off-by: Linus Walleij Signed-off-by: Marc Zyngier --- .../bindings/interrupt-controller/arm,gic.txt | 1 + drivers/irqchip/irq-gic-realview.c | 44 +++++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt index 5a1cb4bc3dfe..793c20ff8fcc 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt @@ -16,6 +16,7 @@ Main node required properties: "arm,cortex-a15-gic" "arm,cortex-a7-gic" "arm,cortex-a9-gic" + "arm,eb11mp-gic" "arm,gic-400" "arm,pl390" "arm,tc11mp-gic" diff --git a/drivers/irqchip/irq-gic-realview.c b/drivers/irqchip/irq-gic-realview.c index aa46eb280a7f..54c296401525 100644 --- a/drivers/irqchip/irq-gic-realview.c +++ b/drivers/irqchip/irq-gic-realview.c @@ -10,7 +10,8 @@ #include #define REALVIEW_SYS_LOCK_OFFSET 0x20 -#define REALVIEW_PB11MP_SYS_PLD_CTRL1 0x74 +#define REALVIEW_SYS_PLD_CTRL1 0x74 +#define REALVIEW_EB_REVB_SYS_PLD_CTRL1 0xD8 #define VERSATILE_LOCK_VAL 0xA05F #define PLD_INTMODE_MASK BIT(22)|BIT(23)|BIT(24) #define PLD_INTMODE_LEGACY 0x0 @@ -18,26 +19,57 @@ #define PLD_INTMODE_NEW_NO_DCC BIT(23) #define PLD_INTMODE_FIQ_ENABLE BIT(24) +/* For some reason RealView EB Rev B moved this register */ +static const struct of_device_id syscon_pldset_of_match[] = { + { + .compatible = "arm,realview-eb11mp-revb-syscon", + .data = (void *)REALVIEW_EB_REVB_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb11mp-revc-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-pb11mp-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + {}, +}; + static int __init realview_gic_of_init(struct device_node *node, struct device_node *parent) { static struct regmap *map; + struct device_node *np; + const struct of_device_id *gic_id; + u32 pld1_ctrl; + + np = of_find_matching_node_and_match(NULL, syscon_pldset_of_match, + &gic_id); + if (!np) + return -ENODEV; + pld1_ctrl = (u32)gic_id->data; /* The PB11MPCore GIC needs to be configured in the syscon */ - map = syscon_regmap_lookup_by_compatible("arm,realview-pb11mp-syscon"); + map = syscon_node_to_regmap(np); if (!IS_ERR(map)) { /* new irq mode with no DCC */ regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, VERSATILE_LOCK_VAL); - regmap_update_bits(map, REALVIEW_PB11MP_SYS_PLD_CTRL1, + regmap_update_bits(map, pld1_ctrl, PLD_INTMODE_NEW_NO_DCC, PLD_INTMODE_MASK); regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, 0x0000); - pr_info("TC11MP GIC: set up interrupt controller to NEW mode, no DCC\n"); + pr_info("RealView GIC: set up interrupt controller to NEW mode, no DCC\n"); } else { - pr_err("TC11MP GIC setup: could not find syscon\n"); - return -ENXIO; + pr_err("RealView GIC setup: could not find syscon\n"); + return -ENODEV; } return gic_of_init(node, parent); } IRQCHIP_DECLARE(armtc11mp_gic, "arm,tc11mp-gic", realview_gic_of_init); +IRQCHIP_DECLARE(armeb11mp_gic, "arm,eb11mp-gic", realview_gic_of_init); -- cgit v1.2.1 From 8b30a8b3c636a155bab9176ad209964c9c22252d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Mar 2016 14:48:21 
+0100 Subject: x86/defconfigs/32: Set CONFIG_FRAME_WARN to the Kconfig default Sync it to the Kconfig default for 32-bit. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: tim.gardner@canonical.com Link: http://lkml.kernel.org/r/20160309134821.GD6564@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48c8839..e25a1630320c 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_PRINTK_TIME=y # CONFIG_ENABLE_WARN_DEPRECATED is not set -CONFIG_FRAME_WARN=2048 +CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y -- cgit v1.2.1 From a318beea224d255c38652f88a7ae0d30ac4c23c3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:24 -0800 Subject: selftests/x86: In syscall_nt, test NT|TF as well Setting TF prevents fastpath returns in most cases, which causes the test to fail on 32-bit kernels because 32-bit kernels do not, in fact, handle NT correctly on SYSENTER entries. The next patch will fix 32-bit kernels. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bd4bb48af6b10c0dc84aec6dbcf487ed25683495.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/syscall_nt.c | 57 +++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 60c06af4646a..43fcab367fb0 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -17,6 +17,9 @@ #include <stdio.h> #include <unistd.h> +#include <string.h> +#include <signal.h> +#include <err.h> #include <sys/syscall.h> #include <asm/processor-flags.h> @@ -26,6 +29,8 @@ # define WIDTH "l" #endif +static unsigned int nerrs; + static unsigned long get_eflags(void) { unsigned long eflags; @@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags) : : "rm" (eflags) : "flags"); } -int main() +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) { - printf("[RUN]\tSet NT and issue a syscall\n"); - set_eflags(get_eflags() | X86_EFLAGS_NT); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigtrap(int sig, siginfo_t *si, void *ctx_void) +{ +} + +static void do_it(unsigned long extraflags) +{ + unsigned long flags; + + set_eflags(get_eflags() | extraflags); syscall(SYS_getpid); - if (get_eflags() & X86_EFLAGS_NT) { - printf("[OK]\tThe syscall worked and NT is still set\n"); - return 0; + flags = get_eflags(); + if ((flags & extraflags) == extraflags) { + printf("[OK]\tThe syscall worked and flags are still set\n"); } else { - printf("[FAIL]\tThe syscall worked but NT was cleared\n"); - return 1; + printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n", + flags, extraflags); + nerrs++; } } + +int main(void) +{ + printf("[RUN]\tSet NT and issue a syscall\n"); + do_it(X86_EFLAGS_NT); + + /* + * Now try it again with TF set -- TF forces returns via IRET in 
all + * cases except non-ptregs-using 64-bit full fast path syscalls. + */ + + sethandler(SIGTRAP, sigtrap, 0); + + printf("[RUN]\tSet NT|TF and issue a syscall\n"); + do_it(X86_EFLAGS_NT | X86_EFLAGS_TF); + + return nerrs == 0 ? 0 : 1; +} -- cgit v1.2.1 From e786041153df6343169373177248abfab5c5ac1b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:25 -0800 Subject: x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test CLAC is slow, and the SYSENTER code already has an unlikely path that runs if unusual flags are set. Drop the CLAC and instead rely on the unlikely path to clear AC. This seems to save ~24 cycles on my Skylake laptop. (Hey, Intel, make this faster please!) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/90d6db2189f9add83bc7bddd75a0c19ebbd676b2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3c990eeee40b..da5c19443e0b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -66,8 +66,6 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ - ASM_CLAC /* Clear AC after saving FLAGS */ - pushq $__USER32_CS /* pt_regs->cs */ xorq %r8,%r8 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ @@ -90,9 +88,9 @@ ENTRY(entry_SYSENTER_compat) cld /* - * Sysenter doesn't filter flags, so we need to clear NT + * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. + * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * @@ -102,7 +100,7 @@ ENTRY(entry_SYSENTER_compat) * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: -- cgit v1.2.1 From 67f590e8d4d718d9bd377b39223f7f69678d6a10 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:26 -0800 Subject: x86/entry/32: Filter NT and speed up AC filtering in SYSENTER This makes the 32-bit code work just like the 64-bit code. It should speed up syscalls on 32-bit kernels on Skylake by something like 20 cycles (by analogy to the 64-bit compat case). It also cleans up NT just like we do for the 64-bit case. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/07daef3d44bd1ed62a2c866e143e8df64edb40ee.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e13027247c90..8daa8127f578 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,13 +294,29 @@ sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ - ASM_CLAC /* Clear AC after saving FLAGS */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. @@ -339,6 +355,11 @@ sysenter_past_esp: .popsection _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_32) # system call handler stub -- cgit v1.2.1 From c2c9b52fab0d0cf993476ed4c34f24da5a1205ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:27 -0800 Subject: x86/entry/32: Restore FLAGS on SYSEXIT We weren't restoring FLAGS at all on SYSEXIT. Apparently no one cared. With this patch applied, native kernels should always honor task_pt_regs()->flags, which opens the door for some sys_iopl() cleanups. I'll do those as a separate series, though, since getting it right will involve tweaking some paravirt ops. ( The short version is that, before this patch, sys_iopl(), invoked via SYSENTER, wasn't guaranteed to ever transfer the updated regs->flags, so sys_iopl() had to change the hardware flags register as well. ) Reported-by: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f98b207472dc9784838eb5ca2b89dcc845ce269.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8daa8127f578..76109068149f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -342,6 +342,15 @@ sysenter_past_esp: popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ + /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) 
+ */ + addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ + btr $X86_EFLAGS_IF_BIT, (%esp) + popfl + /* * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). -- cgit v1.2.1 From 81edd9f69a6fd214fdbe66b43de6aa1610c84c63 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:28 -0800 Subject: x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions The SDM says that debug exceptions clear BTF, and we need to keep TIF_BLOCKSTEP in sync with BTF. Clear it unconditionally and improve the comment. I suspect that the fact that kmemcheck could cause TIF_BLOCKSTEP not to be cleared was just an oversight. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fa86e55d196e6dde5b38839595bde2a292c52fdc.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 410e8e2700c5..1af56217fb5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -597,6 +597,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -612,11 +619,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit v1.2.1 From 8bb5643686d2898bec181c407651cf84e77d413f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:29 -0800 Subject: x86/entry/traps: Clear DR6 early in do_debug() and improve the comment Leaving any bits set in DR6 on return from a debug exception is asking for trouble. Prevent it by writing zero right away and clarify the comment. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3857676e1be8fb27db4b89bbb1e2052b7f435ff4.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1af56217fb5d..7394be8b1a66 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -593,6 +593,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. 
To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; @@ -616,9 +628,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit v1.2.1 From f2b375756c839ff655a3e0b45e339f8fbf973093 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:30 -0800 Subject: x86/entry: Vastly simplify SYSENTER TF (single-step) handling Due to a blatant design error, SYSENTER doesn't clear TF (single-step). As a result, if a user does SYSENTER with TF set, we will single-step through the kernel until something clears TF. There is absolutely nothing we can do to prevent this short of turning off SYSENTER [1]. Simplify the handling considerably with two changes: 1. We already sanitize EFLAGS in SYSENTER to clear NT and AC. We can add TF to that list of flags to sanitize with no overhead whatsoever. 2. Teach do_debug() to ignore single-step traps in the SYSENTER prologue. That's all we need to do. Don't get too excited -- our handling is still buggy on 32-bit kernels. There's nothing wrong with the SYSENTER code itself, but the #DB prologue has a clever fixup for traps on the very first instruction of entry_SYSENTER_32, and the fixup doesn't work quite correctly. The next two patches will fix that. [1] We could probably prevent it by forcing BTF on at all times and making sure we clear TF before any branches in the SYSENTER code. Needless to say, this is a bad idea. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a30d2ea06fe4b621fe6a9ef911b02c0f38feb6f2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 42 ++++++++++++++++++++++---------- arch/x86/entry/entry_64_compat.S | 9 ++++++- arch/x86/include/asm/proto.h | 15 ++++++++++-- arch/x86/kernel/traps.c | 52 +++++++++++++++++++++++++++++++++------- 4 files changed, 94 insertions(+), 24 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 76109068149f..b2e1d446bdf8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,7 +287,26 @@ need_resched: END(resume_kernel) #endif - # SYSENTER call handler stub +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. 
+ */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp +#endif + ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -301,19 +320,25 @@ sysenter_past_esp: SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC - * ourselves. To save a few cycles, we can check whether + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -369,6 +394,7 @@ sysenter_past_esp: pushl $X86_EFLAGS_FIXED popfl jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) # system call handler stub @@ -651,14 +677,6 @@ ENTRY(spurious_interrupt_bug) END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index da5c19443e0b..cbb5e6943d9b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -94,13 +94,19 @@ ENTRY(entry_SYSENTER_compat) * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. 
*/ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -121,6 +127,7 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a77286cb1d..9b9b30b19441 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -7,12 +7,23 @@ void syscall_init(void); +#ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); -void entry_SYSCALL_compat(void); +#endif + +#ifdef CONFIG_X86_32 void entry_INT80_32(void); -void entry_INT80_compat(void); void entry_SYSENTER_32(void); +void __begin_SYSENTER_singlestep_region(void); +void __end_SYSENTER_singlestep_region(void); +#endif + +#ifdef CONFIG_IA32_EMULATION void entry_SYSENTER_compat(void); +void __end_entry_SYSENTER_compat(void); +void entry_SYSCALL_compat(void); +void entry_INT80_compat(void); +#endif void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7394be8b1a66..b0ddb81926f9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -559,6 +559,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. (We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -616,6 +639,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) */ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -624,7 +659,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; @@ -659,14 +694,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. 
- * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; -- cgit v1.2.1 From 6dcc94149d605908a7c0c4cf2085340637aac86d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:31 -0800 Subject: x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed The SYSENTER stack is only used on 32-bit kernels. Remove it on 64-bit kernels. ( We may end up using it down the road on 64-bit kernels. If so, we'll re-enable it for CONFIG_IA32_EMULATION. ) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9dbd18429f9ff61a76b6eda97a9ea20510b9f6ba.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ecb410310e70..7cd01b71b5bd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -297,10 +297,12 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +#ifdef CONFIG_X86_32 /* * Space for the temporary SYSENTER stack: */ unsigned long SYSENTER_stack[64]; +#endif } ____cacheline_aligned; -- cgit v1.2.1 From 7536656f08d0c1a3b4c487d00785c5186ec6f533 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:32 -0800 Subject: x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST, so the exception handler is invoked on the temporary SYSENTER stack. Because the SYSENTER stack is very small, we have a fixup to switch off the stack quickly when this happens. The old fixup had several issues: 1. It checked the interrupt frame's CS and EIP. This wasn't obviously correct on Xen or if vm86 mode was in use [1]. 2. In the NMI handler, it did some frightening digging into the stack frame. I'm not convinced this digging was correct. 3. The fixup didn't switch stacks and then switch back. Instead, it synthesized a brand new stack frame that would redirect the IRET back to the SYSENTER code. That frame was highly questionable. For one thing, if NMI nested inside #DB, we would effectively abort the #DB prologue, which was probably safe but was frightening. For another, the code used PUSHFL to write the FLAGS portion of the frame, which was simply bogus -- by the time PUSHFL was called, at least TF, NT, VM, and all of the arithmetic flags were clobbered. Simplify this considerably. Instead of looking at the saved frame to see where we came from, check the hardware ESP register against the SYSENTER stack directly. Malicious user code cannot spoof the kernel ESP register, and by moving the check after SAVE_ALL, we can use normal PER_CPU accesses to find all the relevant addresses. With this patch applied, the improved syscall_nt_32 test finally passes on 32-bit kernels. 
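In C terms, the new check is roughly the following (a minimal sketch only; on_sysenter_stack(), stack_end and stack_size are illustrative stand-ins for the per-CPU cpu_tss fields that the assembly actually reads):

static bool on_sysenter_stack(unsigned long esp, unsigned long stack_end,
			      unsigned long stack_size)
{
	/*
	 * A single unsigned subtract-and-compare covers both ends of the
	 * range: an ESP above the end of the stack makes the subtraction
	 * wrap to a huge value, so one "below" test is enough.  This
	 * mirrors the cmpl + jb pair in the diff below.
	 */
	return (stack_end - esp) < stack_size;
}
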
[1] It isn't obviously correct, but it is nonetheless safe from vm86 shenanigans as far as I can tell. A user can't point EIP at entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32, like all kernel addresses, is greater than 0xffff and would thus violate the CS segment limit. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b2cdbc037031c07ecf2c40a96069318aec0e7971.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 114 ++++++++++++++++++--------------------- arch/x86/kernel/asm-offsets_32.c | 5 ++ 2 files changed, 56 insertions(+), 63 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2e1d446bdf8..7b3ec24ede82 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -976,51 +976,48 @@ error_code: jmp ret_from_exception END(page_fault) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - ENTRY(debug) + /* + * #DB can happen at the first instruction of + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this + * happens, then we will be running on a very small stack. We + * need to detect this condition and switch to the thread + * stack before calling any C code at all. + * + * If you edit this code, keep in mind that NMIs can happen in here. + */ ASM_CLAC - cmpl $entry_SYSENTER_32, (%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: pushl $-1 # mark this as an int SAVE_ALL - TRACE_IRQS_OFF xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF + call do_debug + jmp ret_from_exception + +.Ldebug_from_sysenter_stack: + /* We're on the SYSENTER stack. Switch off. */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + TRACE_IRQS_OFF call do_debug + movl %ebp, %esp jmp ret_from_exception END(debug) /* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. + * NMI is doubly nasty. 
It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. */ ENTRY(nmi) ASM_CLAC @@ -1031,41 +1028,32 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32, (%esp) - je nmi_stack_fixup - pushl %eax - movl %esp, %eax - /* - * Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1), %eax - cmpl $(THREAD_SIZE-20), %eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32, 12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax + + pushl %eax # pt_regs->orig_ax SAVE_ALL xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ call do_nmi jmp restore_all_notrace -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS, 16(%esp) - jne nmi_stack_correct - cmpl $debug, (%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn, (%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebp, %esp + jmp restore_all_notrace #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fdeb0ce07c16..ecdc1d217dc0 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); -- cgit v1.2.1 From 2a41aa4feb25af3ead60b740c43df80c576efea2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:33 -0800 Subject: x86/entry/32: Add and check a stack canary for the SYSENTER stack The first instruction of the SYSENTER entry runs on its own tiny stack. That stack can be used if a #DB or NMI is delivered before the SYSENTER prologue switches to a real stack. We have code in place to prevent us from overflowing the tiny stack. For added paranoia, add a canary to the stack and check it in do_debug() -- that way, if something goes wrong with the #DB logic, we'll eventually notice. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6ff9a806f39098b166dc2c41c1db744df5272f29.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/process.c | 3 +++ arch/x86/kernel/traps.c | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7cd01b71b5bd..50a6dc871cc0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -299,8 +299,9 @@ struct tss_struct { #ifdef CONFIG_X86_32 /* - * Space for the temporary SYSENTER stack: + * Space for the temporary SYSENTER stack. */ + unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..ee9a9792caeb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b0ddb81926f9..49e2e775f507 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -713,6 +713,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); -- cgit v1.2.1 From 392a62549fbb80da48811d04391615a25c39d091 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:34 -0800 Subject: x86/entry: Remove TIF_SINGLESTEP entry work Now that SYSENTER with TF set puts X86_EFLAGS_TF directly into regs->flags, we don't need a TIF_SINGLESTEP fixup in the syscall entry code. Remove it. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d15f24da52dafc9d2f0b8d76f55544f4779c517.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 10 ---------- arch/x86/include/asm/thread_info.h | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1a000f59f04a..209ba334e90f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -172,16 +172,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. 
- */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - #ifdef CONFIG_SECCOMP /* * Call seccomp_phase2 before running the other hooks so that diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c0778fcab06d..f2e2302c406f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -137,7 +137,7 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) /* work to do on any return to user space */ -- cgit v1.2.1 From fda57b2267e12de014069b1596a5438cf76fc7c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:35 -0800 Subject: x86/entry: Improve system call entry comments Ingo suggested that the comments should explain when the various entries are used. This adds these explanations and improves other parts of the comments. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9524ecef7a295347294300045d08354d6a57c6e7.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 61 +++++++++++++++++++++++++++- arch/x86/entry/entry_64.S | 10 +++++ arch/x86/entry/entry_64_compat.S | 85 +++++++++++++++++++++++++++------------- 3 files changed, 128 insertions(+), 28 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b3ec24ede82..286efa342091 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,6 +307,38 @@ ENTRY(xen_sysenter_target) jmp sysenter_past_esp #endif +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -397,7 +429,34 @@ sysenter_past_esp: GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) - # system call handler stub +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit program to perform system calls. + * Instances of INT $0x80 can be found inline in various programs and + * libraries. 
It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70eadb0ea5fa..858b555e274b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64) /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index cbb5e6943d9b..efe0fe3f76bc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -19,12 +19,21 @@ .section .entry.text, "ax" /* - * 32-bit SYSENTER instruction entry. + * 32-bit SYSENTER entry. * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. + * and does not save old RIP (!!!), RSP, or RFLAGS. * * Arguments: * eax system call number @@ -35,10 +44,6 @@ * edi arg5 * ebp user stack * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ @@ -131,17 +136,38 @@ GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* - * 32-bit SYSCALL instruction entry. + * 32-bit SYSCALL entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. 
+ * + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. + * + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. * - * Note: rflags saving+masking-with-MSR happens only in Long mode + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). * @@ -241,7 +267,21 @@ sysret32_from_system_call: END(entry_SYSCALL_compat) /* - * Emulated IA32 system calls via int 0x80. + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * * Arguments: * eax system call number * ebx arg1 * ecx arg2 * edx arg3 * esi arg4 * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. + * ebp arg6 */ - ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. 
-- cgit v1.2.1 From a798f091113ef4999277dbe0483d37d04fa35b2e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:32 -0800 Subject: x86/entry/32: Change INT80 to be an interrupt gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want all of the syscall entries to run with interrupts off so that we can efficiently run context tracking before enabling interrupts. This will regress int $0x80 performance on 32-bit kernels by a couple of cycles. This shouldn't matter much -- int $0x80 is not a fast path. This effectively reverts: 657c1eea0019 ("x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on") ... and fixes the same issue differently. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59b4f90c9ebfccd8c937305dbbbca680bc74b905.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 15 +++------------ arch/x86/entry/entry_32.S | 8 ++++---- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/kernel/traps.c | 2 +- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 209ba334e90f..d69d1b6e6c31 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -371,14 +371,7 @@ __visible void do_syscall_64(struct pt_regs *regs) * in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -#ifdef CONFIG_X86_32 -/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ -__visible -#else -/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ -static -#endif -__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -413,14 +406,12 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -#ifdef CONFIG_X86_64 -/* Handles INT80 on 64-bit kernels */ -__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) { local_irq_enable(); do_syscall_32_irqs_on(regs); } -#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 286efa342091..10868aa734dc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -463,13 +463,13 @@ ENTRY(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on. Unlike the 64-bit - * case, INT80 is a trap gate on 32-bit kernels, so interrupts - * are already on (unless user code is messing around with iopl). + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. 
*/ + TRACE_IRQS_OFF movl %esp, %eax - call do_syscall_32_irqs_on + call do_int80_syscall_32 .Lsyscall_32_done: restore_all: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index efe0fe3f76bc..847f2f0c31e5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,7 +336,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_syscall_32_irqs_off + call do_int80_syscall_32 .Lsyscall_32_done: /* Go back to user mode. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 49e2e775f507..5fae2d840e64 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -912,7 +912,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif -- cgit v1.2.1 From 9999c8c01f34c918a57d6e5ba2f5d8b79aa04801 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:33 -0800 Subject: x86/entry: Call enter_from_user_mode() with IRQs off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that slow-path syscalls always enter C before enabling interrupts, it's straightforward to call enter_from_user_mode() before enabling interrupts rather than doing it as part of entry tracing. With this change, we should finally be able to retire exception_enter(). This will also enable optimizations based on knowing that we never change context tracking state with interrupts on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc376ecf87921a495e874ff98139b1ca2f5c5dd7.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 33 +++++++++++---------------------- arch/x86/include/asm/thread_info.h | 5 ++++- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d69d1b6e6c31..e79d93d44ecd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,8 @@ __visible void enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit(); } +#else +static inline void enter_from_user_mode(void) {} #endif static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) @@ -85,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; -#ifdef CONFIG_CONTEXT_TRACKING - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - enter_from_user_mode(); - work &= ~_TIF_NOHZ; - } -#endif - #ifdef CONFIG_SECCOMP /* * Do seccomp first -- it should minimize exposure of other @@ -344,6 +335,7 @@ __visible void do_syscall_64(struct pt_regs *regs) struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long nr = regs->orig_ax; + enter_from_user_mode(); local_irq_enable(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) @@ -366,9 +358,9 @@ __visible void do_syscall_64(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. 
This function is extremely hot - * in workloads that use it, and it's usually called from + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) @@ -409,6 +401,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible void do_int80_syscall_32(struct pt_regs *regs) { + enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); } @@ -431,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - /* - * Fetch EBP from where the vDSO stashed it. - * - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! - */ + enter_from_user_mode(); + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* @@ -454,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -#ifdef CONFIG_CONTEXT_TRACKING - enter_from_user_mode(); -#endif prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f2e2302c406f..82866697fcf1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -134,7 +134,10 @@ struct thread_info { #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -/* work to do in syscall_trace_enter() */ +/* + * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for + * enter_from_user_mode() + */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ -- cgit v1.2.1 From 52b2a05fa7c8cfceebb59117a95decd68cf7e465 Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Thu, 3 Mar 2016 21:56:52 +0700 Subject: genirq: Export IRQ functions for module use Export irq_chip_*_parent(), irq_domain_create_hierarchy(), irq_domain_set_hwirq_and_chip(), irq_domain_reset_irq_data(), irq_domain_alloc/free_irqs_parent() So gpio drivers can be built as modules. 
First user: gpio-xgene-sb Signed-off-by: Quan Nguyen Acked-by: Linus Walleij Cc: Phong Vo Cc: Marc Zyngier Cc: patches@apm.com Cc: Loc Ho Cc: Keyur Chudgar Cc: Jiang Liu Link: https://lists.01.org/pipermail/kbuild-all/2016-February/017914.html Link: http://lkml.kernel.org/r/1457017012-10628-1-git-send-email-qnguyen@apm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++++ kernel/irq/irqdomain.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909f4e5b..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 86811541f073..3a519a01118b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -893,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1043,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1076,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1273,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1290,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate -- cgit v1.2.1 From cfe199afefe6201e998ddc07102fc1fdb55f196c Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 9 Mar 2016 03:21:29 +0200 Subject: irqchip/sunxi-nmi: Fix error check of of_io_request_and_map() of_io_request_and_map() returns either a valid pointer into an iomem region or an ERR_PTR(), so a check for NULL always fails and may cause a NULL pointer dereference on the error path.
Fixes: 0e841b04c829 ("irqchip/sunxi-nmi: Switch to of_io_request_and_map() from of_iomap()") Signed-off-by: Vladimir Zapolskiy Cc: Jason Cooper Cc: Marc Zyngier Cc: Chen-Yu Tsai Cc: Maxime Ripard Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1457486489-10189-1-git-send-email-vz@mleia.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-sunxi-nmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 0820f67cc9a7..668730c5cb66 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -160,9 +160,9 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc = irq_get_domain_generic_chip(domain, 0); gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node)); - if (!gc->reg_base) { + if (IS_ERR(gc->reg_base)) { pr_err("unable to map resource\n"); - ret = -ENOMEM; + ret = PTR_ERR(gc->reg_base); goto fail_irqd_remove; } -- cgit v1.2.1 From edf8fcdc6b254236be005851af35ea5e826e7e09 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 9 Mar 2016 03:21:40 +0200 Subject: irqchip/mxs: Fix error check of of_io_request_and_map() of_io_request_and_map() returns either a valid pointer into an iomem region or an ERR_PTR(), so a check for NULL always fails and may cause a NULL pointer dereference on the error path. Fixes: 25e34b44313b ("irqchip/mxs: Prepare driver for hardware with different offsets") Signed-off-by: Vladimir Zapolskiy Cc: Jason Cooper Cc: Marc Zyngier Cc: Oleksij Rempel Cc: Sascha Hauer Cc: Shawn Guo Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1457486500-10237-1-git-send-email-vz@mleia.com Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-mxs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c index efe50845939d..17304705f2cf 100644 --- a/drivers/irqchip/irq-mxs.c +++ b/drivers/irqchip/irq-mxs.c @@ -183,7 +183,7 @@ static void __iomem * __init icoll_init_iobase(struct device_node *np) void __iomem *icoll_base; icoll_base = of_io_request_and_map(np, 0, np->name); - if (!icoll_base) + if (IS_ERR(icoll_base)) panic("%s: unable to map resource", np->full_name); return icoll_base; } -- cgit v1.2.1 From 10ee73865e9e4705ba8b3f4d6149e8e68d902bb7 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Thu, 10 Mar 2016 20:19:58 +0800 Subject: x86/entry/traps: Show unhandled signal for i386 in do_trap() Commit abd4f7505baf ("x86: i386-show-unhandled-signals-v3") turned on the showing-unhandled-signal behaviour for i386 for some exception handlers, but do_trap() was left out for no apparent reason (my naive guess is that turning it on for do_trap() would be too noisy, since do_trap() is shared by several exceptions). The same commit also made "show_unhandled_signals" a debug tunable (in /proc/sys/debug/exception-trace), which x86 turns on by default. So it would be strange for i386 users who turn it on manually and expect to see the unhandled signal output in the log, but get nothing. This patch turns it on for i386 in do_trap() as well. Signed-off-by: Jianyu Zhan Reviewed-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Cc: dave.hansen@linux.intel.com Cc: heukelum@fastmail.fm Cc: jbeulich@novell.com Cc: jdike@addtoit.com Cc: joe@perches.com Cc: luto@kernel.org Link: http://lkml.kernel.org/r/1457612398-4568-1-git-send-email-nasa4836@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5fae2d840e64..1baf08187ddf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -248,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -257,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } -- cgit v1.2.1 From 2a58c527bb566b7abad96289fa5b973040c33c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2016 20:42:08 +0100 Subject: cpu/hotplug: Fix smpboot thread ordering Commit 931ef163309e moved the smpboot thread park/unpark invocation to the state machine. The move of the unpark invocation was premature as it depends on work in progress patches. As a result cpu down can fail, because rcu synchronization in takedown_cpu() eventually requires a functional softirq thread. I never encountered the problem in testing, but 0day testing managed to provide a reliable reproducer. Remove the smpboot_threads_park() call from the state machine for now and put it back into the original place after the rcu synchronization. I'm embarrassed as I knew about the dependency and still managed to get it wrong. Hotplug induced brain melt seems to be the only sensible explanation for that. Fixes: 931ef163309e "cpu/hotplug: Unpark smpboot threads from the state machine" Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 373e831e0faa..bcee286b7c73 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -706,8 +706,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - /* Park the hotplug thread */ + /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1206,7 +1207,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, + .teardown = NULL, }, [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", -- cgit v1.2.1 From 8b8addf891de8a00e4d39fc32f93f7c5eb8feceb Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Thu, 10 Mar 2016 20:51:00 +0100 Subject: x86/mm/32: Enable full randomization on i386 and X86_32 Currently on i386 and on X86_64 when emulating X86_32 in legacy mode, only the stack and the executable are randomized but not other mmapped files (libraries, vDSO, etc.). This patch enables randomization for the libraries, vDSO and mmap requests on i386 and in X86_32 in legacy mode. By default on i386 there are 8 bits for the randomization of the libraries, vDSO and mmaps which only uses 1MB of VA. 
This patch preserves the original randomness, using 1MB of VA out of 3GB or 4GB. We think that 1MB out of 3GB is not a big cost for having ASLR. The first obvious security benefit is that all objects are randomized (not only the stack and the executable) in legacy mode, which greatly increases the ASLR effectiveness; otherwise attackers may use these non-randomized areas. Sensitive setuid/setgid applications also become more secure, because currently attackers can disable their randomization by setting the stack ulimit to "unlimited". This is a very old and widely known trick to disable ASLR on i386 which has been allowed for too long. Another trick used to disable ASLR was to set the ADDR_NO_RANDOMIZE personality flag, but fortunately this doesn't work on setuid/setgid applications because there are security checks which clear security-relevant flags. This patch always randomizes the mmap_legacy_base address, removing the possibility of disabling ASLR by setting the stack to "unlimited". Signed-off-by: Hector Marco-Gisbert Acked-by: Ismael Ripoll Ripoll Acked-by: Kees Cook Acked-by: Arjan van de Ven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kees Cook Link: http://lkml.kernel.org/r/1457639460-5242-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2bffaf..389939f74dd5 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -93,18 +93,6 @@ static unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } -/* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; -- cgit v1.2.1 From 143d36a33b4d59a56bb8e913a17a105578fd3237 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 11 Mar 2016 11:14:43 +0300 Subject: irqchip/irq-alpine-msi: Release the correct domain on error The "msi_domain" variable is NULL here, so it leads to a NULL dereference. It looks like we actually intended to free "middle_domain".
Fixes: e6b78f2c3e14 ('irqchip: Add the Alpine MSIX interrupt controller') Signed-off-by: Dan Carpenter Cc: Jason Cooper Cc: Marc Zyngier Cc: Antoine Tenart Cc: kernel-janitors@vger.kernel.org Cc: Tsahee Zidenberg Link: http://lkml.kernel.org/r/20160311081442.GE31887@mwanda Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-alpine-msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c index f8712722a606..25384255b30f 100644 --- a/drivers/irqchip/irq-alpine-msi.c +++ b/drivers/irqchip/irq-alpine-msi.c @@ -220,7 +220,7 @@ static int alpine_msix_init_domains(struct alpine_msix_data *priv, middle_domain); if (!msi_domain) { pr_err("Failed to create MSI domain\n"); - irq_domain_remove(msi_domain); + irq_domain_remove(middle_domain); return -ENOMEM; } -- cgit v1.2.1 From d05004944206cbbf1c453e179768163731c7c6f1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 10 Mar 2016 19:38:18 -0800 Subject: x86/cpufeature: Enable new AVX-512 features A few new AVX-512 instruction groups/features are added in cpufeatures.h for enumeration: AVX512DQ, AVX512BW, and AVX512VL. Clear the flags in fpu__xstate_clear_all_cpu_caps(). The specification for the latest AVX-512, including these features, can be found at: https://software.intel.com/sites/default/files/managed/07/b7/319433-023.pdf Note, I didn't enable the flags in KVM. Hopefully the KVM guys can pick up the flags and enable them in KVM. Signed-off-by: Fenghua Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: Denys Vlasenko Cc: Gleb Natapov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V Shankar Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1457667498-37357-1-git-send-email-fenghua.yu@intel.com [ Added more detailed feature descriptions.
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d11a3aaafd96..9e0567f4c081 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -220,6 +220,7 @@ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ @@ -230,6 +231,8 @@ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5ae6d..6e8354f5a593 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); } -- cgit v1.2.1 From d10ef6f9380b8853c4b48eb104268fccfdc0b0c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2016 10:36:13 +0100 Subject: cpu/hotplug: Document states better Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index bcee286b7c73..6ea42e8da861 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1172,6 +1172,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, @@ -1179,12 +1183,17 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, .cant_stop = true, }, + /* + * Handled on the control processor until the plugged processor manages + * this itself.
+ */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, .teardown = takedown_cpu, .cant_stop = true, }, #endif }; /* Application processor state steps */ static struct cpuhp_step cpuhp_ap_states[] = { #ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. + */ [CPUHP_AP_NOTIFY_STARTING] = { .name = "notify:starting", .startup = notify_starting, @@ -1204,17 +1230,32 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, .teardown = NULL, }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, }, #endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup = NULL, @@ -1232,7 +1273,11 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return state > CPUHP_BRINGUP_CPU; + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -- cgit v1.2.1
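
Editor's note: two of the fixes above (irqchip/sunxi-nmi and irqchip/mxs) correct the same bug pattern around of_io_request_and_map(), which on failure returns an ERR_PTR()-encoded errno rather than NULL, so a NULL check can never catch the error. The minimal sketch below illustrates the correct idiom; it is not taken from any patch in this series, and the helper and variable names are invented for the example.

#include <linux/err.h>
#include <linux/io.h>
#include <linux/of_address.h>

/*
 * Hypothetical probe helper: of_io_request_and_map() never returns NULL
 * on failure, it returns an ERR_PTR(), so test the result with IS_ERR()
 * and propagate the encoded errno with PTR_ERR().
 */
static int example_map_regs(struct device_node *np, void __iomem **base)
{
	*base = of_io_request_and_map(np, 0, of_node_full_name(np));
	if (IS_ERR(*base))
		return PTR_ERR(*base);	/* keep the real cause, not a blanket -ENOMEM */

	return 0;
}

As both fixes demonstrate, propagating PTR_ERR() also preserves the underlying failure reason (for instance -EBUSY from the resource request versus -ENOMEM) instead of overwriting it with a generic error code.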