From d038b12c6d773a4b9f69ca5243773bf6314f7ee9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 25 Jan 2011 17:32:01 +0200 Subject: perf: Fix Pentium4 raw event validation This patch fixes some issues with raw event validation on Pentium 4 (Netburst) based processors. As I was testing libpfm4 Netburst support, I ran into two problems in the p4_validate_raw_event() function: - the shared field must be checked ONLY when HT is on - the binding to ESCR register was missing The second item was causing raw events to not be encoded correctly compared to generic PMU events. With this patch, I can now pass Netburst events to libpfm4 examples and get meaningful results: $ task -e global_power_events:running:u noploop 1 noploop for 1 seconds 3,206,304,898 global_power_events:running Signed-off-by: Stephane Eranian Acked-by: Cyrill Gorcunov Cc: peterz@infradead.org Cc: paulus@samba.org Cc: davem@davemloft.net Cc: fweisbec@gmail.com Cc: perfmon2-devel@lists.sf.net Cc: eranian@gmail.com Cc: robert.richter@amd.com Cc: acme@redhat.com Cc: gorcunov@gmail.com Cc: ming.m.lin@intel.com LKML-Reference: <4d3efb2f.1252d80a.1a80.ffffc83f@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index e56b9bfbabd1..f7a0993c1e7c 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -682,7 +682,7 @@ static int p4_validate_raw_event(struct perf_event *event) * if an event is shared accross the logical threads * the user needs special permissions to be able to use it */ - if (p4_event_bind_map[v].shared) { + if (p4_ht_active() && p4_event_bind_map[v].shared) { if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return -EACCES; } @@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config = p4_set_ht_bit(event->hw.config); if (event->attr.type == PERF_TYPE_RAW) { - + struct p4_event_bind *bind; + unsigned int esel; /* * Clear bits we reserve to be managed by kernel itself * and never allowed from a user space @@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event) * bits since we keep additional info here (for cache events and etc) */ event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); } rc = x86_setup_perfctr(event); -- cgit v1.2.1 From f12d3d04e8f6223276abb068c5d72852174b8c31 Mon Sep 17 00:00:00 2001 From: Matthieu CASTET Date: Thu, 20 Jan 2011 21:11:45 +0100 Subject: x86, nx: Don't force pages RW when setting NX bits Xen want page table pages read only. But the initial page table (from head_*.S) live in .data or .bss. That was broken by 64edc8ed5ffae999d8d413ba006850e9e34166cb. There is absolutely no reason to force these pages RW after they have already been marked RO. Signed-off-by: Matthieu CASTET Tested-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8b830ca14ac4..d343b3c81f3c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); - pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for @@ -282,12 +281,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; - /* - * .data and .bss should always be writable. - */ - if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || - within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) - pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -327,7 +320,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); - prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } -- cgit v1.2.1 From f7448548a9f32db38f243ccd4271617758ddfe2c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 2 Feb 2011 17:02:55 -0800 Subject: x86, mtrr: Avoid MTRR reprogramming on BP during boot on UP platforms Markus Kohn ran into a hard hang regression on an acer aspire 1310, when acpi is enabled. git bisect showed the following commit as the bad one that introduced the boot regression. commit d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Author: Suresh Siddha Date: Wed Aug 19 18:05:36 2009 -0700 x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init Because of the UP configuration of that platform, native_smp_prepare_cpus() bailed out (in smp_sanity_check()) before doing the set_mtrr_aps_delayed_init() Further down the boot path, native_smp_cpus_done() will call the delayed MTRR initialization for the AP's (mtrr_aps_init()) with mtrr_aps_delayed_init not set. This resulted in the boot processor reprogramming its MTRR's to the values seen during the start of the OS boot. While this is not needed ideally, this shouldn't have caused any side-effects. This is because the reprogramming of MTRR's (set_mtrr_state() that gets called via set_mtrr()) will check if the live register contents are different from what is being asked to write and will do the actual write only if they are different. BP's mtrr state is read during the start of the OS boot and typically nothing would have changed when we ask to reprogram it on BP again because of the above scenario on an UP platform. So on a normal UP platform no reprogramming of BP MTRR MSR's happens and all is well. However, on this platform, bios seems to be modifying the fixed mtrr range registers between the start of OS boot and when we double check the live registers for reprogramming BP MTRR registers. And as the live registers are modified, we end up reprogramming the MTRR's to the state seen during the start of the OS boot. During ACPI initialization, something in the bios (probably smi handler?) don't like this fact and results in a hard lockup. We didn't see this boot hang issue on this platform before the commit d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3, because only the AP's (if any) will program its MTRR's to the value that BP had at the start of the OS boot. Fix this issue by checking mtrr_aps_delayed_init before continuing further in the mtrr_aps_init(). Now, only AP's (if any) will program its MTRR's to the BP values during boot. Addresses https://bugzilla.novell.com/show_bug.cgi?id=623393 [ By the way, this behavior of the bios modifying MTRR's after the start of the OS boot is not common and the kernel is not prepared to handle this situation well. Irrespective of this issue, during suspend/resume, linux kernel will try to reprogram the BP's MTRR values to the values seen during the start of the OS boot. So suspend/resume might be already broken on this platform for all linux kernel versions. ] Reported-and-bisected-by: Markus Kohn Tested-by: Markus Kohn Signed-off-by: Suresh Siddha Cc: Thomas Renninger Cc: Rafael Wysocki Cc: Venkatesh Pallipadi Cc: stable@kernel.org # [v2.6.32+] LKML-Reference: <1296694975.4418.402.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 01c0f3ee6cc3..bebabec5b448 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -793,13 +793,21 @@ void set_mtrr_aps_delayed_init(void) } /* - * MTRR initialization for all AP's + * Delayed MTRR initialization for all AP's */ void mtrr_aps_init(void) { if (!use_intel()) return; + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + set_mtrr(~0U, 0, 0, 0); mtrr_aps_delayed_init = false; } -- cgit v1.2.1 From 831d52bc153971b70e64eccfbed2b232394f22f8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 3 Feb 2011 12:20:04 -0800 Subject: x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm Clearing the cpu in prev's mm_cpumask early will avoid the flush tlb IPI's while the cr3 is still pointing to the prev mm. And this window can lead to the possibility of bogus TLB fills resulting in strange failures. One such problematic scenario is mentioned below. T1. CPU-1 is context switching from mm1 to mm2 context and got a NMI etc between the point of clearing the cpu from the mm_cpumask(mm1) and before reloading the cr3 with the new mm2. T2. CPU-2 is tearing down a specific vma for mm1 and will proceed with flushing the TLB for mm1. It doesn't send the flush TLB to CPU-1 as it doesn't see that cpu listed in the mm_cpumask(mm1). T3. After the TLB flush is complete, CPU-2 goes ahead and frees the page-table pages associated with the removed vma mapping. T4. CPU-2 now allocates those freed page-table pages for something else. T5. As the CR3 and TLB caches for mm1 is still active on CPU-1, CPU-1 can potentially speculate and walk through the page-table caches and can insert new TLB entries. As the page-table pages are already freed and being used on CPU-2, this page walk can potentially insert a bogus global TLB entry depending on the (random) contents of the page that is being used on CPU-2. T6. This bogus TLB entry being global will be active across future CR3 changes and can result in weird memory corruption etc. To avoid this issue, for the prev mm that is handing over the cpu to another mm, clear the cpu from the mm_cpumask(prev) after the cr3 is changed. Marking it for -stable, though we haven't seen any reported failure that can be attributed to this. Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Cc: stable@kernel.org [v2.6.32+] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mmu_context.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4a2d4e0c18d9..8b5393ec1080 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); @@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* * load the LDT, if the LDT is different: */ -- cgit v1.2.1 From 11d4c3f9b671720e80353dd7e433ff2bf65e9500 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 4 Feb 2011 16:14:11 -0800 Subject: x86-32: Make sure the stack is set up before we use it Since checkin ebba638ae723d8a8fc2f7abce5ec18b688b791d7 we call verify_cpu even in 32-bit mode. Unfortunately, calling a function means using the stack, and the stack pointer was not initialized in the 32-bit setup code! This code initializes the stack pointer, and simplifies the interface slightly since it is easier to rely on just a pointer value rather than a descriptor; we need to have different values for the segment register anyway. This retains start_stack as a virtual address, even though a physical address would be more convenient for 32 bits; the 64-bit code wants the other way around... Reported-by: Matthieu Castet LKML-Reference: <4D41E86D.8060205@free.fr> Tested-by: Kees Cook Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/smp.h | 5 +---- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/head_32.S | 30 +++++++++++++----------------- arch/x86/kernel/smpboot.c | 4 ++-- 4 files changed, 17 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4c2f63c7fc1b..1f4695136776 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -40,10 +40,7 @@ DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); /* Static state in head.S used to set up a CPU */ -extern struct { - void *sp; - unsigned short ss; -} stack_start; +extern unsigned long stack_start; /* Initial stack pointer address */ struct smp_ops { void (*smp_prepare_boot_cpu)(void); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 69fd72aa5594..4d9ebbab2230 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -100,7 +100,7 @@ int acpi_save_state_mem(void) #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + sizeof(temp_stack); + stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fc293dc8dc35..767d6c43de37 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -85,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) + movl pa(stack_start),%ecx + /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ testb $(1<<6), BP_loadflags(%esi) @@ -99,7 +101,9 @@ ENTRY(startup_32) movl %eax,%es movl %eax,%fs movl %eax,%gs + movl %eax,%ss 2: + leal -__PAGE_OFFSET(%ecx),%esp /* * Clear BSS first so that there are no surprises... @@ -145,8 +149,6 @@ ENTRY(startup_32) * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end. - * - * Note that the stack is not yet set up! */ #ifdef CONFIG_X86_PAE @@ -282,6 +284,9 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs + movl pa(stack_start),%ecx + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp #endif /* CONFIG_SMP */ default_entry: @@ -347,8 +352,8 @@ default_entry: movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: - /* Set up the stack pointer */ - lss stack_start,%esp + /* Shift the stack pointer to a virtual address */ + addl $__PAGE_OFFSET, %esp /* * Initialize eflags. Some BIOS's leave bits like NT set. This would @@ -360,9 +365,7 @@ default_entry: #ifdef CONFIG_SMP cmpb $0, ready - jz 1f /* Initial CPU cleans BSS */ - jmp checkCPUtype -1: + jnz checkCPUtype #endif /* CONFIG_SMP */ /* @@ -470,14 +473,7 @@ is386: movl $2,%ecx # set MP cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder -#ifdef CONFIG_SMP - movb ready, %cl movb $1, ready - cmpb $0,%cl # the first CPU calls start_kernel - je 1f - movl (stack_start), %esp -1: -#endif /* CONFIG_SMP */ jmp *(initial_code) /* @@ -670,15 +666,15 @@ ENTRY(initial_page_table) #endif .data +.balign 4 ENTRY(stack_start) .long init_thread_union+THREAD_SIZE - .long __BOOT_DS - -ready: .byte 0 early_recursion_flag: .long 0 +ready: .byte 0 + int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0cbe8c0b35ed..03273b6c272c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -638,7 +638,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long)stack_start.sp); + stack_start); /* * Run STARTUP IPI loop. @@ -785,7 +785,7 @@ do_rest: #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start.sp = (void *) c_idle.idle->thread.sp; + stack_start = c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); -- cgit v1.2.1 From d344e38b2c151ca5e5e39f562017127e93912528 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 6 Feb 2011 21:16:09 -0800 Subject: x86, nx: Mark the ACPI resume trampoline code as +x We reserve lowmem for the things that need it, like the ACPI wakeup code, way early to guarantee availability. This happens before we set up the proper pagetables, so set_memory_x() has no effect. Until we have a better solution, use an initcall to mark the wakeup code executable. Originally-by: Matthieu Castet Signed-off-by: H. Peter Anvin Cc: Matthias Hopf Cc: rjw@sisk.pl Cc: Suresh Siddha LKML-Reference: <4D4F8019.2090104@zytor.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/sleep.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 4d9ebbab2230..68d1537b8c81 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -12,10 +12,8 @@ #include #include #include - -#ifdef CONFIG_X86_32 #include -#endif +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -149,6 +147,15 @@ void __init acpi_reserve_wakeup_memory(void) memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } +int __init acpi_configure_wakeup_memory(void) +{ + if (acpi_realmode) + set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT); + + return 0; +} +arch_initcall(acpi_configure_wakeup_memory); + static int __init acpi_sleep_setup(char *str) { -- cgit v1.2.1