From 7f1baa063e2582dd52d83bb31508e9e84468c666 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 24 Oct 2008 15:24:29 -0700 Subject: x86/uv: provide a System Activity Indicator driver Impact: start per CPU heartbeat LED timers on SGI UV systems The SGI UV system has no LEDs but uses one of the system controller regs to indicate the online internal state of the cpu. There is a heartbeat bit indicating that the cpu is responding to interrupts, and an idle bit indicating whether the cpu is idle when the heartbeat interrupt occurs. The current period is one second. When a cpu panics, an error code is written by BIOS to this same reg. This patchset provides the following: * x86_64: Add base functionality for writing to the specific SCIRs for each cpu. * heartbeat: Invert the "heartbeat" bit to indicate the cpu is "interruptible". If the current thread is the idle thread, then indicate the system is "idle". * If hotplug is enabled, all bits are set (0xff) when the cpu is disabled. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 63 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index c6ad93e315c8..400776dba9b5 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -112,6 +112,16 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +struct uv_scir_s { + struct timer_list timer; + unsigned long offset; + unsigned long last; + unsigned long idle_on; + unsigned long idle_off; + unsigned char state; + unsigned char enabled; +}; + /* * The following defines attributes of the HUB chip. These attributes are * frequently referenced and are kept in the per-cpu data areas of each cpu. @@ -130,7 +140,9 @@ struct uv_hub_info_s { unsigned char blade_processor_id; unsigned char m_val; unsigned char n_val; + struct uv_scir_s scir; }; + DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) @@ -162,6 +174,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_APIC_PNODE_SHIFT 6 +/* Local Bus from cpu's perspective */ +#define LOCAL_BUS_BASE 0x1c00000 +#define LOCAL_BUS_SIZE (4 * 1024 * 1024) + +/* + * System Controller Interface Reg + * + * Note there are NO leds on a UV system. This register is only + * used by the system controller to monitor system-wide operation. + * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on + * a node. + * + * The window is located at top of ACPI MMR space + */ +#define SCIR_WINDOW_COUNT 64 +#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \ + LOCAL_BUS_SIZE - \ + SCIR_WINDOW_COUNT) + +#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */ +#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ +#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. 
@@ -276,6 +312,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) *uv_local_mmr_address(offset) = val; } +static inline unsigned char uv_read_local_mmr8(unsigned long offset) +{ + return *((unsigned char *)uv_local_mmr_address(offset)); +} + +static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) +{ + *((unsigned char *)uv_local_mmr_address(offset)) = val; +} + /* * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. @@ -350,5 +396,20 @@ static inline int uv_num_possible_blades(void) return uv_possible_blades; } -#endif /* _ASM_X86_UV_UV_HUB_H */ +/* Update SCIR state */ +static inline void uv_set_scir_bits(unsigned char value) +{ + if (uv_hub_info->scir.state != value) { + uv_hub_info->scir.state = value; + uv_write_local_mmr8(uv_hub_info->scir.offset, value); + } +} +static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) +{ + if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_cpu_hub_info(cpu)->scir.state = value; + uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); + } +} +#endif /* _ASM_X86_UV_UV_HUB_H */ -- cgit v1.2.3 From 6784f7d0a5016a397d38be1134e63fc784c1ca8e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 5 Oct 2008 11:33:42 -0700 Subject: x86: corruption check: move the corruption checks into their own file Impact: cleanup The corruption check code is rather sizable and it's likely to grow over time when we add checks for more types of corruption (there are a few candidates on kerneloops.org that I want to add checks for)... so let's move it to its own file Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/include/asm/setup.h | 4 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/check.c | 158 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 152 ----------------------------------------- 4 files changed, 163 insertions(+), 152 deletions(-) create mode 100644 arch/x86/kernel/check.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d37237465..1ed8b2e80727 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -8,6 +8,10 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); + +void setup_bios_corruption_check(void); + + #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); extern int is_visws_box(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d7e5a58ee22f..31fbcaf3df70 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,7 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-y += tsc.o io_delay.o rtc.o +obj-y += check.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 000000000000..5056703e1b05 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,158 @@ +#include +#include + +#include +#include + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. 
+ */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + +static struct timer_list periodic_check_timer; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, + round_jiffies(jiffies + corruption_check_period*HZ)); +} + +void start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} +#endif + diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4f38e0305b07..af690aa593a9 100644 --- 
a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,158 +587,6 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) -{ - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) -{ - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; - - return (size == corruption_check_size) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 -#else - 0 -#endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for (i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for (; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, - round_jiffies(jiffies + corruption_check_period*HZ)); -} - -void start_periodic_check_for_corruption(void) -{ - if 
(!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); - - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif - static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE -- cgit v1.2.3 From 69a72a0e9337aad8c730e8e9942d5aa022bc4c5c Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 27 Oct 2008 07:51:20 -0700 Subject: x86/uv: update SCIR driver to use the idle_cpu() function Impact: cleanup Change the UV heartbeat function to use idle_cpu() to determine the cpu's "idleness". Realign uv_hub definitions. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 26 +++++++++++++------------- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 400776dba9b5..0ee12928e9ee 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -128,19 +128,19 @@ struct uv_scir_s { * They are kept together in a struct to minimize cache misses. */ struct uv_hub_info_s { - unsigned long global_mmr_base; - unsigned long gpa_mask; - unsigned long gnode_upper; - unsigned long lowmem_remap_top; - unsigned long lowmem_remap_base; - unsigned short pnode; - unsigned short pnode_mask; - unsigned short coherency_domain_number; - unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; + struct uv_scir_s scir; }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 84367d84bb10..85fb7dd48f67 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -370,8 +370,8 @@ static void uv_heartbeat(unsigned long ignored) /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; - /* are we the idle thread? */ - if (current->pid == 0) + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) bits &= ~SCIR_CPU_ACTIVITY; else bits |= SCIR_CPU_ACTIVITY; -- cgit v1.2.3 From d4f1b10365d4f03dd802433e0014cf503e6e930c Mon Sep 17 00:00:00 2001 From: Jike Song Date: Fri, 17 Oct 2008 13:25:07 +0800 Subject: x86: clean up comments wrt. rd{msr|tsc|pmc} The rdmsr instruction (et al.) is semantically the same for i386 and x86-64. The only difference is how gcc interprets constraint "A" for these targets. Signed-off-by: Jike Song Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 46be2fa7ac26..478a9245aae1 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) } /* - * i386 calling convention returns 64-bit value in edx:eax, while - * x86_64 returns at rax. 
Also, the "A" constraint does not really - * mean rdx:rax in x86_64, so we need specialized behaviour for each - * architecture + * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" + * constraint has different meanings. For i386, "A" means exactly + * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, + * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 #define DECLARE_ARGS(val, low, high) unsigned low, high -- cgit v1.2.3 From 17666f02b118099028522dfc3df00a235700e216 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:32 -0400 Subject: ftrace: nmi safe code modification Impact: fix crashes that can occur in NMI handlers if their code is modified Modifying code is something that needs special care. On SMP boxes, if code that is being modified is also being executed on another CPU, that CPU will have undefined results. Dynamic ftrace uses kstop_machine to make the system act like a uniprocessor system. But this does not address NMIs, which can still run on other CPUs. One approach would be to exclude from tracing all code used by NMIs. But NMIs can call notifiers that spread throughout the kernel; keeping all of that untraced would be very hard to maintain, and the chance of missing a function is very high. The approach this patch takes is to have the NMIs themselves perform the modification while one is taking place. This works because writing to code executing on another CPU is not harmful if what is written is the same as what already exists. Two buffers are used: an IP buffer and a "code" buffer. The steps that the patcher takes are: 1) Put the instruction pointer into the IP buffer and the new code into the "code" buffer. 2) Set a flag that says we are modifying code. 3) Wait for any running NMIs to finish. 4) Write the code. 5) Clear the flag. 6) Wait for any running NMIs to finish. If an NMI is executed, it will also write the pending code. Multiple writes are OK, because what is being written is the same. Then the patcher must wait for all running NMIs to finish before going on to the next location that must be patched. This is basically the RCU approach to code modification. Thanks to Ingo Molnar for suggesting the idea, and to Arjan van de Ven for his guidance on what is safe and what is not. 
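The handshake described in this changelog can be modeled in freestanding C11. The sketch below is illustrative only, not the kernel code (the real implementation, in the diff that follows, uses smp_mb()/smp_wmb() and probe_kernel_write() rather than sequentially consistent atomics and memcpy); the concurrent identical writes are exactly the trick the changelog relies on:

#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

static atomic_int in_nmi;		/* number of NMIs currently executing */
static atomic_int mod_code_write;	/* set while a modification is pending */
static void *mod_code_ip;		/* destination of the write */
static const void *mod_code_newcode;	/* bytes to be written */
static size_t mod_code_len;

static void mod_code(void)
{
	/* All racing writers store identical bytes, so duplicates are harmless. */
	memcpy(mod_code_ip, mod_code_newcode, mod_code_len);
}

void nmi_enter_hook(void)		/* first thing every NMI runs */
{
	atomic_fetch_add(&in_nmi, 1);
	if (atomic_load(&mod_code_write))
		mod_code();		/* help the patcher finish its write */
}

void nmi_exit_hook(void)		/* last thing every NMI runs */
{
	atomic_fetch_sub(&in_nmi, 1);
}

static void wait_for_nmi(void)
{
	while (atomic_load(&in_nmi))
		;			/* cpu_relax() in the kernel */
}

void modify_code(void *ip, const void *newcode, size_t len)
{
	mod_code_ip = ip;			/* 1) publish the IP and code buffers */
	mod_code_newcode = newcode;
	mod_code_len = len;
	atomic_store(&mod_code_write, 1);	/* 2) raise the "modifying" flag */
	wait_for_nmi();				/* 3) drain NMIs already in flight */
	mod_code();				/* 4) write the code */
	atomic_store(&mod_code_write, 0);	/* 5) clear the flag */
	wait_for_nmi();				/* 6) drain again before the next site */
}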
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ftrace.h | 5 ++ arch/powerpc/include/asm/ftrace.h | 5 ++ arch/sh/include/asm/ftrace.h | 5 ++ arch/sparc/include/asm/ftrace.h | 5 ++ arch/x86/include/asm/ftrace.h | 15 ++++++ arch/x86/kernel/ftrace.c | 107 +++++++++++++++++++++++++++++++++++++- include/linux/hardirq.h | 15 +++++- 7 files changed, 154 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 39c8bc1a006a..d4c24a7a9280 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index b298f7a631e6..7652755dc000 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 3aed362c9463..cdf2cb0b9ffe 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef __ASM_SH_FTRACE_H #define __ASM_SH_FTRACE_H +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifndef __ASSEMBLY__ extern void mcount(void); #endif diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index d27716cd38c1..33a95feeb137 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -1,6 +1,11 @@ #ifndef _ASM_SPARC64_FTRACE #define _ASM_SPARC64_FTRACE +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif + #ifdef CONFIG_MCOUNT #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29b8b17..f2ed6b704a75 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); +#else +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) +#endif +#endif + +#else /* CONFIG_FUNCTION_TRACER */ + +#ifndef __ASSEMBLY__ +#define ftrace_nmi_enter() do { } while (0) +#define ftrace_nmi_exit() do { } while (0) #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9bf..fe5f859130b5 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -56,6 +56,111 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } +/* + * Modifying code must take extra care. 
On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put in the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi; +static int mod_code_status; +static int mod_code_write; +static void *mod_code_ip; +static void *mod_code_newcode; + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); + +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) + ftrace_mod_code(); +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + while (atomic_read(&in_nmi)) + cpu_relax(); +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -81,7 +186,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006cc94a0..0087cb43becf 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -5,6 +5,7 @@ #include #include #include +#include #include /* @@ -161,7 +162,17 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() \ + do { 
\ + ftrace_nmi_enter(); \ + lockdep_off(); \ + __irq_enter(); \ + } while (0) +#define nmi_exit() \ + do { \ + __irq_exit(); \ + lockdep_on(); \ + ftrace_nmi_exit(); \ + } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v1.2.3 From 92be3d6bdf2cb34972ab50e12ad4da1076e690da Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:08 +0800 Subject: kexec/i386: allocate page table pages dynamically Impact: save .text size when kexec is built in but not loaded This patch adds an architecture-specific struct kimage_arch to struct kimage. The pointers to page table pages used by kexec are added to struct kimage_arch. The page-table pages are dynamically allocated in machine_kexec_prepare instead of being statically allocated in the BSS segment. This will save up to 20k of memory when the kexec image is not loaded. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 14 ++++++ arch/x86/kernel/machine_kexec_32.c | 67 ++++++++++++++++++++++++++------------ include/linux/kexec.h | 4 +++ 3 files changed, 64 insertions(+), 21 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a1f22771a15a..df9c41a9c6ae 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -170,6 +170,20 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif +#ifdef CONFIG_X86_32 +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + pgd_t *pgd; +#ifdef CONFIG_X86_PAE + pmd_t *pmd0; + pmd_t *pmd1; +#endif + pte_t *pte0; + pte_t *pte1; +}; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509a..1100312847a5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -25,15 +26,6 @@ #include #include -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,37 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,13 +110,14 @@ static void load_segments(void) * reboot code buffer to allow us 
to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables */ int machine_kexec_prepare(struct kimage *image) { if (nx_enabled) set_pages_x(image->control_code_page, 1); - return 0; + return machine_kexec_alloc_page_tables(image); } /* @@ -104,6 +128,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +175,18 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PGD] = __pa(image->arch.pgd); + page_list[VA_PGD] = (unsigned long)image->arch.pgd; #ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PMD_0] = __pa(image->arch.pmd0); + page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0; + page_list[PA_PMD_1] = __pa(image->arch.pmd1); + page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1; #endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PTE_0] = __pa(image->arch.pte0); + page_list[VA_PTE_0] = (unsigned long)image->arch.pte0; + page_list[PA_PTE_1] = __pa(image->arch.pte1); + page_list[VA_PTE_1] = (unsigned long)image->arch.pte1; if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 17f76fc05173..adc34f2c6eff 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,6 +100,10 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + +#ifdef ARCH_HAS_KIMAGE_ARCH + struct kimage_arch arch; +#endif }; -- cgit v1.2.3 From 9868ee63b896ee4d2ceb8c292e88d7f4e66caaf9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Oct 2008 09:48:15 +0800 Subject: kexec/i386: setup kexec page table in C Impact: change the kexec bootstrap code implementation from assembly to C This patch transforms the kexec page-table setup code from assembly to C code in machine_kexec_prepare. This improves readability and reduces the line count. 
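For reference, the index arithmetic that the new C code delegates to pgd_index() and pte_offset_kernel() — and that the deleted relocate_kernel_32.S performed by hand with andl/shrl pairs (whose smaller shift counts also fold in the 4-byte entry size) — looks like this for the non-PAE, two-level i386 layout. This is a stand-alone user-space sketch; the sample address is made up:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PGDIR_SHIFT	22		/* non-PAE i386: each pgd entry maps 4MB */
#define PTRS_PER_PTE	1024

/* Which of the 1024 pgd slots covers this virtual address? */
static unsigned pgd_index(uint32_t vaddr)
{
	return vaddr >> PGDIR_SHIFT;
}

/* Which of the 1024 pte slots inside that page table? */
static unsigned pte_index(uint32_t vaddr)
{
	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
	uint32_t control_page = 0xc1234000;	/* hypothetical kernel vaddr */

	printf("vaddr %#x -> pgd slot %u, pte slot %u\n",
	       (unsigned)control_page, pgd_index(control_page),
	       pte_index(control_page));
	return 0;
}

The C version then installs two mappings through machine_kexec_page_table_set_one(), as the diff below shows: one at the control page's kernel virtual address and one identity mapping at its physical address, so execution can continue across the switch to the new page tables.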
Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 17 +----- arch/x86/kernel/machine_kexec_32.c | 59 ++++++++++++++---- arch/x86/kernel/relocate_kernel_32.S | 114 ----------------------------------- 3 files changed, 49 insertions(+), 141 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index df9c41a9c6ae..c61d8b2ab8b9 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,21 +5,8 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 -# define VA_PGD 3 -# define PA_PTE_0 4 -# define VA_PTE_0 5 -# define PA_PTE_1 6 -# define VA_PTE_1 7 -# define PA_SWAP_PAGE 8 -# ifdef CONFIG_X86_PAE -# define PA_PMD_0 9 -# define VA_PMD_0 10 -# define PA_PMD_1 11 -# define VA_PMD_1 12 -# define PAGES_NR 13 -# else -# define PAGES_NR 9 -# endif +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1100312847a5..37f420018a41 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -99,6 +99,45 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) return 0; } +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = 0; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -112,12 +151,19 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) * * - Make control page executable. 
* - Allocate page tables + * - Setup page tables */ int machine_kexec_prepare(struct kimage *image) { + int error; + if (nx_enabled) set_pages_x(image->control_code_page, 1); - return machine_kexec_alloc_page_tables(image); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); + return 0; } /* @@ -176,17 +222,6 @@ void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(image->arch.pgd); - page_list[VA_PGD] = (unsigned long)image->arch.pgd; -#ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(image->arch.pmd0); - page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0; - page_list[PA_PMD_1] = __pa(image->arch.pmd1); - page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1; -#endif - page_list[PA_PTE_0] = __pa(image->arch.pte0); - page_list[VA_PTE_0] = (unsigned long)image->arch.pte0; - page_list[PA_PTE_1] = __pa(image->arch.pte1); - page_list[VA_PTE_1] = (unsigned long)image->arch.pte1; if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 377da3f78e8c..a160f3119725 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -10,15 +10,12 @@ #include #include #include -#include /* * Must be relocatable PIC code callable as a C function */ #define PTR(x) (x << 2) -#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define PAE_PGD_ATTR (_PAGE_PRESENT) /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for @@ -59,117 +56,6 @@ relocate_kernel: movl %cr4, %eax movl %eax, CR4(%edi) -#ifdef CONFIG_X86_PAE - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_0)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_1)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#else - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl 
PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#endif - -relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 20+4(%esp), %ebx /* page_list */ movl 20+8(%esp), %ebp /* list of pages */ -- cgit v1.2.3 From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 00:03:22 -0400 Subject: ftrace: nmi safe code clean ups Impact: cleanup This patch cleans up the NMI safe code for dynamic ftrace as suggested by Andrew Morton. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ftrace.h | 4 ++-- arch/powerpc/include/asm/ftrace.h | 4 ++-- arch/sh/include/asm/ftrace.h | 4 ++-- arch/sparc/include/asm/ftrace.h | 4 ++-- arch/x86/include/asm/ftrace.h | 10 +++++----- arch/x86/kernel/ftrace.c | 16 ++++++++-------- include/linux/ftrace.h | 3 +++ kernel/trace/trace.c | 9 ++++----- 8 files changed, 28 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index d4c24a7a9280..3f3a1d1508ea 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_ARM_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 7652755dc000..1cd72700fbc0 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_POWERPC_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index cdf2cb0b9ffe..31ada0370cb6 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define __ASM_SH_FTRACE_H #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 33a95feeb137..62055ac0496e 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,8 +2,8 @@ #define _ASM_SPARC64_FTRACE #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #ifdef CONFIG_MCOUNT diff --git a/arch/x86/include/asm/ftrace.h 
b/arch/x86/include/asm/ftrace.h index f2ed6b704a75..a23468194b8c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -22,16 +22,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) -#endif +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif +#endif /* __ASSEMBLY__ */ #else /* CONFIG_FUNCTION_TRACER */ #ifndef __ASSEMBLY__ -#define ftrace_nmi_enter() do { } while (0) -#define ftrace_nmi_exit() do { } while (0) +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 6685b0fc1b44..69149337f2fe 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -67,7 +67,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * Two buffers are added: An IP buffer and a "code" buffer. * - * 1) Put in the instruction pointer into the IP buffer + * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. * 2) Set a flag that says we are modifying code * 3) Wait for any running NMIs to finish. @@ -85,14 +85,14 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi; -static int mod_code_status; -static int mod_code_write; -static void *mod_code_ip; -static void *mod_code_newcode; +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ -static int nmi_wait_count; -static atomic_t nmi_update_count; +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); int ftrace_arch_read_dyn_info(char *buf, int size) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 703eb53cfa2b..22240dfe912e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); + /** * ftrace_modify_code - modify code segment * @ip: the address of the code segment diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc36febc0771..7f86067d760c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_INFO_BUF_SIZE 1023 -static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; -static DEFINE_MUTEX(dyn_info_mutex); - int __weak ftrace_arch_read_dyn_info(char *buf, int size) { return 0; @@ -2828,14 +2824,17 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; mutex_lock(&dyn_info_mutex); r = sprintf(buf, "%ld ", *p); - r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + r += 
ftrace_arch_read_dyn_info(buf+r, (size-1)-r); buf[r++] = '\n'; r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3 From b2bcc7b299f37037b4a78dc1538e5d6508ae8110 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 11:59:53 -0700 Subject: x86: add a synthetic TSC_RELIABLE feature bit Impact: None, bit reservation only Add a synthetic TSC_RELIABLE feature bit which will be used to mark the TSC as reliable so that we can skip all the runtime checks for TSC stability, which have false positives in virtualized environments. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index cfdf8c2c5c31..e490a7932a0d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From 49ab56ac6e1b907b7dadb72a4012460359feaf0e Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Sat, 1 Nov 2008 18:34:37 -0700 Subject: x86: add X86_FEATURE_HYPERVISOR feature bit Impact: Number declaration only. Add X86_FEATURE_HYPERVISOR bit (CPUID level 1, ECX, bit 31). Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e490a7932a0d..694d1f8f1bee 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -118,6 +118,7 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ @@ -238,6 +239,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.2.3 From 88b094fb8d4fe43b7025ea8d487059e8813e02cd Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 27 Oct 2008 10:41:46 -0700 Subject: x86: Hypervisor detection and get tsc_freq from hypervisor Impact: Changes timebase calibration on VMware. v3->v2 : Abstract the hypervisor detection and feature (tsc_freq) request behind a hypervisor.c file v2->v1 : Add an x86_hyper_vendor field to the cpuinfo_x86 structure. This avoids multiple calls to the hypervisor detection function. This patch adds a function to detect whether we are running under VMware. 
The current way to check if we are on VMware is as follows: # check if the "hypervisor present bit" is set; if so, read the 0x40000000 cpuid leaf and check for the "VMwareVMware" signature. # if the above fails, check the DMI vendor names for a "VMware" string; if we find one, we query the VMware hypervisor port to check if we are under VMware. The DMI + "VMware hypervisor port check" is needed for older VMware products, which don't implement the hypervisor signature cpuid leaf. Also note that since we are checking for the DMI signature, the hypervisor port should never be accessed on native hardware. This patch also adds a hypervisor_get_tsc_freq function: instead of calibrating the frequency, which can be error-prone in a virtualized environment, we ask the hypervisor for it. We get the frequency from the hypervisor by accessing the hypervisor port if we are running on VMware. Other hypervisors can also add code to the generic routine to get the frequency on their platform. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hypervisor.h | 26 ++++++++++++ arch/x86/include/asm/processor.h | 4 ++ arch/x86/include/asm/vmware.h | 26 ++++++++++++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/hypervisor.c | 48 +++++++++++++++++++++ arch/x86/kernel/cpu/vmware.c | 88 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 7 ++++ arch/x86/kernel/tsc.c | 9 +++- 9 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/hypervisor.h create mode 100644 arch/x86/include/asm/vmware.h create mode 100644 arch/x86/kernel/cpu/hypervisor.c create mode 100644 arch/x86/kernel/cpu/vmware.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h new file mode 100644 index 000000000000..369f5c5d09a1 --- /dev/null +++ b/arch/x86/include/asm/hypervisor.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ +#ifndef ASM_X86__HYPERVISOR_H +#define ASM_X86__HYPERVISOR_H + +extern unsigned long get_hypervisor_tsc_freq(void); +extern void init_hypervisor(struct cpuinfo_x86 *c); + +#endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e383269..a570eafa4755 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -123,6 +124,9 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 + /* * capabilities of CPUs */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h new file mode 100644 index 000000000000..02dfea5aebc4 --- /dev/null +++ b/arch/x86/include/asm/vmware.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__VMWARE_H +#define ASM_X86__VMWARE_H + +extern unsigned long vmware_get_tsc_khz(void); +extern int vmware_platform(void); + +#endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c057..a5c04e88777e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -4,6 +4,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o +obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a9..b88595c36254 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "cpu.h" @@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 000000000000..7bd55064ffe9 --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,48 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include + +static inline void __cpuinit +detect_hypervisor_vendor(struct cpuinfo_x86 *c) +{ + if (vmware_platform()) { + c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; + } else { + c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; + } +} + +unsigned long get_hypervisor_tsc_freq(void) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + return vmware_get_tsc_khz(); + return 0; +} + +void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +{ + detect_hypervisor_vendor(c); +} + diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 000000000000..d5d1b75a4b77 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,88 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include +#include + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(0) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long __vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (eax == (uint32_t)-1) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; +} + +int vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax, ebx, ecx, edx; + char hyper_vendor_id[13]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + if (!strcmp(hyper_vendor_id, "VMwareVMware")) + return 1; + } else if (dmi_available && dmi_name_in_vendors("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +unsigned long vmware_get_tsc_khz(void) +{ + BUG_ON(!vmware_platform()); + return __vmware_get_tsc_khz(); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..f44dadfb32cf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -98,6 +98,7 @@ #include #include +#include #include #include @@ -909,6 +910,12 @@ void __init setup_arch(char **cmdline_p) dmi_check_system(bad_bios_dmi_table); + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor(&boot_cpu_data); + #ifdef CONFIG_X86_32 probe_roms(); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 62348e4fd8d1..6dbf0bcb44a8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include #include #include +#include unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -352,9 +353,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms, fast_calibrate, tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; + tsc_khz = get_hypervisor_tsc_freq(); + if (tsc_khz) { + printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); + return tsc_khz; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); -- cgit v1.2.3 From eca0cd028bdf0f6aaceb0d023e9c7501079a7dda Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 31 Oct 2008 12:01:58 -0700 Subject: x86: Add a synthetic TSC_RELIABLE feature bit. Impact: Changes timebase calibration on Vmware. Use the synthetic TSC_RELIABLE bit to workaround virtualization anomalies. Virtual TSCs can be kept nearly in sync, but because the virtual TSC offset is set by software, it's not perfect. So, the TSC synchronization test can fail. 
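(Editorial aside, not part of the patch: the synchronization test referred to here is a cross-CPU "warp" check. A minimal sketch of the idea — the locking and retry loop of the real kernel/tsc_sync.c code are elided, and raw rdtsc reads are assumed:)

/*
 * Two CPUs take turns reading the TSC; if one ever observes a value
 * behind the last value the other published, the TSCs "warped" and
 * are declared unsynchronized.  Software-set per-vCPU offsets make
 * small warps likely even when the host TSCs are in sync.
 */
static unsigned long long last_tsc;	/* shared; real code serializes access */

static int tsc_warped(void)
{
	unsigned int lo, hi;
	unsigned long long now;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	now = ((unsigned long long)hi << 32) | lo;

	if (now < last_tsc)
		return 1;	/* time went backwards across CPUs */
	last_tsc = now;
	return 0;
}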
Even then the TSC can be used as a clocksource since the VMware platform exports a reliable TSC to the guest for timekeeping purposes. Use this bit to check if we need to skip the TSC sync checks. Along with this also set the CONSTANT_TSC bit when on VMware, since we still want to use TSC as clocksource on VM running over hardware which has unsynchronized TSC's (opteron's), since the hypervisor will take care of providing consistent TSC to the guest. Signed-off-by: Alok N Kataria Signed-off-by: Dan Hecht Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vmware.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 11 ++++++++++- arch/x86/kernel/cpu/vmware.c | 18 ++++++++++++++++++ arch/x86/kernel/tsc_sync.c | 8 +++++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index 02dfea5aebc4..c11b7e100d83 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -22,5 +22,6 @@ extern unsigned long vmware_get_tsc_khz(void); extern int vmware_platform(void); +extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 7bd55064ffe9..35ae2b75226d 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -41,8 +41,17 @@ unsigned long get_hypervisor_tsc_freq(void) return 0; } +static inline void __cpuinit +hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { + vmware_set_feature_bits(c); + return; + } +} + void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) { detect_hypervisor_vendor(c); + hypervisor_set_feature_bits(c); } - diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d5d1b75a4b77..2ac4394fcb90 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,3 +86,21 @@ unsigned long vmware_get_tsc_khz(void) BUG_ON(!vmware_platform()); return __vmware_get_tsc_khz(); } + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. 
+ */ +void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9ffb01c31c40..5977c40a138f 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -108,6 +108,12 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + printk(KERN_INFO + "Skipping synchronization checks as TSC is reliable.\n"); + return; + } + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", smp_processor_id(), cpu); @@ -161,7 +167,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc()) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* -- cgit v1.2.3 From 7e5e26a3d8ac4bcadb380073dc9604c07a9a6198 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 09:36:38 -0400 Subject: ftrace: fix hardirq header for non ftrace archs Impact: build fix for non-ftrace architectures Not all archs implement ftrace, and therefore do not have an asm/ftrace.h. This patch corrects the problem. The ftrace_nmi_enter/exit now must be defined for all archs that implement dynamic ftrace. Currently, only x86 does. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/include/asm/ftrace.h | 5 ----- arch/powerpc/include/asm/ftrace.h | 5 ----- arch/sh/include/asm/ftrace.h | 5 ----- arch/sparc/include/asm/ftrace.h | 5 ----- arch/x86/include/asm/ftrace.h | 16 ---------------- include/linux/ftrace.h | 5 ++++- include/linux/hardirq.h | 2 +- 7 files changed, 5 insertions(+), 38 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 3f3a1d1508ea..39c8bc1a006a 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 1cd72700fbc0..b298f7a631e6 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 31ada0370cb6..3aed362c9463 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef __ASM_SH_FTRACE_H #define __ASM_SH_FTRACE_H -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #ifndef __ASSEMBLY__ extern void mcount(void); #endif diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 62055ac0496e..d27716cd38c1 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -1,11 +1,6 @@ #ifndef _ASM_SPARC64_FTRACE #define _ASM_SPARC64_FTRACE -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static 
inline void ftrace_nmi_exit(void) { } -#endif - #ifdef CONFIG_MCOUNT #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a23468194b8c..f8173ed1c970 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,23 +17,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_nmi_enter(void); -extern void ftrace_nmi_exit(void); -#else -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif #endif /* __ASSEMBLY__ */ - -#else /* CONFIG_FUNCTION_TRACER */ - -#ifndef __ASSEMBLY__ -static inline void ftrace_nmi_enter(void) { } -static inline void ftrace_nmi_exit(void) { } -#endif - #endif /* CONFIG_FUNCTION_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e46a7b34037c..0ad1b48aea69 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -44,7 +44,6 @@ static inline void ftrace_kill(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE - enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -105,6 +104,8 @@ extern void ftrace_release(void *start, unsigned long size); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); +extern void ftrace_nmi_enter(void); +extern void ftrace_nmi_exit(void); #else # define skip_trace(ip) ({ 0; }) @@ -113,6 +114,8 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline void ftrace_nmi_enter(void) { } +static inline void ftrace_nmi_exit(void) { } #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 0087cb43becf..ffc16ab5a878 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,8 +4,8 @@ #include #include #include +#include #include -#include #include /* -- cgit v1.2.3 From 64ccf2f9a70a06ba56cd8cedfa610b4e77181587 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:11:56 -0600 Subject: x86: uv: Add UV watchlist bios call Add UV bios calls to allocate and free watchlists. Signed-off-by: Russ Anderson Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uv/bios.h | 17 ++++++++++++++++- arch/x86/kernel/bios_uv.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 51cadc645e6f..58105c5b0b4e 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -32,7 +32,9 @@ enum uv_bios_cmd { UV_BIOS_COMMON, UV_BIOS_GET_SN_INFO, - UV_BIOS_FREQ_BASE + UV_BIOS_FREQ_BASE, + UV_BIOS_WATCHLIST_ALLOC, + UV_BIOS_WATCHLIST_FREE }; /* @@ -71,6 +73,15 @@ union partition_info_u { }; }; +union uv_watchlist_u { + u64 val; + struct { + u64 blade : 16, + size : 32, + filler : 16; + }; +}; + /* * bios calls have 6 parameters */ @@ -80,9 +91,13 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); +extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, + unsigned long *); +extern int uv_bios_mq_watchlist_free(int, int); extern void uv_bios_init(void); +extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cefb7170e75..4c02b2799216 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -100,6 +100,39 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, return ret; } +int +uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + union uv_watchlist_u size_blade; + unsigned long addr; + u64 watchlist; + s64 ret; + + addr = (unsigned long)mq; + size_blade.size = mq_size; + size_blade.blade = blade; + + /* + * bios returns watchlist number or negative error number. + */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + size_blade.val, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { -- cgit v1.2.3 From e8929c8a6acbecbd629b8e3f2d1a2546ec4ebdfc Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:13:44 -0600 Subject: x86: uv: Add UV memory protection bios call Add UV bios call to change memory protections. Signed-off-by: Russ Anderson Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/uv/bios.h | 10 +++++++++- arch/x86/kernel/bios_uv.c | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 58105c5b0b4e..a301a56d4157 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -34,7 +34,8 @@ enum uv_bios_cmd { UV_BIOS_GET_SN_INFO, UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, - UV_BIOS_WATCHLIST_FREE + UV_BIOS_WATCHLIST_FREE, + UV_BIOS_MEMPROTECT }; /* @@ -82,6 +83,12 @@ union uv_watchlist_u { }; }; +enum uv_memprotect { + UV_MEMPROT_RESTRICT_ACCESS, + UV_MEMPROT_ALLOW_AMO, + UV_MEMPROT_ALLOW_RW +}; + /* * bios calls have 6 parameters */ @@ -94,6 +101,7 @@ extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); +extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 4c02b2799216..7cf6fc3d1c10 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -134,6 +134,14 @@ uv_bios_mq_watchlist_free(int blade, int watchlist_num) } EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From 23c357003b3671cdfb17bc4d5383589e74b71511 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 5 Nov 2008 22:15:13 -0600 Subject: x86: uv: Add UV reserved page bios call Add UV bios call to get the address of the reserved page. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 5 ++++- arch/x86/kernel/bios_uv.c | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index a301a56d4157..da1c4e8e78fc 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -35,13 +35,15 @@ enum uv_bios_cmd { UV_BIOS_FREQ_BASE, UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, - UV_BIOS_MEMPROTECT + UV_BIOS_MEMPROTECT, + UV_BIOS_GET_PARTITION_ADDR }; /* * Status values returned from a BIOS call. 
*/ enum { + BIOS_STATUS_MORE_PASSES = 1, BIOS_STATUS_SUCCESS = 0, BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, BIOS_STATUS_EINVAL = -EINVAL, @@ -102,6 +104,7 @@ extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); +extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 7cf6fc3d1c10..d22d0f1bbea0 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -142,6 +142,17 @@ uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) } EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); + s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, -- cgit v1.2.3 From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:03:45 +0100 Subject: tracing, x86: add low level support for ftrace return tracing Impact: add infrastructure for function-return tracing Add low level support for ftrace return tracing. This plug-in stores return addresses on the thread_info structure of the current task. The index of the current return address is initialized when the task is the first one (init) and when a process forks (the child). It is not needed when a task does a sys_execve because after this syscall, it still needs to return on the kernel functions it called. Note that the code of return_to_handler has been suggested by Steven Rostedt as almost all of the ideas of improvements in this V3. For purpose of security, arch/x86/kernel/process_32.c is not traced because __switch_to() changes the current task during its execution. That could cause inconsistency in the stored return address of this function even if I didn't have any crash after testing with tracing on this function enabled. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 26 ++++++ arch/x86/include/asm/thread_info.h | 24 +++++ arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/entry_32.S | 33 +++++++ arch/x86/kernel/ftrace.c | 181 +++++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 20 ++++ include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 11 +++ kernel/Makefile | 4 + 10 files changed, 300 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27b8a3a39911..ca91e50bdb10 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,6 +11,7 @@ config 64BIT config X86_32 def_bool !64BIT + select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f8173ed1c970..9b6a1fa19e70 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RET_STACK_SIZE 20 + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. 
+ * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_RET_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..a71158369fd4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,7 @@ struct task_struct; struct exec_domain; #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -38,8 +39,30 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; +#endif }; +#ifdef CONFIG_FUNCTION_RET_TRACER +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .curr_ret_stack = -1,\ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -52,6 +75,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } +#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e2..1d8ed95da846 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +# Don't trace __switch_to() but let it for function tracer +CFLAGS_REMOVE_process_32.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c97..9a0ac85946db 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,6 +1188,10 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_RET_TRACER + cmpl $ftrace_stub, ftrace_function_return + jnz trace_return +#endif .globl ftrace_stub ftrace_stub: ret @@ -1206,8 +1210,37 @@ trace: popl %edx popl %ecx popl %eax + jmp ftrace_stub +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_return: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + pushl %eax + lea 0x4(%ebp), %eax + pushl %eax + call prepare_ftrace_return + addl $8, %esp + popl %edx + popl %ecx + popl %eax jmp ftrace_stub + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif /* CONFIG_FUNCTION_RET_TRACER */ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* 
CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 69149337f2fe..d68033bba223 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,10 +18,173 @@ #include #include +#include #include +#include -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +#ifdef CONFIG_FUNCTION_RET_TRACER + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +/* + * Synchronize accesses to return adresses stack with + * interrupts. + */ +static raw_spinlock_t ret_stack_lock; + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti; + unsigned long flags; + int err = 0; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + err = -EBUSY; + goto out; + } + + index = ++ti->curr_ret_stack; + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + +out: + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); + return err; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + struct thread_info *ti; + int index; + unsigned long flags; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; + + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +asmlinkage +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. 
+ */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=rm" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +static int __init init_ftrace_function_return(void) +{ + ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + return 0; +} +device_initcall(init_ftrace_function_return); + + +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,17 +194,11 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) } + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f5608c11023..dcbbf72a88b1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -267,6 +267,26 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +/* + * Structure that defines a return function trace. + */ +struct ftrace_retfunc { + unsigned long ret; /* Return address */ + unsigned long func; /* Current function */ + unsigned long long calltime; + unsigned long long rettime; +}; + +#ifdef CONFIG_FUNCTION_RET_TRACER +/* Type of a callback handler of tracing return function */ +typedef void (*trace_function_return_t)(struct ftrace_retfunc *); + +extern void register_ftrace_return(trace_function_return_t func); +/* The current handler in use */ +extern trace_function_return_t ftrace_function_return; +extern void unregister_ftrace_return(void); +#endif + /* * Structure which defines the trace of an initcall. 
* You don't have to fill the func field since it is diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index b1299d6729f2..0b4df55d7a74 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#ifdef CONFIG_DYNAMIC_FTRACE +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c756ca6..df77abe860c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2005,6 +2005,17 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* + * When fork() creates a child process, this function is called. + * But the child task may not inherit the return adresses traced + * by the return function tracer because it will directly execute + * in userspace and will not return to kernel functions its parent + * used. + */ + task_thread_info(p)->curr_ret_stack = -1; +#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d84..af3be57acbbb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() +CFLAGS_REMOVE_module.o = -pg # For __module_text_address() +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -- cgit v1.2.3 From d3ec5cae0921611ceae06464ef6291012dd9849f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 11 Nov 2008 14:33:44 +0100 Subject: x86: call machine_shutdown and stop all CPUs in native_machine_halt Impact: really halt all CPUs on halt Function machine_halt (resp. native_machine_halt) is empty for x86 architectures. When command 'halt -f' is invoked, the message "System halted." is displayed but this is not really true because all CPUs are still running. There are also similar inconsistencies for other arches (some uses power-off for halt or forever-loop with IRQs enabled/disabled). IMO there should be used the same approach for all architectures OR what does the message "System halted" really mean? This patch fixes it for x86. 
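(Editorial aside: "halting" here means parking every CPU with interrupts off in a low-power wait. A simplified sketch of the loop the patch factors out as stop_this_cpu() — the cpu_online_map removal and APIC shutdown are elided, and hlt_works() is assumed to be true:)

static void park_this_cpu(void)
{
	local_irq_disable();		/* no further interrupts on this CPU */
	for (;;)
		asm volatile("hlt");	/* low-power wait; loop again if an
					 * NMI/SMI briefly wakes the CPU */
}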
Signed-off-by: Ivan Vecera Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 + arch/x86/include/asm/system.h | 2 ++ arch/x86/kernel/process.c | 16 ++++++++++++++++ arch/x86/kernel/reboot.c | 5 +++++ arch/x86/kernel/smp.c | 13 ------------- 5 files changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3b1510b4fc57..25caa0738af5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff7..07c3e4048991 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end); void default_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d8..a4da7c4b3129 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -122,6 +123,21 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_local_APIC(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb9..34f8d37ae3c5 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -461,6 +461,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { + /* stop other cpus and apics */ + machine_shutdown(); + + /* stop this cpu */ + stop_this_cpu(NULL); } static void native_machine_power_off(void) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8f..3f92b134ab90 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask) send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ -- cgit v1.2.3 From 14d7ca5c575853664d8fe4f225a77b8df1b7de7d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 11 Nov 2008 16:19:48 -0800 Subject: x86: attempt reboot via port CF9 if we have standard PCI ports Impact: Changes reboot behavior. If port CF9 seems to be safe to touch, attempt it before trying the keyboard controller. 
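(Editorial aside: the CF9 path is a two-step poke at the chipset's reset control register. A sketch mirroring the sequence this patch adds to native_machine_emergency_restart(); kernel context is assumed for inb()/outb()/udelay():)

	u8 cf9 = inb(0xcf9) & ~6;	/* preserve unrelated bits */

	outb(cf9 | 2, 0xcf9);		/* bit 1: request a hard reset */
	udelay(50);
	outb(cf9 | 6, 0xcf9);		/* bit 2: actually perform the reset */
	udelay(50);

The keyboard controller (BOOT_KBD) remains the fallback if these writes have no effect.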
Port CF9 is not available on all chipsets (a significant but decreasing number of modern chipsets don't implement it), but port CF9 itself should in general be safe to poke (no ill effects if unimplemented) on any system which has PCI Configuration Method #1 or #2, as it falls inside the PCI configuration port range in both cases. No chipset without PCI is known to have port CF9, either, although an explicit "pci=bios" would mean we miss this and therefore don't use port CF9. An explicit "reboot=pci" can be used to force the use of port CF9. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/emergency-restart.h | 4 +++- arch/x86/kernel/reboot.c | 34 +++++++++++++++++++++++++------- arch/x86/pci/direct.c | 4 +++- arch/x86/pci/pci.h | 1 + 4 files changed, 34 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 94826cf87455..cc70c1c78ca4 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -8,7 +8,9 @@ enum reboot_type { BOOT_BIOS = 'b', #endif BOOT_ACPI = 'a', - BOOT_EFI = 'e' + BOOT_EFI = 'e', + BOOT_CF9 = 'p', + BOOT_CF9_COND = 'q', }; extern enum reboot_type reboot_type; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 34f8d37ae3c5..ddc93891cdcd 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,14 +29,17 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_CF9_COND; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] warm Don't set the cold reboot flag cold Set the cold reboot flag bios Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +48,7 @@ static int reboot_cpu = -1; kbd Use the keyboard controller. cold reset (default) acpi Use the RESET_REG in the FADT efi Use efi reset_system runtime service + pci Use the so-called "PCI reset register", CF9 force Avoid anything that could hang. */ static int __init reboot_setup(char *str) @@ -79,6 +83,7 @@ static int __init reboot_setup(char *str) case 'k': case 't': case 'e': + case 'p': reboot_type = *str; break; @@ -379,28 +384,43 @@ static void native_machine_emergency_restart(void) load_idt(&no_idt); __asm__ __volatile__("int3"); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; #ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; #endif case BOOT_ACPI: acpi_reboot(); - reboot_type = BOOT_KBD; + reboot_type = BOOT_CF9_COND; break; - case BOOT_EFI: if (efi_enabled) - efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + efi.reset_system(reboot_mode ? 
+ EFI_RESET_WARM : + EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + reboot_type = BOOT_CF9_COND; + break; + + case BOOT_CF9: + port_cf9_safe = true; + /* fall through */ + case BOOT_CF9_COND: + if (port_cf9_safe) { + u8 cf9 = inb(0xcf9) & ~6; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + outb(cf9|6, 0xcf9); /* Actually do the reset */ + udelay(50); + } reboot_type = BOOT_KBD; break; } diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 9915293500fb..9a5af6c8fbe9 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -static struct pci_raw_ops pci_direct_conf2 = { +struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -289,6 +289,7 @@ int __init pci_direct_probe(void) if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; + port_cf9_safe = true; return 1; } release_resource(region); @@ -305,6 +306,7 @@ int __init pci_direct_probe(void) if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; + port_cf9_safe = true; return 2; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 15b9cf6be729..1959018aac02 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops; extern struct pci_raw_ops *raw_pci_ext_ops; extern struct pci_raw_ops pci_direct_conf1; +extern bool port_cf9_safe; /* arch_initcall level */ extern int pci_direct_probe(void); -- cgit v1.2.3 From c370e5e089adfd5b1b863f3464cccae9ebf33cca Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 12 Nov 2008 11:34:41 -0200 Subject: x86 kdump: make nmi_shootdown_cpus() non-static Impact: make API available to the rest of x86 platform code Add prototype to asm/reboot.h. Signed-off-by: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/include/asm/reboot.h | 5 +++++ arch/x86/kernel/crash.c | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index df7710354f85..562d4fd31ba8 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_REBOOT_H #define _ASM_X86_REBOOT_H +#include + struct pt_regs; struct machine_ops { @@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); void machine_real_restart(const unsigned char *code, int length); +typedef void (*nmi_shootdown_cb)(int, struct die_args*); +void nmi_shootdown_cpus(nmi_shootdown_cb callback); + #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index f23c2beeb37d..fb298d1daac9 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,7 +29,6 @@ #include -typedef void (*nmi_shootdown_cb)(int, struct die_args*); #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -100,7 +99,7 @@ static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; -static void nmi_shootdown_cpus(nmi_shootdown_cb callback) +void nmi_shootdown_cpus(nmi_shootdown_cb callback) { unsigned long msecs; -- cgit v1.2.3 From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Nov 2008 16:21:19 -0800 Subject: ftrace: pass module struct to arch dynamic ftrace functions Impact: allow archs more flexibility on dynamic ftrace implementations Dynamic ftrace has largly been developed on x86. 
Since x86 does not have the same limitations as other architectures, the ftrace interaction between the generic code and the architecture specific code was not flexible enough to handle some of the issues that other architectures have. Most notably, module trampolines. Due to the limited branch distance that archs make in calling kernel core code from modules, the module load code must create a trampoline to jump to what will make the larger jump into core kernel code. The problem arises when this happens to a call to mcount. Ftrace checks all code before modifying it and makes sure the current code is what it expects. Right now, there is not enough information to handle modifying module trampolines. This patch changes the API between generic dynamic ftrace code and the arch dependent code. There is now two functions for modifying code: ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into a nop, where the original text is calling addr. (mod is the module struct if called by module init) ftrace_make_caller(rec, addr) - convert the code rec->ip that should be a nop into a caller to addr. The record "rec" now has a new field called "arch" where the architecture can add any special attributes to each call site record. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 8 ++++++ arch/x86/kernel/ftrace.c | 29 +++++++++++++++++--- include/linux/ftrace.h | 53 +++++++++++++++++++++++++++--------- kernel/module.c | 2 +- kernel/trace/ftrace.c | 62 +++++++++++++++++-------------------------- 5 files changed, 100 insertions(+), 54 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9b6a1fa19e70..2bb43b433e07 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,6 +17,14 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } + +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fe832738e1e2..762222ad1387 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -166,7 +166,7 @@ static int ftrace_calc_offset(long ip, long addr) return (int)(addr - ip); } -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -311,12 +311,12 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; -unsigned char *ftrace_nop_replace(void) +static unsigned char *ftrace_nop_replace(void) { return ftrace_nop; } -int +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -349,6 +349,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, 
new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4fbc4a8b86a5..166a2070ef65 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -74,6 +74,9 @@ static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE +/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ +#include + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), @@ -88,6 +91,7 @@ struct dyn_ftrace { struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); @@ -95,22 +99,40 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); -extern unsigned char *ftrace_nop_replace(void); -extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); extern int ftrace_dyn_arch_init(void *data); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -/* May be defined in arch */ -extern int ftrace_arch_read_dyn_info(char *buf, int size); +/** + * ftrace_make_nop - convert code into top + * @mod: module structure if called by module load initialization + * @rec: the mcount call site record + * @addr: the address that the call site should be calling + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should be a caller to @addr + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. + */ +extern int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr); /** - * ftrace_modify_code - modify code segment - * @ip: the address of the code segment - * @old_code: the contents of what is expected to be there - * @new_code: the code to patch in + * ftrace_make_call - convert a nop call site into a call to addr + * @rec: the mcount call site record + * @addr: the address that the call site should call * * This is a very sensitive operation and great care needs * to be taken by the arch. The operation should carefully @@ -118,6 +140,8 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * what we expect it to be, and then on success of the compare, * it should write to the location. * + * The code segment at @rec->ip should be a nop + * * Return must be: * 0 on success * -EFAULT on error reading the location @@ -125,8 +149,11 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); * -EPERM on error writing to the location * Any other value will be considered a failure. 
*/ -extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, - unsigned char *new_code); +extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); + + +/* May be defined in arch */ +extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); @@ -259,11 +286,13 @@ static inline void ftrace_dump(void) { } #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(unsigned long *start, unsigned long *end); +extern void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } static inline void -ftrace_init_module(unsigned long *start, unsigned long *end) { } +ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { } #endif diff --git a/kernel/module.c b/kernel/module.c index 1f4cc00e0c20..69791274e899 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2201,7 +2201,7 @@ static noinline struct module *load_module(void __user *umod, /* sechdrs[0].sh_size is always zero */ mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", sizeof(*mseg), &num_mcount); - ftrace_init_module(mseg, mseg + num_mcount); + ftrace_init_module(mod, mseg, mseg + num_mcount); err = module_finalize(hdr, sechdrs, mod); if (err < 0) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3940c71ac2a2..e9a5fbfce08e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } -static void ftrace_bug(int failed, unsigned long ip, - unsigned char *expected, - unsigned char *replace) +static void ftrace_bug(int failed, unsigned long ip) { switch (failed) { case -EFAULT: @@ -372,9 +370,7 @@ static void ftrace_bug(int failed, unsigned long ip, FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" expected: ", expected); print_ip_ins(" actual: ", (unsigned char *)ip); - print_ip_ins(" replace: ", replace); printk(KERN_CONT "\n"); break; case -EPERM: @@ -392,8 +388,7 @@ static void ftrace_bug(int failed, unsigned long ip, #define FTRACE_ADDR ((long)(ftrace_caller)) static int -__ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *old, unsigned char *new, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ip, fl; @@ -435,12 +430,10 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * otherwise enable it! 
*/ if (fl & FTRACE_FL_ENABLED) { - /* swap new and old */ - new = old; - old = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 0; rec->flags &= ~FTRACE_FL_ENABLED; } else { - new = ftrace_call_replace(ip, FTRACE_ADDR); + enable = 1; rec->flags |= FTRACE_FL_ENABLED; } } else { @@ -453,10 +446,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) return 0; - - new = ftrace_call_replace(ip, FTRACE_ADDR); - } else - old = ftrace_call_replace(ip, FTRACE_ADDR); + } if (enable) { if (rec->flags & FTRACE_FL_ENABLED) @@ -469,21 +459,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } - return ftrace_modify_code(ip, old, new); + if (enable) + return ftrace_make_call(rec, FTRACE_ADDR); + else + return ftrace_make_nop(NULL, rec, FTRACE_ADDR); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - if (enable) - old = ftrace_nop_replace(); - else - new = ftrace_nop_replace(); - for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; @@ -504,34 +491,30 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); } else - ftrace_bug(failed, rec->ip, old, new); + ftrace_bug(failed, rec->ip); } } } } static int -ftrace_code_disable(struct dyn_ftrace *rec) +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { unsigned long ip; - unsigned char *nop, *call; int ret; ip = rec->ip; - nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, mcount_addr); - - ret = ftrace_modify_code(ip, call, nop); + ret = ftrace_make_nop(mod, rec, mcount_addr); if (ret) { - ftrace_bug(ret, ip, call, nop); + ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -650,7 +633,7 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ftrace_update_code(void) +static int ftrace_update_code(struct module *mod) { struct dyn_ftrace *p, *t; cycle_t start, stop; @@ -667,7 +650,7 @@ static int ftrace_update_code(void) list_del_init(&p->list); /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(p)) { + if (ftrace_code_disable(mod, p)) { p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; } else @@ -1309,7 +1292,8 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); -static int ftrace_convert_nops(unsigned long *start, +static int ftrace_convert_nops(struct module *mod, + unsigned long *start, unsigned long *end) { unsigned long *p; @@ -1325,18 +1309,19 @@ static int ftrace_convert_nops(unsigned long *start, /* disable interrupts to prevent kstop machine */ local_irq_save(flags); - ftrace_update_code(); + ftrace_update_code(mod); local_irq_restore(flags); mutex_unlock(&ftrace_start_lock); return 0; } -void ftrace_init_module(unsigned long *start, unsigned long *end) +void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(start, end); + ftrace_convert_nops(mod, start, end); } extern unsigned long __start_mcount_loc[]; @@ -1366,7 +1351,8 @@ void __init 
ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(__start_mcount_loc, + ret = ftrace_convert_nops(NULL, + __start_mcount_loc, __stop_mcount_loc); return; -- cgit v1.2.3 From 569712b2b0970fa5b19673544d62ae661d04a220 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 16 Nov 2008 03:12:49 -0800 Subject: x86: fix wakeup_cpu with numaq/es7000, v2 Impact: fix secondary-CPU wakeup/init path with numaq and es7000 While looking at wakeup_secondary_cpu for WAKE_SECONDARY_VIA_NMI: |#ifdef WAKE_SECONDARY_VIA_NMI |/* | * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal | * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this | * won't ... remember to clear down the APIC, etc later. | */ |static int __devinit |wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) |{ | unsigned long send_status, accept_status = 0; | int maxlvt; |... | if (APIC_INTEGRATED(apic_version[phys_apicid])) { | maxlvt = lapic_get_maxlvt(); I noticed that there is no warning about undefined phys_apicid... because WAKE_SECONDARY_VIA_NMI and WAKE_SECONDARY_VIA_INIT can not be defined at the same time. So NUMAQ is using wrong wakeup_secondary_cpu. WAKE_SECONDARY_VIA_NMI, WAKE_SECONDARY_VIA_INIT and WAKE_SECONDARY_VIA_MIP are variants of a weird and fragile preprocessor-driven "HAL" mechanisms to specify the kind of secondary-CPU wakeup strategy a given x86 kernel will use. The vast majority of systems want to use INIT for secondary wakeup - NUMAQ uses an NMI, (old-style-) ES7000 uses 'MIP' (a firmware driven in-memory flag to let secondaries continue). So convert these mechanisms to x86_quirks and add a ->wakeup_secondary_cpu() method to specify the rare exception to the sane default. Extend genapic accordingly as well, for 32-bit. While looking further, I noticed that functions in wakecup.h for numaq and es7000 are different to the default in mach_wakecpu.h - but smpboot.c will only use default mach_wakecpu.h with smphook.h. 
So we need to add mach_wakecpu.h for mach_generic, to properly support numaq and es7000, and vectorize the following SMP init methods: int trampoline_phys_low; int trampoline_phys_high; void (*wait_for_init_deassert)(atomic_t *deassert); void (*smp_callin_clear_local_apic)(void); void (*store_NMI_vector)(unsigned short *high, unsigned short *low); void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); void (*inquire_remote_apic)(int apicid); Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bigsmp/apic.h | 2 -- arch/x86/include/asm/es7000/apic.h | 3 -- arch/x86/include/asm/es7000/wakecpu.h | 41 ++++++--------------- arch/x86/include/asm/genapic_32.h | 17 ++++++++- arch/x86/include/asm/mach-default/mach_wakecpu.h | 24 +++++-------- arch/x86/include/asm/mach-default/smpboot_hooks.h | 8 +++-- arch/x86/include/asm/mach-generic/mach_wakecpu.h | 12 +++++++ arch/x86/include/asm/numaq/wakecpu.h | 24 +++++++------ arch/x86/include/asm/setup.h | 2 ++ arch/x86/kernel/es7000_32.c | 44 ++++++++++++----------- arch/x86/kernel/numaq_32.c | 1 + arch/x86/kernel/smpboot.c | 24 ++++++++----- arch/x86/mach-generic/bigsmp.c | 1 + arch/x86/mach-generic/default.c | 1 + arch/x86/mach-generic/summit.c | 1 + 15 files changed, 110 insertions(+), 95 deletions(-) create mode 100644 arch/x86/include/asm/mach-generic/mach_wakecpu.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index 1d9543b9d358..ce547f24a1cd 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ #define NO_BALANCE_IRQ (0) -#define WAKE_SECONDARY_VIA_INIT - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 380f0b4f17ed..9d8cf776c285 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -23,8 +23,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_LowestPrio) #define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ #define NO_BALANCE_IRQ (1) -#undef WAKE_SECONDARY_VIA_INIT -#define WAKE_SECONDARY_VIA_MIP #else #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) @@ -32,7 +30,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#define WAKE_SECONDARY_VIA_INIT #endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h index 398493461913..78f0daaee436 100644 --- a/arch/x86/include/asm/es7000/wakecpu.h +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -1,36 +1,12 @@ #ifndef __ASM_ES7000_WAKECPU_H #define __ASM_ES7000_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. 
- */ - -#ifdef CONFIG_ES7000_CLUSTERED_APIC -#define WAKE_SECONDARY_VIA_MIP -#else -#define WAKE_SECONDARY_VIA_INIT -#endif - -#ifdef WAKE_SECONDARY_VIA_MIP -extern int es7000_start_cpu(int cpu, unsigned long eip); -static inline int -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) -{ - int boot_error = 0; - boot_error = es7000_start_cpu(phys_apicid, start_eip); - return boot_error; -} -#endif - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 static inline void wait_for_init_deassert(atomic_t *deassert) { -#ifdef WAKE_SECONDARY_VIA_INIT +#ifndef CONFIG_ES7000_CLUSTERED_APIC while (!atomic_read(deassert)) cpu_relax(); #endif @@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 5cbd4fcc06fd..39bd8c1db3f5 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_GENAPIC_32_H #include +#include /* * Generic APIC driver interface. @@ -65,6 +66,13 @@ struct genapic { void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int trampoline_phys_low; + int trampoline_phys_high; + void (*wait_for_init_deassert)(atomic_t *deassert); + void (*smp_callin_clear_local_apic)(void); + void (*store_NMI_vector)(unsigned short *high, unsigned short *low); + void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); + void (*inquire_remote_apic)(int apicid); }; #define APICFUNC(x) .x = x, @@ -105,13 +113,20 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ - APICFUNC(vector_allocation_domain) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ IPIFUNC(send_IPI_all) \ APICFUNC(enable_apic_mode) \ APICFUNC(phys_pkg_id) \ + .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \ + .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \ + APICFUNC(wait_for_init_deassert) \ + APICFUNC(smp_callin_clear_local_apic) \ + APICFUNC(store_NMI_vector) \ + APICFUNC(restore_NMI_vector) \ + APICFUNC(inquire_remote_apic) \ } extern struct genapic *genapic; diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h index 9d80db91e992..ceb013660146 100644 --- a/arch/x86/include/asm/mach-default/mach_wakecpu.h +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -1,17 +1,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. 
- */ - -#define WAKE_SECONDARY_VIA_INIT - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW (0x467) +#define TRAMPOLINE_PHYS_HIGH (0x469) static inline void wait_for_init_deassert(atomic_t *deassert) { @@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h index dbab36d64d48..23bf52103b89 100644 --- a/arch/x86/include/asm/mach-default/smpboot_hooks.h +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) CMOS_WRITE(0xa, 0xf); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ CMOS_WRITE(0, 0xf); - *((volatile long *) phys_to_virt(0x467)) = 0; + *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h new file mode 100644 index 000000000000..1ab16b168c8a --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H + +#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +#define wait_for_init_deassert (genapic->wait_for_init_deassert) +#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +#define store_NMI_vector (genapic->store_NMI_vector) +#define restore_NMI_vector (genapic->restore_NMI_vector) +#define inquire_remote_apic (genapic->inquire_remote_apic) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h index c577bda5b1c5..6f499df8eddb 100644 --- a/arch/x86/include/asm/numaq/wakecpu.h +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -3,12 +3,8 @@ /* This file copes with machines that wakeup secondary CPUs by NMIs */ -#define WAKE_SECONDARY_VIA_NMI - -#define TRAMPOLINE_LOW phys_to_virt(0x8) -#define TRAMPOLINE_HIGH phys_to_virt(0xa) - -#define boot_cpu_apicid boot_cpu_logical_apicid +#define TRAMPOLINE_PHYS_LOW (0x8) +#define TRAMPOLINE_PHYS_HIGH (0xa) /* We don't do anything here because we use NMI's to boot instead */ static inline void wait_for_init_deassert(atomic_t *deassert) @@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void) static inline void store_NMI_vector(unsigned short *high, unsigned short *low) { printk("Storing NMI vector\n"); - *high = *((volatile unsigned short *) 
TRAMPOLINE_HIGH); - *low = *((volatile unsigned short *) TRAMPOLINE_LOW); + *high = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); + *low = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)); } static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { printk("Restoring NMI vector\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; - *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + *high; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + *low; } -#define inquire_remote_apic(apicid) {} +static inline void inquire_remote_apic(int apicid) +{ +} #endif /* __ASM_NUMAQ_WAKECPU_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d37237465..40b2d3304911 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,6 +16,7 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif +extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? */ @@ -39,6 +40,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); + int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index f454c78fcef6..bed10dddf099 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * ES7000 chipsets @@ -161,6 +162,26 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } +#ifdef CONFIG_ES7000_CLUSTERED_APIC +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; +} +#endif + void __init setup_unisys(void) { @@ -176,6 +197,9 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; +#ifdef CONFIG_ES7000_CLUSTERED_APIC + x86_quirks->wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip; +#endif } /* @@ -324,26 +348,6 @@ es7000_mip_write(struct mip_reg *mip_reg) return status; } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - void __init es7000_sw_apic(void) { diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e0..745891b7d0fb 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -250,6 +250,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, + .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b1093397319..498c1ef37fe0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -536,7 
+537,7 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1.\n"); } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -575,14 +576,13 @@ static inline void __inquire_remote_apic(int apicid) } } -#ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -599,7 +599,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -614,11 +614,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; @@ -737,7 +735,15 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_INIT */ + +static int __devinit +wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + if (x86_quirks->wakeup_secondary_cpu) + return x86_quirks->wakeup_secondary_cpu(apicid, start_eip); + + return wakeup_secondary_cpu_via_init(apicid, start_eip); +} struct create_idle { struct work_struct work; diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3c3b471ea496..3624a364b7f3 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -17,6 +17,7 @@ #include #include #include +#include static int dmi_bigsmp; /* can be set by dmi scanners */ diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 9e835a11a13a..e63a4a76d8cd 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -16,6 +16,7 @@ #include #include #include +#include /* should be called last. */ static int probe_default(void) diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6272b5e69da6..2c6d234e0009 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -16,6 +16,7 @@ #include #include #include +#include static int probe_summit(void) { -- cgit v1.2.3 From 54ac14a8e982ae6c7ac71ee2b0d0173b974509e2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 17 Nov 2008 15:19:53 -0800 Subject: x86: fix wakeup_cpu with numaq/es7000, v2, fix Impact: fix wakeup_secondary_cpu with hotplug We cannot put that into x86_quirks, because that is __initdata. So try to move that to genapic, and add update_genapic in x86_quirks. Later we could even use that stub to: 1. autodetect CONFIG_ES7000_CLUSTERED_APIC 2.
more correct inquire_remote_apic with apic_verbosity setting. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/genapic_32.h | 1 + arch/x86/include/asm/genapic_64.h | 2 ++ arch/x86/include/asm/mach-default/mach_apic.h | 2 ++ arch/x86/include/asm/mach-generic/mach_apic.h | 1 + arch/x86/include/asm/setup.h | 3 ++- arch/x86/kernel/es7000_32.c | 11 ++++++++++- arch/x86/kernel/genapic_64.c | 4 ++++ arch/x86/kernel/numaq_32.c | 11 +++++++++-- arch/x86/kernel/setup.c | 13 ++++++++++++- arch/x86/kernel/smpboot.c | 11 +---------- arch/x86/mach-generic/probe.c | 4 ++++ 11 files changed, 48 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 39bd8c1db3f5..455d6c27a98b 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -66,6 +66,7 @@ struct genapic { void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int (*wakeup_cpu)(int apicid, unsigned long start_eip); int trampoline_phys_low; int trampoline_phys_high; void (*wait_for_init_deassert)(atomic_t *deassert); diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 13c4e96199ea..2cae011668b7 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -32,6 +32,8 @@ struct genapic { unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; + /* wakeup_secondary_cpu */ + int (*wakeup_cpu)(int apicid, unsigned long start_eip); }; extern struct genapic *genapic; diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index ff3a6c236c00..6cb3a467e067 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -32,11 +32,13 @@ static inline cpumask_t target_cpus(void) #define vector_allocation_domain (genapic->vector_allocation_domain) #define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) #define send_IPI_self (genapic->send_IPI_self) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void setup_apic_routing(void); #else #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define TARGET_CPUS (target_cpus()) +#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index 5180bd7478fb..e430f47df667 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -27,6 +27,7 @@ #define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 40b2d3304911..294daeb3a006 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -17,6 +17,7 @@ static inline int is_visws_box(void) { return 0; } #endif extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? 
*/ @@ -40,7 +41,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*update_genapic)(void); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index bed10dddf099..fb3bfe66fbe2 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -40,6 +40,7 @@ #include #include #include +#include #include /* @@ -180,6 +181,13 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) return 0; } + +static int __init es7000_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + + return 0; +} #endif void __init @@ -197,8 +205,9 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; + #ifdef CONFIG_ES7000_CLUSTERED_APIC - x86_quirks->wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip; + x86_quirks->update_genapic = es7000_update_genapic; #endif } diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e95..2bced78b0b8e 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@ #include #include #include +#include extern struct genapic apic_flat; extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void) genapic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 745891b7d0fb..0deea37a53cf 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void) return 1; } +static int __init numaq_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + + return 0; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -250,7 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, - .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, + .update_genapic = numaq_update_genapic, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..c366e891e10b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -583,7 +583,18 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; +static int __init default_update_genapic(void) +{ +#if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; +#endif + + return 0; +} + +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 498c1ef37fe0..0e9f446269f4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -615,7 +615,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) 
return (send_status | accept_status); } -static int __devinit +int __devinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -736,15 +736,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -static int __devinit -wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - if (x86_quirks->wakeup_secondary_cpu) - return x86_quirks->wakeup_secondary_cpu(apicid, start_eip); - - return wakeup_secondary_cpu_via_init(apicid, start_eip); -} - struct create_idle { struct work_struct work; struct task_struct *idle; diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 5a7e4619e1c4..90b134f3cd74 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -15,6 +15,7 @@ #include #include #include +#include extern struct genapic apic_numaq; extern struct genapic apic_summit; @@ -57,6 +58,9 @@ static int __init parse_apic(char *arg) } } + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); + /* Parsed again by __setup for debug/verbose */ return 0; } -- cgit v1.2.3 From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 17 Nov 2008 03:22:41 +0100 Subject: tracing/function-return-tracer: add the overrun field Impact: help find a better function return trace depth We arbitrarily defined the depth of the function return trace as "20". Perhaps this is not enough. To help find an optimal depth, we now measure the overrun: the number of functions that have been missed for the current thread. By default this is not displayed; we have to set a particular flag on the return tracer: echo overrun > /debug/tracing/trace_options The overrun will then be printed on the right. As the trace below shows, the current depth of 20 is not enough.
update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838) update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838) do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838) tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838) tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838) vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274) vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274) set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274) con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274) release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274) release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274) con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274) con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274) n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274) irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274) smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274) ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274) ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274) ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274) hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 7 +++++++ arch/x86/kernel/ftrace.c | 10 ++++++--- include/linux/ftrace.h | 2 ++ include/linux/sched.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_functions_return.c | 38 +++++++++++++++++++++++++++++------ 7 files changed, 51 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a71158369fd4..e90e81ef6ab9 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,6 +21,7 @@ struct task_struct; struct exec_domain; #include #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -45,6 +46,11 @@ struct thread_info { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; + /* + * Number of functions that haven't been traced + * because of depth overrun. 
+ */ + atomic_t trace_overrun; #endif }; @@ -61,6 +67,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ .curr_ret_stack = -1,\ + .trace_overrun = ATOMIC_INIT(0) \ } #else #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 924153edd973..356bb1eb6e9a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -353,8 +353,10 @@ static int push_return_trace(unsigned long ret, unsigned long long time, struct thread_info *ti = current_thread_info(); /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + atomic_inc(&ti->trace_overrun); return -EBUSY; + } index = ++ti->curr_ret_stack; barrier(); @@ -367,7 +369,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time, /* Retrieve a function return address to the trace stack on thread info.*/ static void pop_return_trace(unsigned long *ret, unsigned long long *time, - unsigned long *func) + unsigned long *func, unsigned long *overrun) { int index; @@ -376,6 +378,7 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, *ret = ti->ret_stack[index].ret; *func = ti->ret_stack[index].func; *time = ti->ret_stack[index].calltime; + *overrun = atomic_read(&ti->trace_overrun); ti->curr_ret_stack--; } @@ -386,7 +389,8 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, unsigned long ftrace_return_to_handler(void) { struct ftrace_retfunc trace; - pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + pop_return_trace(&trace.ret, &trace.calltime, &trace.func, + &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); ftrace_function_return(&trace); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f1af1aab00e6..f7ba4ea5e128 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,6 +318,8 @@ struct ftrace_retfunc { unsigned long func; /* Current function */ unsigned long long calltime; unsigned long long rettime; + /* Number of functions that overran the depth limit for current task */ + unsigned long overrun; }; #ifdef CONFIG_FUNCTION_RET_TRACER diff --git a/include/linux/sched.h b/include/linux/sched.h index 61c8cc36028a..c8e0db464206 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2016,6 +2016,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct * used. 
*/ task_thread_info(p)->curr_ret_stack = -1; + atomic_set(&task_thread_info(p)->trace_overrun, 0); #endif } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9531fddcfb8d..e97c29a6e7b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -853,6 +853,7 @@ static void __trace_function_return(struct trace_array *tr, entry->parent_ip = trace->ret; entry->rettime = trace->rettime; entry->calltime = trace->calltime; + entry->overrun = trace->overrun; ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); } #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9d22618bf99f..2cb12fd98f6b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -60,6 +60,7 @@ struct ftrace_ret_entry { unsigned long parent_ip; unsigned long long calltime; unsigned long long rettime; + unsigned long overrun; }; extern struct tracer boot_tracer; diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c index a68564af022b..e00d64509c9c 100644 --- a/kernel/trace/trace_functions_return.c +++ b/kernel/trace/trace_functions_return.c @@ -14,6 +14,19 @@ #include "trace.h" +#define TRACE_RETURN_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + static int return_trace_init(struct trace_array *tr) { int cpu; @@ -42,26 +55,39 @@ print_return_function(struct trace_iterator *iter) ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); if (!ret) return TRACE_TYPE_PARTIAL_LINE; + ret = seq_print_ip_sym(s, field->ip, trace_flags & TRACE_ITER_SYM_MASK); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " (%llu ns)\n", + + ret = trace_seq_printf(s, " (%llu ns)", field->rettime - field->calltime); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; + + if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; } return TRACE_TYPE_UNHANDLED; } -static struct tracer return_trace __read_mostly = -{ +static struct tracer return_trace __read_mostly = { .name = "return", .init = return_trace_init, .reset = return_trace_reset, - .print_line = print_return_function + .print_line = print_return_function, + .flags = &tracer_flags, }; static __init int init_return_trace(void) -- cgit v1.2.3 From b5fe363b7d89577fcfda9b6cf0efc32760bbccc6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 18 Nov 2008 08:14:14 -0800 Subject: x86: use update_genapic to get rid of ES7000_CLUSTERED_APIC v2 Impact: clean up We can autodetect those system that need cluster apic, and update genapic accordingly. 
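The mechanism behind the autodetection is the update_genapic quirk added earlier in this series; the APIC setup and probe paths already do:

	if (x86_quirks->update_genapic)
		x86_quirks->update_genapic();

(see the genapic_64.c and mach-generic/probe.c hunks in the preceding patches), so es7000_update_genapic() can detect the Pentium III based boxes at boot time and switch the genapic methods over to their _cluster variants, instead of relying on a compile-time Kconfig option.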
We can also remove wakecpu.h for es7000, because its default one is now the same as the overall default mach_wakecpu.h Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/es7000/apic.h | 76 ++++++++++++++++++++++++++------------ arch/x86/include/asm/genapic_32.h | 1 + arch/x86/kernel/es7000_32.c | 17 +++++++-- arch/x86/mach-generic/es7000.c | 14 ++++++- 5 files changed, 80 insertions(+), 32 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b569187..7d0ab8942cfb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -462,10 +462,6 @@ config X86_CYCLONE_TIMER def_bool y depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC - def_bool y - depends on SMP && X86_ES7000 && MPENTIUMIII - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 9d8cf776c285..e24ef876915f 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,28 +9,27 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline cpumask_t target_cpus_cluster(void) { -#if defined CONFIG_ES7000_CLUSTERED_APIC return CPU_MASK_ALL; -#else +} + +static inline cpumask_t target_cpus(void) +{ return cpumask_of_cpu(smp_processor_id()); -#endif } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE (dest_LowestPrio) -#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ (1) -#else +#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER (1) + #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target procs */ #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -57,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ +static inline void init_apic_ldr_cluster(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + static inline void init_apic_ldr(void) { unsigned long val; @@ -67,10 +76,6 @@ static inline void init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { @@ -141,7 +146,7 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) { int num_bits_set; int cpus_found = 0; @@ -151,11 +156,7 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) num_bits_set = cpus_weight(cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else - return cpu_to_logical_apicid(0); -#endif /* * The cpus in the mask must all be on the apic cluster.
If are not * on the same apicid cluster return default value of TARGET_CPUS. @@ -168,11 +169,40 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n", __func__); -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + return cpu_to_logical_apicid(0); + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); return cpu_to_logical_apicid(0); -#endif } apicid = new_apicid; cpus_found++; diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 455d6c27a98b..0ac17d33a8c7 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -131,6 +131,7 @@ struct genapic { } extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void); enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define get_uv_system_type() UV_NONE diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index fb3bfe66fbe2..71d7be624d46 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -163,7 +164,6 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -#ifdef CONFIG_ES7000_CLUSTERED_APIC static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; @@ -182,13 +182,24 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) return 0; } +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + static int __init es7000_update_genapic(void) { genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + return 0; } -#endif void __init setup_unisys(void) @@ -206,9 +217,7 @@ setup_unisys(void) es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; -#ifdef CONFIG_ES7000_CLUSTERED_APIC x86_quirks->update_genapic = es7000_update_genapic; -#endif } /* diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3ddb..7b4e6d0d1690 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@ #include #include #include -#include +#include + +void __init es7000_update_genapic_to_cluster(void) +{ + genapic->target_cpus = target_cpus_cluster; + genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; + genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; + genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + + genapic->init_apic_ldr = init_apic_ldr_cluster; + + genapic->cpu_mask_to_apicid = 
cpu_mask_to_apicid_cluster; +} static int probe_es7000(void) { -- cgit v1.2.3 From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Nov 2008 06:22:56 +0100 Subject: tracing/function-return-tracer: store return stack into task_struct and allocate it dynamically Impact: use deeper function tracing depth safely Some tests showed that function return tracing needed a deeper depth of function calls. But it could be unsafe to store these return addresses on the stack. So these arrays will now be allocated dynamically in the task_struct of the current task, and only when the tracer is activated. Typical scheme when the tracer is activated: - allocate a return stack for each task in the global task list. - fork: allocate the return stack for the newly created task - exit: free the return stack of current - idle init: same as fork I chose a default depth of 50. I don't have overruns anymore. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 1 - arch/x86/include/asm/thread_info.h | 29 ------------ arch/x86/kernel/ftrace.c | 29 ++++++------ include/linux/ftrace.h | 5 ++ include/linux/sched.h | 23 +++++---- kernel/exit.c | 5 +- kernel/fork.c | 4 ++ kernel/sched.c | 3 ++ kernel/trace/ftrace.c | 96 +++++++++++++++++++++++++++++++++++++- 9 files changed, 137 insertions(+), 58 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 2bb43b433e07..754a3e082f94 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -29,7 +29,6 @@ struct dyn_arch_ftrace { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_RET_TRACER -#define FTRACE_RET_STACK_SIZE 20 #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e90e81ef6ab9..0921b4018c11 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,36 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* Index of current stored adress in ret_stack */ - int curr_ret_stack; - /* Stack of return addresses for return function tracing */ - struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; - /* - * Number of functions that haven't been traced - * because of depth overrun.
- */ - atomic_t trace_overrun; -#endif }; -#ifdef CONFIG_FUNCTION_RET_TRACER -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ - .curr_ret_stack = -1,\ - .trace_overrun = ATOMIC_INIT(0) \ -} -#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -82,7 +54,6 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } -#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 356bb1eb6e9a..bb137f7297ed 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time, unsigned long func) { int index; - struct thread_info *ti = current_thread_info(); + + if (!current->ret_stack) + return -EBUSY; /* The return trace stack is full */ - if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { - atomic_inc(&ti->trace_overrun); + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); return -EBUSY; } - index = ++ti->curr_ret_stack; + index = ++current->curr_ret_stack; barrier(); - ti->ret_stack[index].ret = ret; - ti->ret_stack[index].func = func; - ti->ret_stack[index].calltime = time; + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; return 0; } @@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, { int index; - struct thread_info *ti = current_thread_info(); - index = ti->curr_ret_stack; - *ret = ti->ret_stack[index].ret; - *func = ti->ret_stack[index].func; - *time = ti->ret_stack[index].calltime; - *overrun = atomic_read(&ti->trace_overrun); - ti->curr_ret_stack--; + index = current->curr_ret_stack; + *ret = current->ret_stack[index].ret; + *func = current->ret_stack[index].func; + *time = current->ret_stack[index].calltime; + *overrun = atomic_read(¤t->trace_overrun); + current->curr_ret_stack--; } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f7ba4ea5e128..2ba259b2defa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -323,6 +323,8 @@ struct ftrace_retfunc { }; #ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RETFUNC_DEPTH 50 +#define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ typedef void (*trace_function_return_t)(struct ftrace_retfunc *); @@ -330,6 +332,9 @@ extern int register_ftrace_return(trace_function_return_t func); /* The current handler in use */ extern trace_function_return_t ftrace_function_return; extern void unregister_ftrace_return(void); + +extern void ftrace_retfunc_init_task(struct task_struct *t); +extern void ftrace_retfunc_exit_task(struct task_struct *t); #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c8e0db464206..bee1e93c95ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1352,6 +1352,17 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored adress in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack *ret_stack; + /* + * Number of functions that haven't been traced + 
* because of depth overrun. + */ + atomic_t trace_overrun; +#endif }; /* @@ -2006,18 +2017,6 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; - -#ifdef CONFIG_FUNCTION_RET_TRACER - /* - * When fork() creates a child process, this function is called. - * But the child task may not inherit the return adresses traced - * by the return function tracer because it will directly execute - * in userspace and will not return to kernel functions its parent - * used. - */ - task_thread_info(p)->curr_ret_stack = -1; - atomic_set(&task_thread_info(p)->trace_overrun, 0); -#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/exit.c b/kernel/exit.c index 35c8ec2ba03a..b9d446329da1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1127,7 +1128,9 @@ NORET_TYPE void do_exit(long code) preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; - +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_exit_task(tsk); +#endif schedule(); BUG(); /* Avoid "noreturn function does return". */ diff --git a/kernel/fork.c b/kernel/fork.c index ac62f43ee430..d1eb30e69ccc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1269,6 +1270,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(p); +#endif proc_fork_connector(p); cgroup_post_fork(p); return p; diff --git a/kernel/sched.c b/kernel/sched.c index 4de56108c86f..fb17205950de 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,6 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; +#ifdef CONFIG_FUNCTION_RET_TRACER + ftrace_retfunc_init_task(idle); +#endif } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f212da486689..90d99fb02ae4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_RET_TRACER +static atomic_t ftrace_retfunc_active; + /* The callback that hooks the return of a function */ trace_function_return_t ftrace_function_return = (trace_function_return_t)ftrace_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. 
*/ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + t->ret_stack = ret_stack_list[start++]; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +/* Allocate a return stack for each task */ +static int start_return_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + kfree(ret_stack_list); + return ret; +} + int register_ftrace_return(trace_function_return_t func) { int ret = 0; @@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func) ret = -EBUSY; goto out; } - + atomic_inc(&ftrace_retfunc_active); + ret = start_return_tracing(); + if (ret) { + atomic_dec(&ftrace_retfunc_active); + goto out; + } ftrace_tracing_type = FTRACE_TYPE_RETURN; ftrace_function_return = func; ftrace_startup(); @@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void) { mutex_lock(&ftrace_sysctl_lock); + atomic_dec(&ftrace_retfunc_active); ftrace_function_return = (trace_function_return_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ @@ -1537,6 +1610,27 @@ void unregister_ftrace_return(void) mutex_unlock(&ftrace_sysctl_lock); } + +/* Allocate a return stack for newly created task */ +void ftrace_retfunc_init_task(struct task_struct *t) +{ + if (atomic_read(&ftrace_retfunc_active)) { + t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!t->ret_stack) + return; + t->curr_ret_stack = -1; + atomic_set(&t->trace_overrun, 0); + } else + t->ret_stack = NULL; +} + +void ftrace_retfunc_exit_task(struct task_struct *t) +{ + kfree(t->ret_stack); + t->ret_stack = NULL; +} #endif -- cgit v1.2.3 From 050dc6944b9ca2186f4729ab44e0da3743933941 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 23 Nov 2008 13:35:48 +0100 Subject: x86: remove duplicate #define from 'cpufeature.h' Impact: cleanup Remove duplicate #define from 'cpufeature.h'. 
This also fixes the following sparse warning: arch/x86/kernel/cpu/capflags.c:54:3: warning: Initializer entry defined twice arch/x86/kernel/cpu/capflags.c:58:3: also defined here Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 694d1f8f1bee..5bce8ed02b44 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -80,7 +80,6 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ #define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ -- cgit v1.2.3 From ca0002a179bfa532d009a9272d619732872c49bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:01:25 +0100 Subject: x86, bts: base in-kernel ds interface on handles Impact: generalize the DS code to shared buffers Change the in-kernel ds.h interface to identify the tracer via a handle returned on ds_request_~(). Tracers used to be identified via their task_struct. The changes are required to allow DS to be shared between different tasks, which is needed for perfmon2 and for ftrace. For ptrace, the handle is stored in the traced task's task_struct. This should probably go into a (arch-specific) ptrace context some time. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 124 ++++----- arch/x86/kernel/ds.c | 679 +++++++++++++++++++++++----------------------- arch/x86/kernel/ptrace.c | 73 ++--- include/linux/sched.h | 9 + 4 files changed, 446 insertions(+), 439 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a95008457ea4..0af997de5f01 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -26,11 +26,18 @@ #include #include +#include #ifdef CONFIG_X86_DS struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); /* * Request BTS or PEBS @@ -38,21 +45,29 @@ struct task_struct; * Due to alignement constraints, the actual buffer may be slightly * smaller than the requested or provided buffer. * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism. * * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; * NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the requested or provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + * -1 if no interrupt threshold is requested. 
*/ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th); /* * Release BTS or PEBS resources @@ -61,37 +76,34 @@ extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, * * Returns 0 on success; -Eerrno otherwise * - * task: the task to release resources for; - * NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer); /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array. * (assuming an array of BTS/PEBS records) * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); /* * Provide a pointer to the BTS/PEBS record at parameter index. @@ -102,14 +114,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); * * Returns the size of a single record on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * index: the index of the requested record * record (out): pointer to the requested record */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer, size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer, size_t index, const void **record); /* @@ -129,38 +140,24 @@ extern int ds_access_pebs(struct task_struct *task, * * Returns the number of bytes written or -Eerrno. 
* - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() * buffer: the buffer to write * size: the size of the buffer */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer, const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer, const void *buffer, size_t size); -/* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. - */ -extern int ds_unchecked_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, - const void *buffer, size_t size); - /* * Reset the write pointer of the BTS/PEBS buffer. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer); /* * Clear the BTS/PEBS buffer and reset the write pointer. @@ -168,33 +165,30 @@ extern int ds_reset_pebs(struct task_struct *task); * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer); /* * Provide the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value (out): the counter reset value */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); /* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); /* * Initialization @@ -207,17 +201,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); /* * The DS context - part of struct thread_struct. 
*/ +#define MAX_SIZEOF_DS (12 * 8) + struct ds_context { /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char *ds; + unsigned char ds[MAX_SIZEOF_DS]; /* the owner of the BTS and PEBS configuration, respectively */ - struct task_struct *owner[2]; - /* buffer overflow notification function for BTS and PEBS */ - ds_ovfl_callback_t callback[2]; - /* the original buffer address */ - void *buffer[2]; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages[2]; + struct ds_tracer *owner[2]; /* use count */ unsigned long count; /* a pointer to the context location inside the thread_struct diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d6938d9351cf..96768e9cce99 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -28,6 +28,7 @@ #include #include #include +#include /* @@ -44,6 +45,35 @@ struct ds_configuration { }; static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; + /* the number of allocated pages for on-request allocated buffers */ + unsigned int pages; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -107,35 +137,15 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + /* * Locking is done only for allocating BTS or PEBS resources and for * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} - /* * We either support (system-wide) per-cpu or per-thread allocation. @@ -183,50 +193,12 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. 
*/ static DEFINE_PER_CPU(struct ds_context *, system_context); #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */ static inline struct ds_context *ds_get_context(struct task_struct *task) -{ - struct ds_context *context; - unsigned long irq; - - spin_lock_irqsave(&ds_lock, irq); - - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; - - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) { struct ds_context **p_context = (task ? &task->thread.ds_ctx : &this_system_context); @@ -238,16 +210,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) if (!context) return NULL; - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } - spin_lock_irqsave(&ds_lock, irq); if (*p_context) { - kfree(context->ds); kfree(context); context = *p_context; @@ -272,10 +237,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -296,13 +257,6 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); - - /* free any leftover buffers from tracers that did not - * deallocate them properly. 
*/ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); out: spin_unlock_irqrestore(&ds_lock, irq); @@ -312,21 +266,29 @@ static inline void ds_put_context(struct ds_context *context) /* * Handle a buffer overflow * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) +{ + switch (qual) { + case ds_bts: { + struct bts_tracer *tracer = + container_of(context->owner[qual], + struct bts_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + case ds_pebs: { + struct pebs_tracer *tracer = + container_of(context->owner[qual], + struct pebs_tracer, ds); + if (tracer->ovfl) + tracer->ovfl(tracer); + } + break; + } } @@ -343,23 +305,25 @@ static void ds_overflow(struct task_struct *task, struct ds_context *context, static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { unsigned long rlim, vm, pgsz; - void *buffer; + void *buffer = NULL; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + down_write(¤t->mm->mmap_sem); + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; vm = current->mm->total_vm + pgsz; if (rlim < vm) - return NULL; + goto out; rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; vm = current->mm->locked_vm + pgsz; if (rlim < vm) - return NULL; + goto out; buffer = kzalloc(size, GFP_KERNEL); if (!buffer) - return NULL; + goto out; current->mm->total_vm += pgsz; current->mm->locked_vm += pgsz; @@ -367,290 +331,337 @@ static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) if (pages) *pages = pgsz; + out: + up_write(¤t->mm->mmap_sem); return buffer; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static void ds_install_ds_config(struct ds_context *context, + enum ds_qualifier qual, + void *base, size_t size, size_t ith) { - struct ds_context *context; unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. 
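+ * (With ith == (size_t)-1, buffer + size - ith evaluates to buffer + size + 1, one byte past the end, so the threshold can never be reached.)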
+ */ + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size - ith); +} + +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, + struct task_struct *task, + void *base, size_t size, size_t th) +{ + struct ds_context *context; unsigned long irq; - int error = 0; + int error; + error = -EOPNOTSUPP; if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; + goto out; /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) - return -EINVAL; + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; + + error = -EINVAL; + if (size <= th) + goto out; + } + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &tracer->pages); + if (!base) + goto out; + } + tracer->buffer = base; + tracer->size = size; - context = ds_alloc_context(task); + error = -ENOMEM; + context = ds_get_context(task); if (!context) - return -ENOMEM; + goto out; + tracer->context = context; + spin_lock_irqsave(&ds_lock, irq); error = -EPERM; if (!check_tracer(task)) goto out_unlock; - get_tracer(task); - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; error = -EPERM; - if (context->owner[qual] != NULL) + if (context->owner[qual]) goto out_put_tracer; - context->owner[qual] = current; + context->owner[qual] = tracer; spin_unlock_irqrestore(&ds_lock, irq); - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - - context->buffer[qual] = base; - } - error = 0; + ds_install_ds_config(context, qual, base, size, th); - context->callback[qual] = ovfl; - - /* adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. 
- */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, alignment) - buffer; - buffer += adj; - size -= adj; - - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); - - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; + return 0; out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); put_tracer(task); - return error; - out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(context); + tracer->context = NULL; + out: return error; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th) { - return ds_request(task, base, size, ovfl, ds_bts); -} + struct bts_tracer *tracer; + int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; + + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; + + error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + if (error < 0) + goto out_tracer; + + return tracer; + + out_tracer: + (void)ds_release_bts(tracer); + out: + return ERR_PTR(error); } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th) { - struct ds_context *context; + struct pebs_tracer *tracer; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; + error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + if (error < 0) + goto out_tracer; - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); + return tracer; + + out_tracer: + (void)ds_release_pebs(tracer); out: - ds_put_context(context); - return error; + return ERR_PTR(error); +} + +static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) +{ + if (tracer->context) { + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; + + put_tracer(tracer->context->task); + ds_put_context(tracer->context); + } + + if (tracer->pages) { + kfree(tracer->buffer); + + down_write(¤t->mm->mmap_sem); + + current->mm->total_vm -= tracer->pages; + current->mm->locked_vm -= tracer->pages; + + up_write(¤t->mm->mmap_sem); + } } -int 
ds_release_bts(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer) { - return ds_release(task, ds_bts); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_bts); + kfree(tracer); + + return 0; } -int ds_release_pebs(struct task_struct *task) +int ds_release_pebs(struct pebs_tracer *tracer) { - return ds_release(task, ds_pebs); + if (!tracer) + return -EINVAL; + + ds_release(&tracer->ds, ds_pebs); + kfree(tracer); + + return 0; } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; unsigned long base, index; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_index(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_index(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) { - struct ds_context *context; - unsigned long base, end; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + unsigned long base, max; base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + max = ds_get(context->ds, qual, ds_absolute_maximum); - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; + return (max - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_bts); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_bts); + + return 0; } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) { - return ds_get_end(task, pos, ds_pebs); + if (!tracer) + return -EINVAL; + + if (!pos) + return -EINVAL; + + *pos = ds_get_end(tracer->ds.context, ds_pebs); + + return 0; } -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, + size_t index, const void **record) { - struct ds_context *context; unsigned long base, idx; - int error; if (!record) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - base = ds_get(context->ds, qual, ds_buffer_base); idx = base + (index * ds_cfg.sizeof_rec[qual]); - error = 
-EINVAL; if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; + return -EINVAL; *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; + + return ds_cfg.sizeof_rec[qual]; } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_bts); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_bts, index, record); } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, + const void **record) { - return ds_access(task, index, record, ds_pebs); + if (!tracer) + return -EINVAL; + + return ds_access(tracer->ds.context, ds_pebs, index, record); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - struct ds_context *context; - int error; + int bytes_written = 0; if (!record) return -EINVAL; - error = -EPERM; - context = ds_get_context(task); - if (!context) - goto out; - - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } - - error = 0; while (size) { unsigned long base, index, end, write_end, int_th; unsigned long write_size, adj_write_size; @@ -678,14 +689,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, write_end = end; if (write_end <= index) - goto out; + break; write_size = min((unsigned long) size, write_end - index); memcpy((void *)index, record, write_size); record = (const char *)record + write_size; - size -= write_size; - error += write_size; + size -= write_size; + bytes_written += write_size; adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -700,47 +711,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size, ds_set(context->ds, qual, ds_index, index); if (index >= int_th) - ds_overflow(task, context, qual); + ds_overflow(context, qual); } - out: - ds_put_context(context); - return error; + return bytes_written; } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + if (!tracer) + return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); + return ds_write(tracer->ds.context, ds_bts, record, size); } -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) { - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + if (!tracer) + return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + return ds_write(tracer->ds.context, ds_pebs, record, size); } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, + enum ds_qualifier qual, int clear) { - struct ds_context *context; unsigned long base, end; - int error; - - context = 
ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; base = ds_get(context->ds, qual, ds_buffer_base); end = ds_get(context->ds, qual, ds_absolute_maximum); @@ -749,70 +745,69 @@ static int ds_reset_or_clear(struct task_struct *task, memset((void *)base, 0, end - base); ds_set(context->ds, qual, ds_index, base); - - error = 0; - out: - ds_put_context(context); - return error; } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + + return 0; } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + + return 0; } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + + return 0; } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return -EINVAL; + + ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + + return 0; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; if (!value) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; - - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) { - struct ds_context *context; - int error; - - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + if (!tracer) + return -EINVAL; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; - error = 0; - out: - ds_put_context(context); - return error; + return 0; } static const struct ds_configuration ds_cfg_var = { @@ -840,6 +835,10 @@ static inline void ds_configure(const struct ds_configuration *cfg) { ds_cfg = *cfg; + + printk(KERN_INFO "DS available\n"); + + BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -883,6 +882,8 @@ void ds_free(struct ds_context *context) * is dying. There should not be any user of that context left * to disturb us, anymore. 
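 * (Each remaining reference was taken with a matching get_tracer(), hence the paired put_tracer() added below.)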
*/ unsigned long leftovers = context->count; - while (leftovers--) + while (leftovers--) { + put_tracer(context->task); ds_put_context(context); + } } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 06180dff5b2e..76adf5b640ff 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, size_t bts_index, bts_end; int error; - error = ds_get_bts_end(child, &bts_end); + error = ds_get_bts_end(child->bts, &bts_end); if (error < 0) return error; if (bts_end <= index) return -EINVAL; - error = ds_get_bts_index(child, &bts_index); + error = ds_get_bts_index(child->bts, &bts_index); if (error < 0) return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (bts_end <= bts_index) bts_index -= bts_end; - error = ds_access_bts(child, bts_index, &bts_record); + error = ds_access_bts(child->bts, bts_index, &bts_record); if (error < 0) return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child, size_t end, i; int error; - error = ds_get_bts_index(child, &end); + error = ds_get_bts_index(child->bts, &end); if (error < 0) return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); + error = ds_access_bts(child->bts, 0, (const void **)&raw); if (error < 0) return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child, return -EFAULT; } - error = ds_clear_bts(child); + error = ds_clear_bts(child->bts); if (error < 0) return error; return end; } -static void ptrace_bts_ovfl(struct task_struct *child) -{ - send_sig(child->thread.bts_ovfl_signal, child, 0); -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,29 @@ static int ptrace_bts_config(struct task_struct *child, goto errout; if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; + bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; + error = -EOPNOTSUPP; + goto errout; + sig = cfg.signal; - ovfl = ptrace_bts_ovfl; } - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) + if (child->bts) + (void)ds_release_bts(child->bts); + + child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + ovfl, /* th = */ (size_t)-1); + if (IS_ERR(child->bts)) { + error = PTR_ERR(child->bts); + child->bts = NULL; goto errout; + } child->thread.bts_ovfl_signal = sig; } @@ -823,15 +824,15 @@ static int ptrace_bts_status(struct task_struct *child, if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); + error = ds_get_bts_end(child->bts, &end); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ 0, &base); + error = ds_access_bts(child->bts, /* index = */ 0, &base); if (error < 0) return error; - error = ds_access_bts(child, /* index = */ end, &max); + error = ds_access_bts(child->bts, /* index = */ end, &max); if (error < 0) return error; @@ -884,10 +885,7 @@ static int ptrace_bts_write_record(struct task_struct *child, return -EINVAL; } - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. 
*/ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -972,13 +970,15 @@ void ptrace_disable(struct task_struct *child) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif #ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); + if (child->bts) { + (void)ds_release_bts(child->bts); - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1110,9 +1110,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + case PTRACE_BTS_SIZE: { + size_t size; + + ret = ds_get_bts_index(child->bts, &size); + if (ret == 0) { + BUG_ON(size != (int) size); + ret = (int) size; + } break; + } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1120,7 +1127,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ds_clear_bts(child->bts); break; case PTRACE_BTS_DRAIN: diff --git a/include/linux/sched.h b/include/linux/sched.h index bee1e93c95ad..a9780eaa6737 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,6 +96,7 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; +struct bts_tracer; /* * List of flags we want to share for kernel threads, @@ -1161,6 +1162,14 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; +#ifdef CONFIG_X86_PTRACE_BTS + /* + * This is the tracer handle for the ptrace BTS extension. + * This field actually belongs to the ptracer task. + */ + struct bts_tracer *bts; +#endif /* CONFIG_X86_PTRACE_BTS */ + /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; -- cgit v1.2.3 From 6abb11aecd888d1da6276399380b7355f127c006 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 25 Nov 2008 09:05:27 +0100 Subject: x86, bts, ptrace: move BTS buffer allocation from ds.c into ptrace.c Impact: restructure DS memory allocation to be done by the usage site of DS Require pre-allocated buffers in ds.h. Move the BTS buffer allocation for ptrace into ptrace.c. The pointer to the allocated buffer is stored in the traced task's task_struct together with the handle returned by ds_request_bts(). Removes memory accounting code. 
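For illustration only (not part of the patch), the calling convention this change establishes looks roughly as follows; the helper names are made up, but the ds_request_bts()/ds_release_bts() usage mirrors what the ptrace.c hunks below do:

	#include <linux/slab.h>	/* kzalloc, kfree */
	#include <linux/err.h>	/* IS_ERR, PTR_ERR */
	#include <asm/ds.h>

	/*
	 * Hypothetical user of the DS interface after this change: the
	 * caller allocates the (non-pageable) buffer itself, hands it to
	 * ds_request_bts(), and frees it again after releasing the
	 * tracer handle.
	 */
	static struct bts_tracer *tracer;
	static void *bts_buffer;

	static int start_bts(struct task_struct *task, size_t size)
	{
		int error;

		bts_buffer = kzalloc(size, GFP_KERNEL);
		if (!bts_buffer)
			return -ENOMEM;

		/* no overflow callback, no interrupt threshold */
		tracer = ds_request_bts(task, bts_buffer, size,
					/* ovfl = */ NULL,
					/* th = */ (size_t)-1);
		if (IS_ERR(tracer)) {
			error = PTR_ERR(tracer);
			kfree(bts_buffer);
			bts_buffer = NULL;
			tracer = NULL;
			return error;
		}

		return 0;
	}

	static void stop_bts(void)
	{
		if (!tracer)
			return;

		/* release the handle first, then the buffer we own */
		(void)ds_release_bts(tracer);
		kfree(bts_buffer);
		tracer = NULL;
		bts_buffer = NULL;
	}
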
Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 12 +++---- arch/x86/kernel/ds.c | 92 ++++++++--------------------------------------- arch/x86/kernel/ptrace.c | 22 ++++++++++-- include/linux/sched.h | 4 +++ 4 files changed, 42 insertions(+), 88 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 0af997de5f01..99b6c39774a4 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -54,8 +53,7 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; - * NULL if buffer allocation requested - * size: the size of the requested or provided buffer in bytes + * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; @@ -72,8 +70,6 @@ extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, /* * Release BTS or PEBS resources * - * Frees buffers allocated on ds_request. - * * Returns 0 on success; -Eerrno otherwise * * tracer: the tracer handle returned from ds_request_~() diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 96768e9cce99..19a8c2c0389f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@ * * It manages: * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done) * - buffer access * * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks * * * Copyright (C) 2007-2008 Intel Corporation. @@ -57,8 +56,6 @@ struct ds_tracer { /* the buffer provided on ds_request() and its size in bytes */ void *buffer; size_t size; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages; }; struct bts_tracer { @@ -141,8 +138,7 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. + * Locking is done only for allocating BTS or PEBS resources. */ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); @@ -292,50 +288,6 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. 
- * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) -{ - unsigned long rlim, vm, pgsz; - void *buffer = NULL; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; - - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - goto out; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out; - - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; - - if (pages) - *pages = pgsz; - - out: - up_write(¤t->mm->mmap_sem); - return buffer; -} - static void ds_install_ds_config(struct ds_context *context, enum ds_qualifier qual, void *base, size_t size, size_t ith) @@ -382,6 +334,10 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, if (!ds_cfg.sizeof_ds) goto out; + error = -EINVAL; + if (!base) + goto out; + /* we require some space to do alignment adjustments below */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) @@ -395,13 +351,6 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; } - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &tracer->pages); - if (!base) - goto out; - } - tracer->buffer = base; tracer->size = size; @@ -466,7 +415,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_bts(tracer); + kfree(tracer); out: return ERR_PTR(error); } @@ -496,31 +445,18 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; out_tracer: - (void)ds_release_pebs(tracer); + kfree(tracer); out: return ERR_PTR(error); } static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) { - if (tracer->context) { - BUG_ON(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); - } + BUG_ON(tracer->context->owner[qual] != tracer); + tracer->context->owner[qual] = NULL; - if (tracer->pages) { - kfree(tracer->buffer); - - down_write(¤t->mm->mmap_sem); - - current->mm->total_vm -= tracer->pages; - current->mm->locked_vm -= tracer->pages; - - up_write(¤t->mm->mmap_sem); - } + put_tracer(tracer->context->task); + ds_put_context(tracer->context); } int ds_release_bts(struct bts_tracer *tracer) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 76adf5b640ff..2c8ec1ba75e6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -758,6 +758,10 @@ static int ptrace_bts_config(struct task_struct *child, bts_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; + error = -EINVAL; + if (cfg.size < (10 * bts_cfg.sizeof_bts)) + goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) goto errout; @@ -768,14 +772,26 @@ static int ptrace_bts_config(struct task_struct *child, sig = cfg.signal; } - if (child->bts) + if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + + child->bts = NULL; + child->bts_buffer = NULL; + } + + error = -ENOMEM; + child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); + if (!child->bts_buffer) + goto errout; - child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size, + child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, ovfl, /* th = */ 
(size_t)-1); if (IS_ERR(child->bts)) { error = PTR_ERR(child->bts); + kfree(child->bts_buffer); child->bts = NULL; + child->bts_buffer = NULL; goto errout; } @@ -972,6 +988,8 @@ void ptrace_disable(struct task_struct *child) #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { (void)ds_release_bts(child->bts); + kfree(child->bts_buffer); + child->bts_buffer = NULL; child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (!child->thread.debugctlmsr) diff --git a/include/linux/sched.h b/include/linux/sched.h index a9780eaa6737..d02a0ca70ee9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1168,6 +1168,10 @@ struct task_struct { * This field actually belongs to the ptracer task. */ struct bts_tracer *bts; + /* + * The buffer to hold the BTS data. + */ + void *bts_buffer; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v1.2.3 From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Nov 2008 21:07:04 +0100 Subject: tracing/function-return-tracer: change the name into function-graph-tracer Impact: cleanup This patch changes the name of the "return function tracer" into function-graph-tracer which is a more suitable name for a tracing which makes one able to retrieve the ordered call stack during the code flow. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/ftrace.h | 4 +- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/entry_32.S | 12 ++--- arch/x86/kernel/ftrace.c | 12 ++--- include/linux/ftrace.h | 24 ++++----- include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 2 +- kernel/Makefile | 2 +- kernel/fork.c | 4 +- kernel/sched.c | 2 +- kernel/trace/Kconfig | 19 ++++--- kernel/trace/Makefile | 2 +- kernel/trace/ftrace.c | 26 +++++----- kernel/trace/trace.c | 18 +++---- kernel/trace/trace.h | 12 ++--- kernel/trace/trace_functions_graph.c | 98 ++++++++++++++++++++++++++++++++++++ 17 files changed, 173 insertions(+), 72 deletions(-) create mode 100644 kernel/trace/trace_functions_graph.c (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e49a4fd718fe..0842b1127684 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_RET_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 754a3e082f94..7e61b4ceb9a4 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,7 +28,7 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef __ASSEMBLY__ @@ -51,6 +51,6 @@ struct ftrace_ret_stack { extern void return_to_handler(void); #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index af2bc36ca1c4..64939a0c3986 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef 
CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg endif @@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 74defe21ba42..2b1f0f081a6b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,9 +1188,9 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace -#ifdef CONFIG_FUNCTION_RET_TRACER - cmpl $ftrace_stub, ftrace_function_return - jnz ftrace_return_caller +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_function + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1215,8 +1215,8 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER -ENTRY(ftrace_return_caller) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub @@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller) popl %ecx popl %eax ret -END(ftrace_return_caller) +END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bb137f7297ed..3595a4c14aba 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data) } #endif -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef CONFIG_DYNAMIC_FTRACE @@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, */ unsigned long ftrace_return_to_handler(void) { - struct ftrace_retfunc trace; + struct ftrace_graph_ret trace; pop_return_trace(&trace.ret, &trace.calltime, &trace.func, &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); + ftrace_graph_function(&trace); return trace.ret; } @@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ); if (WARN_ON(faulted)) { - unregister_ftrace_return(); + unregister_ftrace_graph(); return; } if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); + unregister_ftrace_graph(); *parent = old; return; } @@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87b97b2..b4ac734ad8d6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -#ifdef CONFIG_FUNCTION_RET_TRACER -extern void ftrace_return_caller(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern void ftrace_graph_caller(void); #endif /** @@ -315,7 +315,7 @@ ftrace_init_module(struct module *mod, /* * Structure that defines a return function trace. 
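 * (One record is produced per traced return; ftrace_return_to_handler() fills it in and passes it to the registered callback.)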
*/ -struct ftrace_retfunc { +struct ftrace_graph_ret { unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; @@ -324,22 +324,22 @@ struct ftrace_retfunc { unsigned long overrun; }; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ -typedef void (*trace_function_return_t)(struct ftrace_retfunc *); +typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); -extern int register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_graph(trace_function_graph_t func); /* The current handler in use */ -extern trace_function_return_t ftrace_function_return; -extern void unregister_ftrace_return(void); +extern trace_function_graph_t ftrace_graph_function; +extern void unregister_ftrace_graph(void); -extern void ftrace_retfunc_init_task(struct task_struct *t); -extern void ftrace_retfunc_exit_task(struct task_struct *t); +extern void ftrace_graph_init_task(struct task_struct *t); +extern void ftrace_graph_exit_task(struct task_struct *t); #else -static inline void ftrace_retfunc_init_task(struct task_struct *t) { } -static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_task(struct task_struct *t) { } +static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0b4df55d7a74..366a054d0b05 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index d02a0ca70ee9..7ad48f2a2758 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,7 +1365,7 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored adress in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ diff --git a/kernel/Makefile b/kernel/Makefile index 03a45e7e87b7..703cf3b7389c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() CFLAGS_REMOVE_module.o = -pg # For __module_text_address() endif diff --git a/kernel/fork.c b/kernel/fork.c index d6e1a3205f62..5f82a999c032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); - ftrace_retfunc_exit_task(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_retfunc_init_task(p); + ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); 
return p; diff --git a/kernel/sched.c b/kernel/sched.c index 388d9db044ab..52490bf6b884 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_retfunc_init_task(idle); + ftrace_graph_init_task(idle); } /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620feadff67a..eb9b901e0777 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,7 +12,7 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool -config HAVE_FUNCTION_RET_TRACER +config HAVE_FUNCTION_GRAPH_TRACER bool config HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -63,15 +63,18 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. -config FUNCTION_RET_TRACER - bool "Kernel Function return Tracer" - depends on HAVE_FUNCTION_RET_TRACER +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER help - Enable the kernel to trace a function at its return. - It's first purpose is to trace the duration of functions. - This is done by setting the current return address on the thread - info structure of the current task. + Enable the kernel to trace a function at both its return + and its entry. + It's first purpose is to trace the duration of functions and + draw a call graph for each thread with some informations like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb4e822..08c5fe6ddc09 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53042f118f23..9e19976af727 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_tracing_type == FTRACE_TYPE_ENTER) ftrace_addr = (unsigned long)ftrace_caller; else - ftrace_addr = (unsigned long)ftrace_return_caller; + ftrace_addr = (unsigned long)ftrace_graph_caller; #else ftrace_addr = (unsigned long)ftrace_caller; #endif @@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_retfunc_active; /* The callback that hooks the return of a function */ -trace_function_return_t ftrace_function_return = - (trace_function_return_t)ftrace_stub; +trace_function_graph_t ftrace_graph_function = + (trace_function_graph_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. 
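 * (FTRACE_RETSTACK_ALLOC_SIZE is 32, so the return stacks are allocated and handed out in chunks of that size.)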
*/ @@ -1549,7 +1549,7 @@ free: } /* Allocate a return stack for each task */ -static int start_return_tracing(void) +static int start_graph_tracing(void) { struct ftrace_ret_stack **ret_stack_list; int ret; @@ -1569,7 +1569,7 @@ static int start_return_tracing(void) return ret; } -int register_ftrace_return(trace_function_return_t func) +int register_ftrace_graph(trace_function_graph_t func) { int ret = 0; @@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func) goto out; } atomic_inc(&ftrace_retfunc_active); - ret = start_return_tracing(); + ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_retfunc_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_function_return = func; + ftrace_graph_function = func; ftrace_startup(); out: @@ -1598,12 +1598,12 @@ out: return ret; } -void unregister_ftrace_return(void) +void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); atomic_dec(&ftrace_retfunc_active); - ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_graph_function = (trace_function_graph_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void) } /* Allocate a return stack for newly created task */ -void ftrace_retfunc_init_task(struct task_struct *t) +void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_retfunc_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH @@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t) t->ret_stack = NULL; } -void ftrace_retfunc_exit_task(struct task_struct *t) +void ftrace_graph_exit_task(struct task_struct *t) { struct ftrace_ret_stack *ret_stack = t->ret_stack; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8df8fdd69c95..f21ab2c68fd4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -static void __trace_function_return(struct trace_array *tr, +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_function_graph(struct trace_array *tr, struct trace_array_cpu *data, - struct ftrace_retfunc *trace, + struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_ret_entry *entry; + struct ftrace_graph_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -void trace_function_return(struct ftrace_retfunc *trace) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void trace_function_graph(struct ftrace_graph_ret *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_return(tr, data, trace, flags, pc); + __trace_function_graph(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static struct ftrace_ops trace_ops __read_mostly = { @@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case 
TRACE_FN_RET: { - return print_return_function(iter); + return print_graph_function(iter); break; } case TRACE_BRANCH: { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645e8af2..72b5ef868765 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -57,7 +57,7 @@ struct ftrace_entry { }; /* Function return entry */ -struct ftrace_ret_entry { +struct ftrace_graph_entry { struct trace_entry ent; unsigned long ip; unsigned long parent_ip; @@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void -trace_function_return(struct ftrace_retfunc *trace); +trace_function_graph(struct ftrace_graph_ret *trace); void trace_bts(struct trace_array *tr, unsigned long from, @@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ -#ifdef CONFIG_FUNCTION_RET_TRACER -extern enum print_line_t print_return_function(struct trace_iterator *iter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); #else static inline enum print_line_t -print_return_function(struct trace_iterator *iter) +print_graph_function(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 000000000000..f5bad4624d2b --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,98 @@ +/* + * + * Function graph tracer. 
+ * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + return register_ftrace_graph(&trace_function_graph); +} + +static void graph_trace_reset(struct trace_array *tr) +{ + unregister_ftrace_graph(); +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " (%llu ns)", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer graph_trace __read_mostly = { + .name = "function-graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); -- cgit v1.2.3 From 1d9b16d1690fe5edb1c907fe4746681cf026cdf3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 27 Nov 2008 18:39:15 +0100 Subject: x86: move GART specific stuff from iommu.h to gart.h Impact: cleanup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 33 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/iommu.h | 33 --------------------------------- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + arch/x86/kernel/early-quirks.c | 1 + arch/x86/kernel/pci-dma.c | 1 + arch/x86/kernel/setup.c | 1 + 7 files changed, 38 insertions(+), 33 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 74252264433d..6cfdafa409d8 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -29,6 +29,39 @@ extern int fix_aperture; #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +#ifdef CONFIG_GART_IOMMU +extern int gart_iommu_aperture; +extern int gart_iommu_aperture_allowed; +extern int gart_iommu_aperture_disabled; + +extern void early_gart_iommu_check(void); +extern void gart_iommu_init(void); +extern void gart_iommu_shutdown(void); +extern void __init gart_parse_options(char *); +extern void gart_iommu_hole_init(void); + +#else +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 + +static inline void 
early_gart_iommu_check(void) +{ +} +static inline void gart_iommu_init(void) +{ +} +static inline void gart_iommu_shutdown(void) +{ +} +static inline void gart_parse_options(char *options) +{ +} +static inline void gart_iommu_hole_init(void) +{ +} +#endif + extern int agp_amd64_init(void); static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0b500c5b6446..295b13193f4d 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -12,37 +12,4 @@ extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) -#ifdef CONFIG_GART_IOMMU -extern int gart_iommu_aperture; -extern int gart_iommu_aperture_allowed; -extern int gart_iommu_aperture_disabled; - -extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); -extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); - -#else -#define gart_iommu_aperture 0 -#define gart_iommu_aperture_allowed 0 -#define gart_iommu_aperture_disabled 1 - -static inline void early_gart_iommu_check(void) -{ -} -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} -static inline void gart_parse_options(char *options) -{ -} -static inline void gart_iommu_hole_init(void) -{ -} -#endif - #endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 331b318304eb..172e0dc4641e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0cdcda35a05f..7685f0774a8f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * definitions for the ACPI scanning code diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1b894b72c0f5..744aa7fc49d5 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include #include #include +#include static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 192624820217..12eeb4bfcdeb 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0fa6790c1dd3..67d5979e654e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 2c5643b1c5c7fbb13f340d4c58944d9642f41796 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sun, 30 Nov 2008 17:16:04 +0900 Subject: x86: provide readq()/writeq() on 32-bit too Impact: add new API for drivers Add implementation of readq/writeq to x86_32, and add config value to the x86 architecture to determine existence of readq/writeq. 
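As a rough sketch of the idea (not the patch text itself; the helper name is invented), a 64-bit MMIO access on a 32-bit kernel can only be emulated by two 32-bit accesses, so the combined access is inherently non-atomic:

/* illustrative only: compose one 64-bit read from two 32-bit reads */
static inline u64 split_readq(const volatile void __iomem *addr)
{
	const volatile u32 __iomem *p = addr;
	u32 low, high;

	low = readl(p);		/* low dword first */
	high = readl(p + 1);	/* then the high dword at addr + 4 */

	return low | ((u64)high << 32);
}

A register that changes between the two readl() calls can be observed torn; a driver that needs a truly atomic 64-bit access cannot rely on such an emulation and must use device-specific latching or locking.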
Signed-off-by: Hitoshi Mitake Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 ++ arch/x86/include/asm/io.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..a7d50f5d118c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,8 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ac2abc88cd95..25946449df4f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -57,6 +58,29 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) /* Let people know we have them */ #define readq readq #define writeq writeq + +#else /* CONFIG_X86_32 from here */ + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 l, h; + + l = readl(p); + h = readl(p + 1); + + return l + ((u64)h << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#define readq readq +#define writeq writeq + #endif extern int iommu_bio_merge; -- cgit v1.2.3 From a0b1131e479e5af32eefac8bc54c9742e23d638e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 09:33:55 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, cleanup Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 25946449df4f..3ccfaf610c89 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -55,21 +55,17 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) #define __raw_readq __readq #define __raw_writeq writeq -/* Let people know we have them */ -#define readq readq -#define writeq writeq - #else /* CONFIG_X86_32 from here */ static inline __u64 readq(const volatile void __iomem *addr) { const volatile u32 __iomem *p = addr; - u32 l, h; + u32 low, high; - l = readl(p); - h = readl(p + 1); + low = readl(p); + high = readl(p + 1); - return l + ((u64)h << 32); + return low + ((u64)high << 32); } static inline void writeq(__u64 val, volatile void __iomem *addr) @@ -78,11 +74,12 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) writel(val >> 32, addr+4); } +#endif + +/* Let people know that we have them */ #define readq readq #define writeq writeq -#endif - extern int iommu_bio_merge; #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 93093d099e5dd0c258fd530c12668e828c20df41 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 30 Nov 2008 10:20:20 +0100 Subject: x86: provide readq()/writeq() on 32-bit too, complete if HAVE_READQ/HAVE_WRITEQ are defined, the full range of readq/writeq APIs has to be provided to drivers: drivers/infiniband/hw/amso1100/c2.c: In function 'c2_tx_ring_alloc': drivers/infiniband/hw/amso1100/c2.c:133: error: implicit declaration of function '__raw_writeq' So provide them on 32-bit as well. Also, map all the APIs to the strongest ordering variant. 
It's way too easy to mess such details up in drivers and the difference between "memory" and "" constrained asm() constructs is in the noise range. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 3ccfaf610c89..33513b9a67f3 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -46,16 +46,11 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) - -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq -#else /* CONFIG_X86_32 from here */ +#else static inline __u64 readq(const volatile void __iomem *addr) { @@ -76,9 +71,14 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #endif +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + /* Let people know that we have them */ -#define readq readq -#define writeq writeq +#define readq readq +#define writeq writeq extern int iommu_bio_merge; -- cgit v1.2.3 From 50cec5c51c18301ff60262fdbe920f4a907c9d81 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 2 Dec 2008 02:17:15 +0900 Subject: x86: fix dma_mapping_error for 32bit x86, cleanup This removes ifdef CONFIG_X86_64 in dma_mapping_error(): 1) Xen people plan to use swiotlb on X86_32 for Dom0 support. swiotlb uses ops->mapping_error so X86_32 also needs to check ops->mapping_error. 2) Removing #ifdef hack is almost always a good thing. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 097794ff6b79..dc22c0733282 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -71,12 +71,10 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { -#ifdef CONFIG_X86_64 struct dma_mapping_ops *ops = get_dma_ops(dev); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); -#endif return (dma_addr == bad_dma_address); } -- cgit v1.2.3 From 181de82ee3ffda1175f89d50c991dae31b79280c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 3 Dec 2008 14:53:04 +0900 Subject: x86: remove dead BIO_VMERGE_BOUNDARY definition Impact: cleanup, remove dead code The block layer dropped the virtual merge feature (b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5). BIO_VMERGE_BOUNDARY definition is meaningless now. 
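Relating back to the dma_mapping_error() change above, a minimal usage sketch (the dev, buf and len names are invented for illustration): every mapping a driver creates should be checked through this one entry point, which now consults ops->mapping_error on 32-bit kernels as well before falling back to the bad_dma_address comparison:

	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;	/* e.g. a swiotlb bounce buffer ran out */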
Signed-off-by: FUJITA Tomonori Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 2 -- arch/x86/include/asm/io_64.h | 2 -- arch/x86/kernel/pci-dma.c | 6 ------ 3 files changed, 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 33513b9a67f3..05cfed4485fa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -80,8 +80,6 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #define readq readq #define writeq writeq -extern int iommu_bio_merge; - #ifdef CONFIG_X86_32 # include "io_32.h" #else diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index fea325a1122f..563c16270ba6 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c); #define flush_write_buffers() -#define BIO_VMERGE_BOUNDARY iommu_bio_merge - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 12eeb4bfcdeb..da93c65f8f0b 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -31,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -189,7 +184,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } -- cgit v1.2.3 From affa219b60a11b3295637a97f5b1b8ef231490fc Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 3 Dec 2008 18:58:19 -0500 Subject: x86: change thread_info's flag field back to 32 bits Impact: pack struct thread_info more tightly Change x86_64's thread_info 'flags' field back to __u32. This was changed to 'unsigned long' when the thread_info*.h for i386 and x86_64 were merged. Change it back. We can do this as only 27 bits of 'flags' are actually used. This change actually packs down thread_info by 64 bits: 32 bits are saved by the smaller flags, and 32 bits are saved by the following 'mm_segment_t field' becoming naturally 64-bit aligned. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..8dbc57390d25 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -24,7 +24,7 @@ struct exec_domain; struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ + __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, -- cgit v1.2.3 From 3e1e9002aa8b32bd4c95ac6c8fad376b7a8127fb Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 8 Dec 2008 00:50:22 +0100 Subject: x86: change static allocation of trampoline area Impact: fix trampoline sizing bug, save space While debugging a suspend-to-RAM related issue it occured to me that if the trampoline code had grown past 4 KB, we would have been allocating too little memory for it, since the 4 KB size of the trampoline is hardcoded into arch/x86/kernel/e820.c . Change that by making the kernel compute the trampoline size and allocate as much memory as necessary. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trampoline.h | 7 +++++++ arch/x86/kernel/e820.c | 16 ---------------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/head64.c | 3 +++ arch/x86/kernel/trampoline.c | 19 +++++++++++++++++-- 5 files changed, 30 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index fa0d79facdbc..780ba0ab94f9 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,6 +3,7 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_TRAMPOLINE /* * Trampoline 80x86 program as an array. */ @@ -13,8 +14,14 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 + extern unsigned long setup_trampoline(void); +extern void __init reserve_trampoline_memory(void); +#else +static inline void reserve_trampoline_memory(void) {}; +#endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7aafeb5263ef..65a13943e098 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -677,22 +677,6 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. 
- */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index fa1d25dd83e3..ac108d1fe182 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,9 +12,12 @@ #include #include #include +#include void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d16084f90649..388e05a5fc17 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include /* boot cpu pda */ static struct x8664_pda _boot_cpu_pda __read_mostly; @@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 1106fac6024d..808031a5ba19 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,10 +1,26 @@ #include #include +#include /* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +void __init reserve_trampoline_memory(void) +{ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); +#endif + /* Has to be in very low memory so we can execute real-mode AP code. */ + reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, + "TRAMPOLINE"); +} + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); */ unsigned long setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); + memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } -- cgit v1.2.3 From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 11 Dec 2008 13:49:59 +0100 Subject: x86, bts: provide in-kernel branch-trace interface Impact: cleanup Move the BTS bits from ptrace.c into ds.c. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 241 ++++++----- arch/x86/include/asm/processor.h | 13 + arch/x86/include/asm/ptrace.h | 36 -- arch/x86/include/asm/thread_info.h | 5 +- arch/x86/kernel/cpu/intel.c | 4 - arch/x86/kernel/ds.c | 857 +++++++++++++++++++++++-------------- arch/x86/kernel/process_32.c | 59 +-- arch/x86/kernel/process_64.c | 50 +-- arch/x86/kernel/ptrace.c | 416 +++++------------- include/linux/sched.h | 1 + 10 files changed, 811 insertions(+), 871 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 99b6c39774a4..ee0ea3a96c11 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). 
* * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -31,6 +31,7 @@ #ifdef CONFIG_X86_DS struct task_struct; +struct ds_context; struct ds_tracer; struct bts_tracer; struct pebs_tracer; @@ -38,6 +39,38 @@ struct pebs_tracer; typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); + +/* + * A list of features plus corresponding macros to talk about them in + * the ds_request function's flags parameter. + * + * We use the enum to index an array of corresponding control bits; + * we use the macro to index a flags bit-vector. + */ +enum ds_feature { + dsf_bts = 0, + dsf_bts_kernel, +#define BTS_KERNEL (1 << dsf_bts_kernel) + /* trace kernel-mode branches */ + + dsf_bts_user, +#define BTS_USER (1 << dsf_bts_user) + /* trace user-mode branches */ + + dsf_bts_overflow, + dsf_bts_max, + dsf_pebs = dsf_bts_max, + + dsf_pebs_max, + dsf_ctl_max = dsf_pebs_max, + dsf_bts_timestamps = dsf_ctl_max, +#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) + /* add timestamps into BTS trace */ + +#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) +}; + + /* * Request BTS or PEBS * @@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); * NULL if cyclic buffer requested * th: the interrupt threshold in records from the end of the buffer; * -1 if no interrupt threshold is requested. + * flags: a bit-mask of the above flags */ extern struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th); + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, pebs_ovfl_callback_t ovfl, - size_t th); + size_t th, unsigned int flags); /* * Release BTS or PEBS resources - * - * Returns 0 on success; -Eerrno otherwise + * Suspend and resume BTS or PEBS tracing * * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct bts_tracer *tracer); -extern int ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_release_bts(struct bts_tracer *tracer); +extern void ds_suspend_bts(struct bts_tracer *tracer); +extern void ds_resume_bts(struct bts_tracer *tracer); +extern void ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_suspend_pebs(struct pebs_tracer *tracer); +extern void ds_resume_pebs(struct pebs_tracer *tracer); + /* - * Get the (array) index of the write pointer. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error + * The raw DS buffer state as it is used for BTS and PEBS recording. * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * This is the low-level, arch-dependent interface for working + * directly on the raw trace data. 
*/ -extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos); +struct ds_trace { + /* the number of bts/pebs records */ + size_t n; + /* the size of a bts/pebs record in bytes */ + size_t size; + /* pointers into the raw buffer: + - to the first entry */ + void *begin; + /* - one beyond the last entry */ + void *end; + /* - one beyond the newest entry */ + void *top; + /* - the interrupt threshold */ + void *ith; + /* flags given on ds_request() */ + unsigned int flags; +}; /* - * Get the (array) index one record beyond the end of the array. - * (assuming an array of BTS/PEBS records) - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - * pos (out): will hold the result + * An arch-independent view on branch trace data. */ -extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); -extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos); +enum bts_qualifier { + bts_invalid, +#define BTS_INVALID bts_invalid + + bts_branch, +#define BTS_BRANCH bts_branch + + bts_task_arrives, +#define BTS_TASK_ARRIVES bts_task_arrives + + bts_task_departs, +#define BTS_TASK_DEPARTS bts_task_departs + + bts_qual_bit_size = 4, + bts_qual_max = (1 << bts_qual_bit_size), +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from; + __u64 to; + } lbr; + /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ + struct { + __u64 jiffies; + pid_t pid; + } timestamp; + } variant; +}; + /* - * Provide a pointer to the BTS/PEBS record at parameter index. - * (assuming an array of BTS/PEBS records) - * - * The pointer points directly into the buffer. The user is - * responsible for copying the record. - * - * Returns the size of a single record on success; -Eerrno on error + * The BTS state. * - * tracer: the tracer handle returned from ds_request_~() - * index: the index of the requested record - * record (out): pointer to the requested record + * This gives access to the raw DS state and adds functions to provide + * an arch-independent view of the BTS data. */ -extern int ds_access_bts(struct bts_tracer *tracer, - size_t index, const void **record); -extern int ds_access_pebs(struct pebs_tracer *tracer, - size_t index, const void **record); +struct bts_trace { + struct ds_trace ds; + + int (*read)(struct bts_tracer *tracer, const void *at, + struct bts_struct *out); + int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); +}; + /* - * Write one or more BTS/PEBS records at the write pointer index and - * advance the write pointer. + * The PEBS state. * - * If size is not a multiple of the record size, trailing bytes are - * zeroed out. - * - * May result in one or more overflow notifications. - * - * If called during overflow handling, that is, with index >= - * interrupt threshold, the write will wrap around. + * This gives access to the raw DS state and the PEBS-specific counter + * reset value. + */ +struct pebs_trace { + struct ds_trace ds; + + /* the PEBS reset value */ + unsigned long long reset_value; +}; + + +/* + * Read the BTS or PEBS trace. * - * An overflow notification is given if and when the interrupt - * threshold is reached during or after the write. + * Returns a view on the trace collected for the parameter tracer. * - * Returns the number of bytes written or -Eerrno. + * The view remains valid as long as the traced task is not running or + * the tracer is suspended. 
+ * Writes into the trace buffer are not reflected. * * tracer: the tracer handle returned from ds_request_~() - * buffer: the buffer to write - * size: the size of the buffer */ -extern int ds_write_bts(struct bts_tracer *tracer, - const void *buffer, size_t size); -extern int ds_write_pebs(struct pebs_tracer *tracer, - const void *buffer, size_t size); +extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); +extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); + /* * Reset the write pointer of the BTS/PEBS buffer. @@ -155,27 +231,6 @@ extern int ds_write_pebs(struct pebs_tracer *tracer, extern int ds_reset_bts(struct bts_tracer *tracer); extern int ds_reset_pebs(struct pebs_tracer *tracer); -/* - * Clear the BTS/PEBS buffer and reset the write pointer. - * The entire buffer will be zeroed out. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_clear_bts(struct bts_tracer *tracer); -extern int ds_clear_pebs(struct pebs_tracer *tracer); - -/* - * Provide the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * value (out): the counter reset value - */ -extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value); - /* * Set the PEBS counter reset value. * @@ -192,35 +247,17 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); struct cpuinfo_x86; extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - - /* - * The DS context - part of struct thread_struct. + * Context switch work */ -#define MAX_SIZEOF_DS (12 * 8) - -struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ - struct ds_tracer *owner[2]; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; - -/* called by exit_thread() to free leftover contexts */ -extern void ds_free(struct ds_context *context); +extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} +static inline void ds_switch_to(struct task_struct *prev, + struct task_struct *next) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e383269..aa5914f8e501 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index eefb0594b058..fbf744215911 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,6 @@ #include #ifdef __KERNEL__ -#include /* the 
DS BTS struct is used for ptrace too */ #include #endif @@ -128,34 +127,6 @@ struct pt_regs { #endif /* !__i386__ */ -#ifdef CONFIG_X86_PTRACE_BTS -/* a branch trace record entry - * - * In order to unify the interface between various processor versions, - * we use the below data structure for all processors. - */ -enum bts_qualifier { - BTS_INVALID = 0, - BTS_BRANCH, - BTS_TASK_ARRIVES, - BTS_TASK_DEPARTS -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from_ip; - __u64 to_ip; - } lbr; - /* BTS_TASK_ARRIVES or - BTS_TASK_DEPARTS */ - __u64 jiffies; - } variant; -}; -#endif /* CONFIG_X86_PTRACE_BTS */ - #ifdef __KERNEL__ #include @@ -163,13 +134,6 @@ struct bts_struct { struct cpuinfo_x86; struct task_struct; -#ifdef CONFIG_X86_PTRACE_BTS -extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); -extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); -#else -#define ptrace_bts_init_intel(config) do {} while (0) -#endif /* CONFIG_X86_PTRACE_BTS */ - extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0921b4018c11..bf8113d16a33 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -93,7 +93,6 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,7 +114,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -141,8 +139,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 816f27f289b1..cd413d9a0218 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -309,9 +308,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (cpu_has_bts) - ptrace_bts_init_intel(c); - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 095306988667..f0583005b75e 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,13 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS + * - DS and BTS hardware configuration * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all traced tasks - * - current is allowed to trace tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. 
@@ -34,15 +34,30 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); + +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + /* * A BTS or PEBS tracer. @@ -61,6 +76,8 @@ struct ds_tracer { struct bts_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; /* buffer overflow notification function */ bts_ovfl_callback_t ovfl; }; @@ -68,6 +85,8 @@ struct bts_tracer { struct pebs_tracer { /* the common DS part */ struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; /* buffer overflow notification function */ pebs_ovfl_callback_t ovfl; }; @@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, (*(unsigned long *)base) = value; } -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ - /* * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); +static DEFINE_SPINLOCK(ds_lock); /* @@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -190,14 +212,30 @@ static inline int check_tracer(struct task_struct *task) * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. 
*/ -static DEFINE_PER_CPU(struct ds_context *, system_context); +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; + +static DEFINE_PER_CPU(struct ds_context *, system_context_array); -#define this_system_context per_cpu(system_context, smp_processor_id()) +#define system_context per_cpu(system_context_array, smp_processor_id()) static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); + (task ? &task->thread.ds_ctx : &system_context); struct ds_context *context = *p_context; unsigned long irq; @@ -225,10 +263,22 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); } + + context->count++; + + spin_unlock_irqrestore(&ds_lock, irq); + } else { + spin_lock_irqsave(&ds_lock, irq); + + context = *p_context; + if (context) + context->count++; + spin_unlock_irqrestore(&ds_lock, irq); - } - context->count++; + if (!context) + context = ds_get_context(task); + } return context; } @@ -242,8 +292,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -253,14 +305,14 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - kfree(context); - out: spin_unlock_irqrestore(&ds_lock, irq); + + kfree(context); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * * context: the ds context * qual: the buffer type @@ -268,30 +320,244 @@ static inline void ds_put_context(struct ds_context *context) static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { switch (qual) { - case ds_bts: { - struct bts_tracer *tracer = - container_of(context->owner[qual], - struct bts_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); - } + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); break; - case ds_pebs: { - struct pebs_tracer *tracer = - container_of(context->owner[qual], - struct pebs_tracer, ds); - if (tracer->ovfl) - tracer->ovfl(tracer); } +} + + +/* + * Write raw data into the BTS or PEBS buffer. + * + * The remainder of any partially written record is zeroed out. 
+ * + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data + */ +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) +{ + int bytes_written = 0; + + if (!record) + return -EINVAL; + + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; +} + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) +{ + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field); + (*(unsigned long *)base) = val; +} + + +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer.
+ * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) + return -EINVAL; + + if (at < tracer->trace.ds.begin) + return -EINVAL; + + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; + + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + } + + return ds_cfg.sizeof_rec[ds_bts]; +} + +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; + + if (!tracer) + return -EINVAL; + + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; + + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); break; + default: + return -EINVAL; } + + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); } -static void ds_install_ds_config(struct ds_context *context, - enum ds_qualifier qual, - void *base, size_t size, size_t ith) +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) { + unsigned char *ds = context->ds; + + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment @@ -308,32 +574,30 @@ static void ds_install_ds_config(struct ds_context *context, buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + size = (trace->n * trace->size); + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); /* The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. 
*/ - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size - ith); + trace->ith = (void *)(buffer + size - ith); + + trace->flags = flags; } -static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, - struct task_struct *task, - void *base, size_t size, size_t th) + +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; - unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_ds) - goto out; - error = -EINVAL; if (!base) goto out; @@ -360,43 +624,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, goto out; tracer->context = context; + ds_init_ds_trace(trace, qual, base, size, th, flags); - spin_lock_irqsave(&ds_lock, irq); - - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - - error = -EPERM; - if (context->owner[qual]) - goto out_put_tracer; - context->owner[qual] = tracer; - - spin_unlock_irqrestore(&ds_lock, irq); - - - ds_install_ds_config(context, qual, base, size, th); - - return 0; - - out_put_tracer: - put_tracer(task); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - tracer->context = NULL; + error = 0; out: return error; } struct bts_tracer *ds_request_bts(struct task_struct *task, void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th) + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; + unsigned long irq; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) + goto out; + /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -408,12 +655,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_bts, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); if (error < 0) goto out_tracer; + + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: @@ -422,9 +697,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, struct pebs_tracer *ds_request_pebs(struct task_struct *task, void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th) + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; + unsigned long irq; int error; /* buffer overflow notification is not yet implemented */ @@ -438,300 +715,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, goto out; tracer->ovfl = ovfl; - error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); if (error < 0) goto out_tracer; + spin_lock_irqsave(&ds_lock, irq); + + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); + + error = -EPERM; + if 
(tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; + + spin_unlock_irqrestore(&ds_lock, irq); + + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); + return tracer; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); out: return ERR_PTR(error); } -static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) -{ - WARN_ON_ONCE(tracer->context->owner[qual] != tracer); - tracer->context->owner[qual] = NULL; - - put_tracer(tracer->context->task); - ds_put_context(tracer->context); -} - -int ds_release_bts(struct bts_tracer *tracer) +void ds_release_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; + return; - ds_release(&tracer->ds, ds_bts); - kfree(tracer); + ds_suspend_bts(tracer); - return 0; -} + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; -int ds_release_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - ds_release(&tracer->ds, ds_pebs); kfree(tracer); - - return 0; -} - -static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, index; - - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - - return (index - base) / ds_cfg.sizeof_rec[qual]; } -int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos) +void ds_suspend_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; + struct task_struct *task; - if (!pos) - return -EINVAL; - - *pos = ds_get_index(tracer->ds.context, ds_bts); - - return 0; -} - -int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; + return; - if (!pos) - return -EINVAL; + task = tracer->ds.context->task; - *pos = ds_get_index(tracer->ds.context, ds_pebs); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); - return 0; -} + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; -static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) -{ - unsigned long base, max; - - base = ds_get(context->ds, qual, ds_buffer_base); - max = ds_get(context->ds, qual, ds_absolute_maximum); - - return (max - base) / ds_cfg.sizeof_rec[qual]; + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos) +void ds_resume_bts(struct bts_tracer *tracer) { - if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_bts); - - return 0; -} + struct task_struct *task; + unsigned long control; -int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos) -{ if (!tracer) - return -EINVAL; - - if (!pos) - return -EINVAL; - - *pos = ds_get_end(tracer->ds.context, ds_pebs); - - return 0; -} - -static int ds_access(struct ds_context *context, enum ds_qualifier qual, - size_t index, const void **record) -{ - unsigned long base, idx; - - if (!record) - return -EINVAL; - - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); - - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - return -EINVAL; + return; - *record = (const void *)idx; + task = tracer->ds.context->task; - return ds_cfg.sizeof_rec[qual]; -} + control = ds_cfg.ctl[dsf_bts]; + if 
(!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; -int ds_access_bts(struct bts_tracer *tracer, size_t index, - const void **record) -{ - if (!tracer) - return -EINVAL; + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - return ds_access(tracer->ds.context, ds_bts, index, record); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_access_pebs(struct pebs_tracer *tracer, size_t index, - const void **record) +void ds_release_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; - - return ds_access(tracer->ds.context, ds_pebs, index, record); -} - -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + return; - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + ds_suspend_pebs(tracer); - if (index >= int_th) - ds_overflow(context, qual); - } + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; - return bytes_written; -} + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); -int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) -{ - if (!tracer) - return -EINVAL; - - return ds_write(tracer->ds.context, ds_bts, record, size); + kfree(tracer); } -int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - if (!tracer) - return -EINVAL; - return ds_write(tracer->ds.context, ds_pebs, record, size); } -static void ds_reset_or_clear(struct ds_context *context, - enum ds_qualifier qual, int clear) +void ds_resume_pebs(struct pebs_tracer *tracer) { - unsigned long base, end; - - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); - - if (clear) - memset((void *)base, 0, end - base); - ds_set(context->ds, qual, ds_index, base); } -int ds_reset_bts(struct bts_tracer *tracer) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) - return -EINVAL; - - 
ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + return NULL; - return 0; + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_reset_pebs(struct pebs_tracer *tracer) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { if (!tracer) - return -EINVAL; + return NULL; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - return 0; + return &tracer->trace; } -int ds_clear_bts(struct bts_tracer *tracer) +int ds_reset_bts(struct bts_tracer *tracer) { if (!tracer) return -EINVAL; - ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); - - return 0; -} - -int ds_clear_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } -int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) +int ds_reset_pebs(struct pebs_tracer *tracer) { if (!tracer) return -EINVAL; - if (!value) - return -EINVAL; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, + (unsigned long)tracer->trace.ds.top); return 0; } @@ -746,35 +894,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "DS available\n"); + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); - WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void
__cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -787,10 +959,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_64); + ds_configure(&ds_cfg_core2); break; } break; @@ -799,7 +971,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -812,14 +984,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) { - put_tracer(context->task); - ds_put_context(context); + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } + + update_debugctlmsr(next->thread.debugctlmsr); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 24c2276aa453..605eff9a8ac0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -252,11 +252,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); + /* Free any BTS tracers that have not been properly released. 
*/ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -420,48 +423,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -483,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index fbb321d53d34..1cfd2a4bf853 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,11 +237,14 @@ void exit_thread(void) put_cpu(); } #ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); + /* Free any BTS tracers that have not been properly released. 
*/ + if (unlikely(current->bts)) { + ds_release_bts(current->bts); + current->bts = NULL; + + kfree(current->bts_buffer); + current->bts_buffer = NULL; + current->bts_size = 0; } #endif /* CONFIG_X86_DS */ } @@ -471,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -534,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b2998fe1166b..45e9855da2d2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,153 +581,73 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. 
- */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child->bts, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child->bts, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child->bts, bts_index, &bts_record); + error = trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child->bts, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child->bts, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child->bts); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } static 
int ptrace_bts_config(struct task_struct *child, @@ -735,136 +655,89 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - goto errout; - - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; - - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - bts_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; - - error = -EINVAL; - if (cfg.size < (10 * bts_cfg.sizeof_bts)) - goto errout; + return -EFAULT; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - error = -EOPNOTSUPP; - goto errout; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - sig = cfg.signal; - } + return -EOPNOTSUPP; - if (child->bts) { - (void)ds_release_bts(child->bts); - kfree(child->bts_buffer); + child->thread.bts_ovfl_signal = cfg.signal; + } - child->bts = NULL; - child->bts_buffer = NULL; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + kfree(child->bts_buffer); - error = -ENOMEM; + child->bts_size = cfg.size; child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); - if (!child->bts_buffer) - goto errout; - - child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, - ovfl, /* th = */ (size_t)-1); - if (IS_ERR(child->bts)) { - error = PTR_ERR(child->bts); - kfree(child->bts_buffer); - child->bts = NULL; - child->bts_buffer = NULL; - goto errout; + if (!child->bts_buffer) { + child->bts_size = 0; + return -ENOMEM; } - - child->thread.bts_ovfl_signal = sig; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + kfree(child->bts_buffer); + child->bts = NULL; + child->bts_buffer = NULL; + child->bts_size = 0; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child->bts, &end); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child->bts, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return 
-EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -873,105 +746,28 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts) - return -EOVERFLOW; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; - - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; - - default: - return -EINVAL; - } - - return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); + return ds_reset_bts(child->bts); } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static int ptrace_bts_size(struct task_struct *child) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); -} - -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; + const struct bts_trace *trace; -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; - -static inline void bts_configure(const struct bts_configuration *cfg) -{ - bts_cfg = *cfg; -} - -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) -{ - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0 ... 0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - default: /* Core2, Atom, ... 
*/ - bts_configure(&bts_cfg_core2); - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } #endif /* CONFIG_X86_PTRACE_BTS */ @@ -988,15 +784,12 @@ void ptrace_disable(struct task_struct *child) #endif #ifdef CONFIG_X86_PTRACE_BTS if (child->bts) { - (void)ds_release_bts(child->bts); + ds_release_bts(child->bts); + child->bts = NULL; + kfree(child->bts_buffer); child->bts_buffer = NULL; - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + child->bts_size = 0; } #endif /* CONFIG_X86_PTRACE_BTS */ } @@ -1129,16 +922,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) (child, data, (struct ptrace_bts_config __user *)addr); break; - case PTRACE_BTS_SIZE: { - size_t size; - - ret = ds_get_bts_index(child->bts, &size); - if (ret == 0) { - WARN_ON_ONCE(size != (int) size); - ret = (int) size; - } + case PTRACE_BTS_SIZE: + ret = ptrace_bts_size(child); break; - } case PTRACE_BTS_GET: ret = ptrace_bts_read_record @@ -1146,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child->bts); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1409,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b81fc5f7731..dc5ea65dc716 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1176,6 +1176,7 @@ struct task_struct { * The buffer to hold the BTS data. */ void *bts_buffer; + size_t bts_size; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v1.2.3 From 16855f878d7127a8bb3925753463485f3071ad76 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:18:38 -0800 Subject: x86: uaccess: return value of __{get|put}_user() can be int Impact: cleanup The type of return value of __{get|put}_user() can be int. There is no user to refer the return value of __{get|put}_user() as long. This reduces code size a bit on 64-bit. 
$ size vmlinux.* text data bss dec hex filename 4509265 479988 673588 5662841 566879 vmlinux.new 4511462 479988 673588 5665038 56710e vmlinux.old Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c54921b2e4..580c3ee6c58c 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -350,14 +350,14 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ - long __pu_err; \ + int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ __pu_err; \ }) #define __get_user_nocheck(x, ptr, size) \ ({ \ - long __gu_err; \ + int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ -- cgit v1.2.3 From 8f2466f45f75e3cbe3aa2b69d33fd9d6e343b9cc Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 8 Dec 2008 19:19:07 -0800 Subject: x86: kill #ifdef for exit_idle() Impact: cleanup Introduce helper inline function in arch/x86/include/asm/idle.h to remove #ifdefs around exit_idle(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/idle.h | 5 +++++ arch/x86/kernel/apic.c | 6 ------ arch/x86/kernel/io_apic.c | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 44c89c3a23e9..38d87379e270 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -8,8 +8,13 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); void idle_notifier_unregister(struct notifier_block *n); +#ifdef CONFIG_X86_64 void enter_idle(void); void exit_idle(void); +#else /* !CONFIG_X86_64 */ +static inline void enter_idle(void) { } +static inline void exit_idle(void) { } +#endif /* CONFIG_X86_64 */ void c1e_remove_cpu(int cpu); diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b525..0fd083713f62 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -814,9 +814,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1682,9 +1680,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1713,9 +1709,7 @@ void smp_error_interrupt(struct pt_regs *regs) { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. 
-- REW */ v = apic_read(APIC_ESR); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210fb..679e7bbbbcd6 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2216,10 +2216,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); -- cgit v1.2.3 From ae8d04e2ecbb233926860e9ce145eac19c7835dc Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 13 Dec 2008 12:36:58 -0800 Subject: x86 Fix VMI crash on boot in 2.6.28-rc8 VMI initialiation can relocate the fixmap, causing early_ioremap to malfunction if it is initialized before the relocation. To fix this, VMI activation is split into two phases; the detection, which must happen before setting up ioremap, and the activation, which must happen after parsing early boot parameters. This fixes a crash on boot when VMI is enabled under VMware. Signed-off-by: Zachary Amsden Signed-off-by: Linus Torvalds --- arch/x86/include/asm/vmi.h | 8 +++++++- arch/x86/kernel/setup.c | 12 +++++------- arch/x86/kernel/smpboot.c | 2 -- arch/x86/kernel/vmi_32.c | 16 +++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h index b7c0dea119fe..61e08c0a2907 100644 --- a/arch/x86/include/asm/vmi.h +++ b/arch/x86/include/asm/vmi.h @@ -223,9 +223,15 @@ struct pci_header { } __attribute__((packed)); /* Function prototypes for bootstrapping */ +#ifdef CONFIG_VMI extern void vmi_init(void); +extern void vmi_activate(void); extern void vmi_bringup(void); -extern void vmi_apply_boot_page_allocations(void); +#else +static inline void vmi_init(void) {} +static inline void vmi_activate(void) {} +static inline void vmi_bringup(void) {} +#endif /* State needed to start an application processor in an SMP system. */ struct vmi_ap_state { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6cc..bdec76e55594 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -794,6 +794,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + /* VMI may relocate the fixmap; do this before touching ioremap area */ + vmi_init(); + early_cpu_init(); early_ioremap_init(); @@ -880,13 +883,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif + /* Must be before kernel pagetables are setup */ + vmi_activate(); /* after early param, so could get panic from serial */ reserve_early_setup_data(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b1093397319..f71f96fc9e62 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -294,9 +294,7 @@ static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. 
*/ -#ifdef CONFIG_VMI vmi_bringup(); -#endif cpu_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9fd..22fd6577156a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -960,8 +960,6 @@ static inline int __init activate_vmi(void) void __init vmi_init(void) { - unsigned long flags; - if (!vmi_rom) probe_vmi_rom(); else @@ -973,13 +971,21 @@ void __init vmi_init(void) reserve_top_address(-vmi_rom->virtual_top); - local_irq_save(flags); - activate_vmi(); - #ifdef CONFIG_X86_IO_APIC /* This is virtual hardware; timer routing is wired correctly */ no_timer_check = 1; #endif +} + +void vmi_activate(void) +{ + unsigned long flags; + + if (!vmi_rom) + return; + + local_irq_save(flags); + activate_vmi(); local_irq_restore(flags & X86_EFLAGS_IF); } -- cgit v1.2.3 From 205516c12dbba003c26b42cfb41e598631300106 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 00:32:21 -0800 Subject: x86: convert rdtscll() to use __native_read_tsc Impact: micro-optimization Is there any reason why x86 rdtscll have to use the out of line function instead of inline __native_read_tsc()? native_read_tsc and __native_read_tsc is essentially the same functions. Patch to let x86 rdtscll() to use the inline version of read_tsc. Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde89..42f639b991b4 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -181,10 +181,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) } #define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) + ((low) = (u32)__native_read_tsc()) #define rdtscll(val) \ - ((val) = native_read_tsc()) + ((val) = __native_read_tsc()) #define rdpmc(counter, low, high) \ do { \ -- cgit v1.2.3 From 1796316a8b028a148be48ba5d4e7be493a39d173 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:35:24 +0000 Subject: x86: consolidate __swp_XXX() macros Impact: cleanup, code robustization The __swp_...() macros silently relied upon which bits are used for _PAGE_FILE and _PAGE_PROTNONE. After having changed _PAGE_PROTNONE in our Xen kernel to no longer overlap _PAGE_PAT, live locks and crashes were reported that could have been avoided if these macros properly used the symbolic constants. Since, as pointed out earlier, for Xen Dom0 support mainline likewise will need to eliminate the conflict between _PAGE_PAT and _PAGE_PROTNONE, this patch does all the necessary adjustments, plus it introduces a mechanism to check consistency between MAX_SWAPFILES_SHIFT and the actual encoding macros. This also fixes a latent bug in that x86-64 used a 6-bit mask in __swp_type(), and if MAX_SWAPFILES_SHIFT was increased beyond 5 in (the seemingly unrelated) linux/swap.h, this would have resulted in a collision with _PAGE_FILE. Non-PAE 32-bit code gets similarly adjusted for its pte_to_pgoff() and pgoff_to_pte() calculations. 
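For the 64-bit case the new formulas work out as follows: with _PAGE_BIT_PRESENT = 0, _PAGE_BIT_FILE = _PAGE_BIT_DIRTY = 6 and _PAGE_BIT_PROTNONE = _PAGE_BIT_GLOBAL = 8, SWP_TYPE_BITS becomes 5 and SWP_OFFSET_SHIFT becomes 9, i.e. the swap type occupies bits 1..5 and the offset lives above the protnone bit. A stand-alone user-space sketch of that round trip, with the bit numbers hard-coded for illustration rather than taken from the real headers:

#include <assert.h>
#include <stdio.h>

#define _PAGE_BIT_PRESENT	0
#define _PAGE_BIT_FILE		6	/* == _PAGE_BIT_DIRTY */
#define _PAGE_BIT_PROTNONE	8	/* == _PAGE_BIT_GLOBAL */

#define SWP_TYPE_BITS		(_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
#define SWP_OFFSET_SHIFT	(_PAGE_BIT_PROTNONE + 1)

#define __swp_type(val)		(((val) >> (_PAGE_BIT_PRESENT + 1)) \
				 & ((1UL << SWP_TYPE_BITS) - 1))
#define __swp_offset(val)	((val) >> SWP_OFFSET_SHIFT)
#define __swp_entry(type, off)	(((unsigned long)(type) << (_PAGE_BIT_PRESENT + 1)) \
				 | ((unsigned long)(off) << SWP_OFFSET_SHIFT))

int main(void)
{
	unsigned long val = __swp_entry(3, 0x12345);

	/* the encode/decode round trip is lossless ... */
	assert(__swp_type(val) == 3);
	assert(__swp_offset(val) == 0x12345);
	/* ... and the PTE bits the encoding must avoid stay clear */
	assert(!(val & ((1UL << _PAGE_BIT_PRESENT) |
			(1UL << _PAGE_BIT_FILE) |
			(1UL << _PAGE_BIT_PROTNONE))));
	printf("entry %#lx -> type %lu, offset %#lx\n",
	       val, __swp_type(val), __swp_offset(val));
	return 0;
}

The MAX_SWAPFILES_CHECK() introduced above turns exactly this invariant into a build-time assertion: MAX_SWAPFILES_SHIFT may never exceed the SWP_TYPE_BITS the encoding leaves room for.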
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level.h | 50 ++++++++++++++++++++++++++++------- arch/x86/include/asm/pgtable-3level.h | 1 + arch/x86/include/asm/pgtable.h | 14 +++++----- arch/x86/include/asm/pgtable_64.h | 20 +++++++++++--- mm/swapfile.c | 9 +++++++ 5 files changed, 75 insertions(+), 19 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index b17edfd23628..e0d199fe1d83 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define pte_none(x) (!(x).pte_low) /* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset - * into this range: + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, + * split up the 29 bits of offset into this range: */ #define PTE_FILE_MAX_BITS 29 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#else +#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) +#endif +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) + ((((pte).pte_low >> PTE_FILE_SHIFT1) \ + & ((1U << PTE_FILE_BITS1) - 1)) \ + + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ + & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ + + (((pte).pte_low >> PTE_FILE_SHIFT3) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) #define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ - (((off) >> 5) << 8) + _PAGE_FILE }) + ((pte_t) { .pte_low = \ + (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + << PTE_FILE_SHIFT3) \ + + _PAGE_FILE }) /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 52597aeadfff..447da43cddb3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define 
__swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b11671..b7c2ecdb7658 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -10,7 +10,6 @@ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ -#define _PAGE_BIT_FILE 6 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ @@ -22,6 +21,12 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) @@ -46,11 +51,8 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, - * saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; - pte_present gives true */ +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb2..65b6be6677c7 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -250,10 +250,22 @@ static inline int pud_large(pud_t pte) extern int direct_gbpages; /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ - ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/mm/swapfile.c b/mm/swapfile.c index 90cb67a5417c..54a9f87e5162 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1462,6 +1462,15 @@ static int __init procswaps_init(void) __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ +#ifdef MAX_SWAPFILES_CHECK +static int __init max_swapfiles_check(void) +{ + MAX_SWAPFILES_CHECK(); + return 0; +} +late_initcall(max_swapfiles_check); +#endif + /* * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
* -- cgit v1.2.3 From b93a531e315e97ef00367099e6b5f19651936e20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:40:27 +0000 Subject: allow bug table entries to use relative pointers (and use it on x86-64) Impact: reduce bug table size This allows reducing the bug table size by half. Perhaps there are other 64-bit architectures that could also make use of this. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/bug.h | 2 +- include/asm-generic/bug.h | 8 ++++++++ lib/bug.c | 19 +++++++++++++++++-- 4 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..ab98cca84e1b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,6 +87,10 @@ config GENERIC_IOMAP config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 3def2065fcea..d9cf1cd156d2 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_32 # define __BUG_C0 "2:\t.long 1b, %c0\n" #else -# define __BUG_C0 "2:\t.quad 1b, %c0\n" +# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" #endif #define BUG() \ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 12c07c1866b2..4c794d73fb84 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -8,9 +8,17 @@ #ifdef CONFIG_GENERIC_BUG #ifndef __ASSEMBLY__ struct bug_entry { +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS unsigned long bug_addr; +#else + signed int bug_addr_disp; +#endif #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS const char *file; +#else + signed int file_disp; +#endif unsigned short line; #endif unsigned short flags; diff --git a/lib/bug.c b/lib/bug.c index bfeafd60ee9f..300e41afbf97 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -5,6 +5,8 @@ CONFIG_BUG - emit BUG traps. Nothing happens without this. CONFIG_GENERIC_BUG - enable this code. + CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to + the containing struct bug_entry for bug_addr and file. 
CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable @@ -43,6 +45,15 @@ extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +static inline unsigned long bug_addr(const struct bug_entry *bug) +{ +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + return bug->bug_addr; +#else + return (unsigned long)bug + bug->bug_addr_disp; +#endif +} + #ifdef CONFIG_MODULES static LIST_HEAD(module_bug_list); @@ -55,7 +66,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) unsigned i; for (i = 0; i < mod->num_bugs; ++i, ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; } return NULL; @@ -108,7 +119,7 @@ const struct bug_entry *find_bug(unsigned long bugaddr) const struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; return module_find_bug(bugaddr); @@ -133,7 +144,11 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (bug) { #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS file = bug->file; +#else + file = (const char *)bug + bug->file_disp; +#endif line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; -- cgit v1.2.3 From cfc319833b5b359bf3bce99564dbac00af7925ac Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:46:58 +0000 Subject: x86, 32-bit: improve lazy TLB handling code Impact: micro-optimize the 32-bit TLB flush code Use the faster x86_{read,write}_percpu() accessors here. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context_32.h | 13 ++++++------- arch/x86/kernel/tlb_32.c | 11 +++++------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h index 8e10015781fb..7e98ce1d2c0e 100644 --- a/arch/x86/include/asm/mmu_context_32.h +++ b/arch/x86/include/asm/mmu_context_32.h @@ -4,9 +4,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, } #ifdef CONFIG_SMP else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b6..4290d918b58a 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void 
leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } -- cgit v1.2.3 From b6fd6f26733e864fba2ea3eb1d716e23d2e66f3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 19:23:36 +0100 Subject: x86, mm: limit MAXMEM on 64-bit on 64-bit x86 the physical memory limit is controlled by the sparsemem bits - which are 44 bits right now. But MAXMEM (the max pfn number e820 parsing will allow to enter our sizing routines) is set to 0x00003fffffffffff, i.e. 46 bits - that's too large because it overlaps into the vmalloc range. So couple MAXMEM to MAX_PHYSMEM_BITS, and add a comment that the maximum of MAX_PHYSMEM_BITS is 45 bits. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 2 +- arch/x86/include/asm/sparsemem.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 65b6be6677c7..c54ba69608bd 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define MAXMEM _AC(0x00003fffffffffff, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index be44f7dab395..e3cc3c063ec5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 +# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ #endif #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 29d0887ffd084cde9d6a1286cb82b71701a974dd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 16 Dec 2008 19:16:34 +0100 Subject: x86: microcode_amd: replace inline asm by common rdmsr/wrmsr functions Impact: cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/microcode_amd.c | 23 +++++------------------ 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e38859d577a1..cb58643947b9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,7 +85,9 @@ /* AMD64 MSRs. Not complete. 
See the architecture manual for a more complete list. */ +#define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c7f225c7e481..2856955ddab1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -93,6 +93,7 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 dummy; memset(csig, 0, sizeof(*csig)); @@ -102,9 +103,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return -1; } - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (csig->rev) - : "i" (0x0000008B) : "ecx"); + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", csig->rev); @@ -181,12 +180,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) static void apply_microcode_amd(int cpu) { unsigned long flags; - unsigned int eax, edx; - unsigned int rev; + u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; - unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -195,19 +192,9 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(&microcode_update_lock, flags); - - addr = (unsigned long)&mc_amd->hdr.data_code; - edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); - eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); - - asm volatile("movl %0, %%ecx; wrmsr" : - : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); - + wrmsrl(MSR_AMD64_PATCH_LOADER, &mc_amd->hdr.data_code); /* get patch id after patching */ - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(&microcode_update_lock, flags); /* check current patch id and patch's id for match */ -- cgit v1.2.3 From d4377974062122d6d9be0bbd8a910a0954714194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 20:59:24 +0100 Subject: x86: support always running TSC on Intel CPUs, add cpufeature definition Impact: add new synthetic-cpuid bit definition add X86_FEATURE_NONSTOP_TSC to the cpufeature bits - this is in preparation of Venki's always-running-TSC patch.
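Note this patch adds only the definition; the consumer comes with the follow-on patch. Assuming the bit is also given the name "nonstop_tsc" in the capability string table, it would show up in the flags line of /proc/cpuinfo, where a quick user-space probe (a sketch, not part of this patch) can test for it:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/cpuinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "flags", 5))
			continue;
		if (strstr(line, " nonstop_tsc"))
			puts("TSC keeps counting in deep C-states");
		else
			puts("TSC may stop in deep C-states");
		break;
	}
	fclose(f);
	return 0;
}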
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 5bce8ed02b44..ea408dcba513 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -92,6 +92,7 @@ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -- cgit v1.2.3 From aab02f0ae20b8fe0fe891e9f107c6e392256ca01 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:23:54 +0530 Subject: x86: process_64.c declare __switch_to() and sys_arch_prctl before they get used Impact: cleanup In asm/system.h moved out __switch_to from CONFIG_X86_32 as it is common for both 32 and 64 bit. In asm/pctl.h defined sys_arch_prctl Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/prctl.h | 3 +++ arch/x86/include/asm/system.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h index fe681147a4f7..a8894647dd9a 100644 --- a/arch/x86/include/asm/prctl.h +++ b/arch/x86/include/asm/prctl.h @@ -6,5 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 +#ifdef CONFIG_X86_64 +extern long sys_arch_prctl(int, unsigned long); +#endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff7..59555f48bf4c 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -17,12 +17,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. -- cgit v1.2.3 From 7b5b50f1be9e07714cfaa620d102c8daf3cdd814 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:24:48 +0530 Subject: x86: signal.c declare do_notify_resume before they get used Impact: cleanup In asm/signal.h moved out do_notify_resume from __i386__ as it is common for both 32 and 64 bit. 
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/signal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 96ac44f275da..7761a5d554bb 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -121,6 +121,10 @@ typedef unsigned long sigset_t; #ifndef __ASSEMBLY__ +# ifdef __KERNEL__ +extern void do_notify_resume(struct pt_regs *, void *, __u32); +# endif /* __KERNEL__ */ + #ifdef __i386__ # ifdef __KERNEL__ struct old_sigaction { @@ -141,8 +145,6 @@ struct k_sigaction { struct sigaction sa; }; -extern void do_notify_resume(struct pt_regs *, void *, __u32); - # else /* __KERNEL__ */ /* Here we must cater to libcs that poke about in kernel headers. */ -- cgit v1.2.3 From c0195b6da08c4ddd8c8ea830f6c3c40bc7f82071 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 15 Dec 2008 22:26:30 +0530 Subject: x86: ldt.c declare sys_modify_ldt before they get used Impact: cleanup In asm/syscalls.h moved out sys_modify_ldt from CONFIG_X86_32 as it is common for both 32 and 64 bit. Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 87803da44010..75d4a6afc36f 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -19,6 +19,9 @@ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/ldt.c */ +asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -38,9 +41,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long); /* kernel/ioport.c */ asmlinkage long sys_iopl(unsigned long); -/* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); - /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -- cgit v1.2.3 From ecbf29cdb3990c83d90d0c4187c89fb2ce423367 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:37:07 -0800 Subject: xen: clean up asm/xen/hypervisor.h Impact: cleanup hypervisor.h had accumulated a lot of crud, including lots of spurious #includes. Clean it all up, and go around fixing up everything else accordingly.
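As a usage sketch of the slimmed-down interface (illustrative only, not part of this patch): with the !CONFIG_XEN stub below, the domain-type predicates compile to constant 0 on non-Xen builds, so callers need no #ifdefs:

	if (!xen_domain())
		return;		/* bare metal: nothing Xen-specific to do */
	if (xen_pv_domain())
		printk(KERN_INFO "running as a Xen PV guest\n");
	else if (xen_hvm_domain())
		printk(KERN_INFO "running as a Xen HVM guest\n");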
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ arch/x86/include/asm/xen/hypervisor.h | 39 +++++++---------------------------- arch/x86/include/asm/xen/page.h | 5 +++++ arch/x86/xen/enlighten.c | 1 + drivers/xen/balloon.c | 4 +++- drivers/xen/features.c | 6 +++++- drivers/xen/grant-table.c | 1 + include/xen/interface/event_channel.h | 2 ++ 8 files changed, 31 insertions(+), 33 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3f6000d95fe2..5e79ca694326 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -33,8 +33,14 @@ #ifndef _ASM_X86_XEN_HYPERCALL_H #define _ASM_X86_XEN_HYPERCALL_H +#include +#include #include #include +#include + +#include +#include #include #include diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a38d25ac87d2..81fbd735aec4 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,39 +33,10 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -#include -#include - -#include -#include - -#include -#include -#include -#if defined(__i386__) -# ifdef CONFIG_X86_PAE -# include -# else -# include -# endif -#endif -#include - /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -/* arch/i386/mach-xen/evtchn.c */ -/* Force a proper event-channel callback from Xen. */ -extern void force_evtchn_callback(void); - -/* Turn jiffies into Xen system time. */ -u64 jiffies_to_st(unsigned long jiffies); - - -#define MULTI_UVMFLAGS_INDEX 3 -#define MULTI_UVMDOMID_INDEX 4 - enum xen_domain_type { XEN_NATIVE, XEN_PV_DOMAIN, @@ -74,9 +45,15 @@ enum xen_domain_type { extern enum xen_domain_type xen_domain_type; +#ifdef CONFIG_XEN #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#else +#define xen_domain() (0) +#endif + +#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) + #define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) -#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc628998a1b9..7ef617ef1df3 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -1,11 +1,16 @@ #ifndef _ASM_X86_XEN_PAGE_H #define _ASM_X86_XEN_PAGE_H +#include +#include +#include #include #include +#include #include +#include #include /* Xen machine address */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 86cd2f829683..bea215230b20 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 526c191e84ea..8dc7109d61b7 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -44,13 +44,15 @@ #include #include -#include #include #include #include #include #include +#include +#include +#include #include #include #include diff --git a/drivers/xen/features.c b/drivers/xen/features.c index 0707714e40d6..99eda169c779 100644 --- a/drivers/xen/features.c +++ b/drivers/xen/features.c @@ -8,7 +8,11 @@ #include 
#include #include -#include + +#include + +#include +#include #include u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 06592b9da83c..7d8f531fb8e8 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h index 919b5bdcb2bd..2090881c3650 100644 --- a/include/xen/interface/event_channel.h +++ b/include/xen/interface/event_channel.h @@ -9,6 +9,8 @@ #ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__ #define __XEN_PUBLIC_EVENT_CHANNEL_H__ +#include + typedef uint32_t evtchn_port_t; DEFINE_GUEST_HANDLE(evtchn_port_t); -- cgit v1.2.3 From 189f67c4408806563a1f061f5c8bf184a6658477 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 12 Dec 2008 14:50:40 -0600 Subject: x86: UV fix for global physical addresses Impact: fix UV boot crash This fixes a UV bug related to generating global memory addresses on partitioned systems. Partitioned systems do not have physical memory at address 0. Instead, a chunk of high memory is remapped by the chipset so that it appears to be at address 0. This remapping is INVISIBLE to most of the OS. The only OS functions that need to be aware of the remapping are functions that directly interface to the chipset. The GRU is one example. Also, delete a couple of unused macros related to global memory addresses. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 16 ++-------------- arch/x86/kernel/genx2apic_uv_x.c | 3 +-- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 52aa943c634f..777327ef05c1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -210,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) - paddr += uv_hub_info->lowmem_remap_base; + paddr |= uv_hub_info->lowmem_remap_base; return paddr | uv_hub_info->gnode_upper; } @@ -218,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { - return __pa(v) | uv_hub_info->gnode_upper; -} - -/* socket virtual --> UV global physical address */ -static inline void *uv_vgpa(void *v) -{ - return (void *)uv_gpa(v); -} - -/* UV global physical address --> socket virtual */ -static inline void *uv_va(unsigned long gpa) -{ - return __va(gpa & uv_hub_info->gpa_mask); + return uv_soc_phys_ram_to_gpa(__pa(v)); } /* pnode, offset --> socket virtual */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 221299f4509f..dece17289731 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -540,8 +540,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; -- cgit v1.2.3 From c8182f0016fb65a721c4fbe487909a2d56178135 Mon Sep 17 00:00:00 2001
From: Russ Anderson Date: Fri, 12 Dec 2008 11:07:00 -0600 Subject: sgi-xp: xpc needs to pass the physical address, not virtual Impact: fix crash xpc needs to pass the physical address, not virtual. Testing uncovered this problem. The virtual address happens to work most of the time due to the way BIOS was masking off the node bits. Passing the physical address makes it work all of the time. Signed-off-by: Russ Anderson Acked-by: Dean Nelson Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/bios.h | 2 +- arch/x86/kernel/bios_uv.c | 4 +--- drivers/misc/sgi-xp/xpc_uv.c | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index da1c4e8e78fc..7ed17ff502b9 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -100,7 +100,7 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, void *, unsigned int, +extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index d22d0f1bbea0..2a0a2a3cac26 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -101,15 +101,13 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, } int -uv_bios_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { union uv_watchlist_u size_blade; - unsigned long addr; u64 watchlist; s64 ret; - addr = (unsigned long)mq; size_blade.size = mq_size; size_blade.blade = blade; diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 684b2dd17583..91a55b1b1037 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -119,16 +119,16 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq) int ret; #if defined CONFIG_X86_64 - ret = uv_bios_mq_watchlist_alloc(mq->mmr_blade, mq->address, mq->order, - &mq->mmr_offset); + ret = uv_bios_mq_watchlist_alloc(mq->mmr_blade, uv_gpa(mq->address), + mq->order, &mq->mmr_offset); if (ret < 0) { dev_err(xpc_part, "uv_bios_mq_watchlist_alloc() failed, " "ret=%d\n", ret); return ret; } #elif defined CONFIG_IA64_GENERIC || defined CONFIG_IA64_SGI_UV - ret = sn_mq_watchlist_alloc(mq->mmr_blade, mq->address, mq->order, - &mq->mmr_offset); + ret = sn_mq_watchlist_alloc(mq->mmr_blade, uv_gpa(mq->address), + mq->order, &mq->mmr_offset); if (ret < 0) { dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n", ret); -- cgit v1.2.3 From d680fe44775ed17a80035462d9898f5e77bfd7dd Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Dec 2008 00:09:08 +0300 Subject: x86: entry_64 - introduce FTRACE_ frame macro v2 Impact: clean up Introduce MCOUNT_SAVE/RESTORE_FRAME which allow us to save a number of lines at the source level. Also fix a comment in ftrace.h.
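As an illustrative sketch of the intended use (the stub and C helper named here are hypothetical): an ftrace entry stub brackets its call into C with the macro pair so the argument registers survive the call:

	ENTRY(sample_trace_stub)
		MCOUNT_SAVE_FRAME
		/* %rax, %rcx, %rdx, %rsi, %rdi, %r8 and %r9 are saved here */
		call	sample_c_handler
		MCOUNT_RESTORE_FRAME
		retq
	END(sample_trace_stub)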
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 29 +++++++++++++++++++++- arch/x86/kernel/entry_64.S | 57 ++++++------------------------------------- 2 files changed, 35 insertions(+), 51 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 7e61b4ceb9a4..b55b4a7fbefd 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,6 +1,33 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#ifdef __ASSEMBLY__ + + .macro MCOUNT_SAVE_FRAME + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + .endm + + .macro MCOUNT_RESTORE_FRAME + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + .endm + +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -46,7 +73,7 @@ struct ftrace_ret_stack { /* * Primary handler of a function return. * It relays on ftrace_return_to_handler. - * Defined in entry32.S + * Defined in entry_32/64.S */ extern void return_to_handler(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 54e0bbdccb99..303dd84d2a98 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -71,15 +71,7 @@ ENTRY(ftrace_caller) cmpl $0, function_trace_stop jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -89,14 +81,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call @@ -130,15 +115,7 @@ ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -146,14 +123,7 @@ trace: call *ftrace_trace_function - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME jmp ftrace_stub END(mcount) @@ -165,14 +135,7 @@ ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi movq 0x38(%rsp), %rsi @@ -180,14 +143,8 @@ ENTRY(ftrace_graph_caller) call prepare_ftrace_return - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + retq END(ftrace_graph_caller) -- cgit v1.2.3 From 41af86fad3c40646b9748279e3862781e937a5d2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:50:32 -0800 Subject: x86: signal: move sigframe.h to 
arch/x86/include/asm Impact: cleanup, move header file Move arch/x86/kernel/sigframe.h to arch/x86/include/asm/sigframe.h. It will be used in arch/x86/ia32/ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 43 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/sigframe.h | 43 ---------------------------------------- arch/x86/kernel/signal.c | 2 +- 4 files changed, 45 insertions(+), 45 deletions(-) create mode 100644 arch/x86/include/asm/sigframe.h delete mode 100644 arch/x86/kernel/sigframe.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h new file mode 100644 index 000000000000..6718ed04b05b --- /dev/null +++ b/arch/x86/include/asm/sigframe.h @@ -0,0 +1,43 @@ +#ifdef CONFIG_X86_32 +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext + +struct sigframe_ia32 { + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. + */ + struct _fpstate_ia32 fpstate_unused; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; + /* fp state follows here */ +}; + +struct rt_sigframe_ia32 { + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; + struct siginfo info; + struct ucontext_ia32 uc; + char retcode[8]; + /* fp state follows here */ +}; +#else /* !CONFIG_X86_32 */ +struct rt_sigframe { + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + /* fp state follows here */ +}; +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6649d09ad88f..ee4df08feee6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -11,7 +11,7 @@ #include #include #include -#include "sigframe.h" +#include #include #include #include diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h deleted file mode 100644 index 6718ed04b05b..000000000000 --- a/arch/x86/kernel/sigframe.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifdef CONFIG_X86_32 -#define sigframe_ia32 sigframe -#define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate -#define ucontext_ia32 ucontext - -struct sigframe_ia32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - /* - * fpstate is unused. fpstate is moved/allocated after - * retcode[] below. This movement allows to have the FP state and the - * future state extensions (xsave) stay together. - * And at the same time retaining the unused fpstate, prevents changing - * the offset of extramask[] in the sigframe and thus prevent any - * legacy application accessing/modifying it. 
- */ - struct _fpstate_ia32 fpstate_unused; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe_ia32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - struct siginfo info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; -#else /* !CONFIG_X86_32 */ -struct rt_sigframe { - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - /* fp state follows here */ -}; -#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 848c2d64a289..89bb7668041d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -35,7 +35,7 @@ #include #include -#include "sigframe.h" +#include #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -- cgit v1.2.3 From c85c2ff877c9305f801f7d5b9e6382cb05a03d45 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 17 Dec 2008 18:51:08 -0800 Subject: x86: signal: prepare to include from ia32_signal.c Impact: cleanup, prepare to use from ia32_signal.c Make struct sigframe_ia32 and rt_sigframe_ia32 visible to ia32_signal.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 6718ed04b05b..491a0878c3aa 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,7 +4,15 @@ #define sigcontext_ia32 sigcontext #define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#include +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) struct sigframe_ia32 { u32 pretcode; int sig; @@ -18,7 +26,11 @@ struct sigframe_ia32 { * legacy application accessing/modifying it. */ struct _fpstate_ia32 fpstate_unused; +#ifdef CONFIG_IA32_EMULATION + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; +#else /* !CONFIG_IA32_EMULATION */ unsigned long extramask[_NSIG_WORDS-1]; +#endif /* CONFIG_IA32_EMULATION */ char retcode[8]; /* fp state follows here */ }; @@ -28,16 +40,22 @@ struct rt_sigframe_ia32 { int sig; u32 pinfo; u32 puc; +#ifdef CONFIG_IA32_EMULATION + compat_siginfo_t info; +#else /* !CONFIG_IA32_EMULATION */ struct siginfo info; +#endif /* CONFIG_IA32_EMULATION */ struct ucontext_ia32 uc; char retcode[8]; /* fp state follows here */ }; -#else /* !CONFIG_X86_32 */ +#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */ + +#ifdef CONFIG_X86_64 struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; /* fp state follows here */ }; -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From 7c9c160c54fc545efc23881344593868e5f717bd Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 17 Dec 2008 23:18:52 +0530 Subject: x86: tls.c declare sys_set_thread_area and sys_get_thread_area before they get used Impact: cleanup In asm/syscalls.h move out sys_set_thread_area() and sys_get_thread_area() as they are common for both 32 and 64 bit. 
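For reference, an illustrative user-space sketch of the syscall behind these declarations (tls_block, use_tls_slot and the field values are assumptions for the example):

	#include <asm/ldt.h>		/* struct user_desc */
	#include <sys/syscall.h>
	#include <unistd.h>

	struct user_desc desc = {
		.entry_number	= -1,	/* let the kernel pick a free TLS slot */
		.base_addr	= (unsigned int)(unsigned long)tls_block,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};

	if (syscall(SYS_set_thread_area, &desc) == 0)
		use_tls_slot(desc.entry_number);	/* hypothetical consumer */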
Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 75d4a6afc36f..c0b0bda754ee 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -22,6 +22,10 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/tls.c */ +asmlinkage int sys_set_thread_area(struct user_desc __user *); +asmlinkage int sys_get_thread_area(struct user_desc __user *); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *); struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); -/* kernel/tls.c */ -asmlinkage int sys_set_thread_area(struct user_desc __user *); -asmlinkage int sys_get_thread_area(struct user_desc __user *); - /* kernel/vm86_32.c */ asmlinkage int sys_vm86old(struct pt_regs); asmlinkage int sys_vm86(struct pt_regs); -- cgit v1.2.3 From 5c2628e8b4f670d0954053444289e2b018be957a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 09:18:35 -0800 Subject: x86: sigframe.h: add guard macro Impact: cleanup Add missing guard macro _ASM_X86_SIGFRAME_H. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigframe.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 491a0878c3aa..3bd0f4276000 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_SIGFRAME_H +#define _ASM_X86_SIGFRAME_H + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe @@ -59,3 +62,5 @@ struct rt_sigframe { /* fp state follows here */ }; #endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_SIGFRAME_H */ -- cgit v1.2.3 From 5899329b19100c0b82dc78e9b21ed8b920c9ffb3 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:30 -0800 Subject: x86: PAT: implement track/untrack of pfnmap regions for x86 - v3 Impact: New mm functionality. Hookup remap_pfn_range and vm_insert_pfn and corresponding copy and free routines with reserve and free tracking. reserve and free here only takes care of non RAM region mapping. For RAM region, driver should use set_memory_[uc|wc|wb] to set the cache type and then setup the mapping for user pte. We can bypass below reserve/free in that case. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 10 ++ arch/x86/mm/pat.c | 236 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b11671..7dcd94c29044 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -219,6 +219,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline u64 pte_pa(pte_t pte) +{ + return pte_val(pte) & PTE_PFN_MASK; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -328,6 +333,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define track_pfn_vma_new track_pfn_vma_new +#define track_pfn_vma_copy track_pfn_vma_copy +#define untrack_pfn_vma untrack_pfn_vma + #ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index eb1bf000d12e..1069ffecf77d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pagerange_is_ram(paddr, paddr + size); + + if (is_ram != 0) { + /* + * For mapping RAM pages, drivers need to call + * set_memory_[uc|wc|wb] directly, for reserve and free, before + * setting up the PTE. + */ + WARN_ON_ONCE(1); + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). 
+ * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with a single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, by going through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + u64 paddr; + pgprot_t prot; + pte_t pte; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk starting from vm_pgoff, + * But, we have to get the protection from pte. + */ + if (follow_pfnmap_pte(vma, vma_start, &pte)) { + WARN_ON_ONCE(1); + return -1; + } + prot = pte_pgprot(pte); + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + prot = pte_pgprot(pte); + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with a + * single reserve_pfn_range call. + * Otherwise, we look at the pfn and size and reserve only the specified range + * page by page. + * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + u64 base_paddr; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve page by page using pfn and size */ + base_paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero).
+ */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + u64 paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (u64)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + pte_t pte; + + if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + continue; + + paddr = pte_pa(pte); + free_pfn_range(paddr, PAGE_SIZE); + } + } +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ -- cgit v1.2.3 From 8a7b12f70fb135a1b1d865687de3edcdc780f6d1 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:31 -0800 Subject: x86: PAT: change pgprot_noncached to uc_minus instead of strong uc - v3 Impact: mm behavior change. Make pgprot_noncached uc_minus instead of strong UC. This will make pgprot_noncached to be in line with ioremap_nocache() and all the other APIs that map page uc_minus on uc request. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 8 ++++++++ arch/x86/include/asm/pgtable_32.h | 9 --------- arch/x86/include/asm/pgtable_64.h | 6 ------ 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dcd94c29044..6968d4f6be3e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -158,6 +158,14 @@ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f9d5889b336b..72b020deb46b 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -100,15 +100,6 @@ extern unsigned long pg0[]; # include #endif -/* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb2..4798a4033e3a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -176,12 +176,6 @@ static inline int pmd_bad(pmd_t pmd) #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ -/* - * Macro to mark a page protection value as "uncacheable". 
- */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. -- cgit v1.2.3 From 2520bd3123c00272f818a176c92d03c7d0a113d6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 18 Dec 2008 11:41:32 -0800 Subject: x86: PAT: add pgprot_writecombine() interface for drivers - v3 Impact: New mm functionality. Add pgprot_writecombine. pgprot_writecombine will be aliased to pgprot_noncached when not supported by the architecture. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 3 +++ arch/x86/mm/pat.c | 8 ++++++++ include/asm-generic/pgtable.h | 4 ++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6968d4f6be3e..579f8ceee948 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -168,6 +168,9 @@ #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1069ffecf77d..d5254bae84f4 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -832,6 +832,14 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } } +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ef87f889ef62..b84633801fb6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif +#ifndef pgprot_writecombine +#define pgprot_writecombine pgprot_noncached +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. 
Although no -- cgit v1.2.3 From d1769d5475176124af04fa69848b022c98c4bc37 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 00:03:56 +0530 Subject: x86: traps.c declare functions before they get used Impact: cleanup In asm/traps.h :- do_double_fault : added under X86_64 sync_regs : added under X86_64 math_error : moved out from X86_32 as it is common for both 32 and 64 bit math_emulate : moved from X86_32 as it is common for both 32 and 64 bit smp_thermal_interrupt : added under X86_64 mce_threshold_interrupt : added under X86_64 Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 45dee286e45c..2ee0a3bceedf 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); dotraplinkage void do_invalid_TSS(struct pt_regs *, long); dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); +#ifdef CONFIG_X86_64 +dotraplinkage void do_double_fault(struct pt_regs *, long); +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *); +#endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); @@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; -#ifdef CONFIG_X86_32 void math_error(void __user *); -unsigned long patch_espfix_desc(unsigned long, unsigned long); asmlinkage void math_emulate(long); +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long, unsigned long); +#else +asmlinkage void smp_thermal_interrupt(void); +asmlinkage void mce_threshold_interrupt(void); #endif #endif /* _ASM_X86_TRAPS_H */ -- cgit v1.2.3 From b2fa739c06931d167b6d2aa7b514ab7f30d04dc0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:43:34 -0800 Subject: x86: sigframe.h: include headers for dependency Impact: cleanup Include the following headers for dependency. asm/sigcontext.h asm/siginfo.h asm/ucontext.h Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/sigframe.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 3bd0f4276000..4e0fe26d27d3 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,6 +1,10 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H +#include +#include +#include + #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -- cgit v1.2.3 From 9f221495997d180df51ce4d8296669445dd3e7b3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 18 Dec 2008 14:47:37 -0800 Subject: x86: ia32.h: remove unused struct sigframe32 and rt_sigframe32 Impact: cleanup Remove struct sigframe32 and rt_sigframe32 because there are no users. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/ia32.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 97989c0e534c..50ca486fd88c 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -129,24 +129,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -struct sigframe32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -}; - -struct rt_sigframe32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; -}; - struct ustat32 { __u32 f_tfree; compat_ino_t f_tinode; -- cgit v1.2.3 From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:28 -0800 Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys Impact: Cleanup - removes a new function in favor of a recently modified older one. Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys also returns protection, eliminating the need for the pte_pgprot call. Using follow_phys also eliminates the need for pte_pa. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/mm/pat.c | 30 +++++++++++------------------ include/linux/mm.h | 3 --- mm/memory.c | 43 ------------------------------------------ 4 files changed, 11 insertions(+), 70 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 579f8ceee948..2aa792bbd7e0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -230,11 +230,6 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } -static inline u64 pte_pa(pte_t pte) -{ - return pte_val(pte) & PTE_PFN_MASK; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5254bae84f4..541bcc944a5b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -685,8 +685,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int retval = 0; unsigned long i, j; u64 paddr; - pgprot_t prot; - pte_t pte; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -696,26 +695,22 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) if (is_linear_pfn_mapping(vma)) { /* - * reserve the whole chunk starting from vm_pgoff, - * But, we have to get the protection from pte. + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte.
*/ - if (follow_pfnmap_pte(vma, vma_start, &pte)) { + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { WARN_ON_ONCE(1); - return -1; + return -EINVAL; } - prot = pte_pgprot(pte); - paddr = (u64)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot); + return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); } /* reserve entire vma page by page, using pfn and prot from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); - prot = pte_pgprot(pte); - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); if (retval) goto cleanup_ret; } @@ -724,10 +719,9 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) cleanup_ret: /* Reserve error: Cleanup partial reservation and return error */ for (j = 0; j < i; j += PAGE_SIZE) { - if (follow_pfnmap_pte(vma, vma_start + j, &pte)) + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } @@ -797,6 +791,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, { unsigned long i; u64 paddr; + unsigned long prot; unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; @@ -821,12 +816,9 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, } else { /* free entire vma, page by page, using the pfn from pte */ for (i = 0; i < vma_size; i += PAGE_SIZE) { - pte_t pte; - - if (follow_pfnmap_pte(vma, vma_start + i, &pte)) + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - paddr = pte_pa(pte); free_pfn_range(paddr, PAGE_SIZE); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f6e2f886d4b..36f9b3fa5e15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,9 +1239,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ -int follow_pfnmap_pte(struct vm_area_struct *vma, - unsigned long address, pte_t *ret_ptep); - typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 79f28e35d4fc..6b29f39a5a3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1168,49 +1168,6 @@ no_page_table: return page; } -int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address, - pte_t *ret_ptep) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - if (!is_pfn_mapping(vma)) - goto err; - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto err; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto err; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto err; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) - goto err_unlock; - - *ret_ptep = pte; - pte_unmap_unlock(ptep, ptl); - return 0; - -err_unlock: - pte_unmap_unlock(ptep, ptl); -err: - return -EINVAL; -} - /* Can we do the FOLL_ANON optimization? 
*/ static inline int use_zero_page(struct vm_area_struct *vma) { -- cgit v1.2.3 From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 19 Dec 2008 13:47:29 -0800 Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic Impact: Cleanup and branch hints only. Move the track and untrack pfn stub routines from memory.c to asm-generic. Also add unlikely to pfnmap related calls in fork and exit path. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 6 ++---- include/asm-generic/pgtable.h | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 6 ------ mm/memory.c | 48 ++---------------------------------------- 4 files changed, 50 insertions(+), 56 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2aa792bbd7e0..875192bf72cb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -339,12 +339,10 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) +#ifndef __ASSEMBLY__ /* Indicate that x86 has its own track and untrack pfn vma functions */ -#define track_pfn_vma_new track_pfn_vma_new -#define track_pfn_vma_copy track_pfn_vma_copy -#define untrack_pfn_vma untrack_pfn_vma +#define __HAVE_PFNMAP_TRACKING -#ifndef __ASSEMBLY__ #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b84633801fb6..72ebe91005a8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -293,6 +293,52 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #define arch_flush_lazy_cpu_mode() do {} while (0) #endif +#ifndef __HAVE_PFNMAP_TRACKING +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + */ +static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + */ +static inline int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + return 0; +} + +/* + * Interface that can be used by architecture code to keep track of + * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) + * + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). 
+ */ +static inline void untrack_pfn_vma(struct vm_area_struct *vma, + unsigned long pfn, unsigned long size) +{ +} +#else +extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size); +extern int track_pfn_vma_copy(struct vm_area_struct *vma); +extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size); +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_GENERIC_PGTABLE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 36f9b3fa5e15..d3ddd735e375 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -163,12 +163,6 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma) return (vma->vm_flags & VM_PFNMAP); } -extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size); -extern int track_pfn_vma_copy(struct vm_area_struct *vma); -extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); - /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask diff --git a/mm/memory.c b/mm/memory.c index 6b29f39a5a3e..f01b7eed6e16 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -99,50 +99,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef track_pfn_vma_new -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, - unsigned long pfn, unsigned long size) -{ - return 0; -} -#endif - -#ifndef track_pfn_vma_copy -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - */ -int track_pfn_vma_copy(struct vm_area_struct *vma) -{ - return 0; -} -#endif - -#ifndef untrack_pfn_vma -/* - * Interface that can be used by architecture code to keep track of - * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn) - * - * untrack_pfn_vma is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). 
- */ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) -{ -} -#endif - static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -713,7 +669,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (is_pfn_mapping(vma)) { + if (unlikely(is_pfn_mapping(vma))) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -969,7 +925,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - if (is_pfn_mapping(vma)) + if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); while (start != end) { -- cgit v1.2.3 From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 19 Dec 2008 15:10:24 +0100 Subject: x86, bts: add fork and exit handling Impact: introduce new ptrace facility Add arch_ptrace_untrace() function that is called when the tracer detaches (either voluntarily or when the tracing task dies); ptrace_disable() is only called on a voluntary detach. Add ptrace_fork() and arch_ptrace_fork(). They are called when a traced task is forked. Clear DS and BTS related fields on fork. Release DS resources and reclaim memory in ptrace_untrace(). This releases resources already when the tracing task dies. We used to do that when the traced task dies. Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 ++++++++ arch/x86/include/asm/ptrace.h | 7 ++++++ arch/x86/kernel/ds.c | 11 ++++++++++ arch/x86/kernel/process_32.c | 20 ++++++++--------- arch/x86/kernel/process_64.c | 20 ++++++++--------- arch/x86/kernel/ptrace.c | 50 ++++++++++++++++++++++++++++++++++--------- include/linux/ptrace.h | 22 +++++++++++++++++++ kernel/fork.c | 2 ++ kernel/ptrace.c | 12 +++++++++++ 9 files changed, 121 insertions(+), 32 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index ee0ea3a96c11..a8f672ba100c 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); + #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index fbf744215911..6d34d954c228 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) 
x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 98d271e60e08..da91701a2348 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(next->thread.debugctlmsr); } + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) +{ + WARN_ON(tsk->thread.ds_ctx); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 605eff9a8ac0..3ba155d24884 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -60,6 +60,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -251,17 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. */ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1cfd2a4bf853..416fb9282f4f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); @@ -236,17 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any BTS tracers that have not been properly released. 
*/ - if (unlikely(current->bts)) { - ds_release_bts(current->bts); - current->bts = NULL; - - kfree(current->bts_buffer); - current->bts_buffer = NULL; - current->bts_size = 0; - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45e9855da2d2..6ad2bb607650 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } + +static void ptrace_bts_fork(struct task_struct *tsk) +{ + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; +} + +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} + +static void ptrace_bts_detach(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + ptrace_bts_fork(child); +} + +void x86_ptrace_untrace(struct task_struct *child) +{ + ptrace_bts_untrace(child); +} + /* * Called by kernel/ptrace.c when detaching.. * @@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; - } -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 22641d5d45df..98b93ca4db06 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_stop(code, info) do { } while (0) #endif +#ifndef arch_ptrace_untrace +/* + * Do machine-specific work before untracing child. + * + * This is called for a normal detach as well as from ptrace_exit() + * when the tracing task dies. + * + * Called with write_lock(&tasklist_lock) held. + */ +#define arch_ptrace_untrace(task) do { } while (0) +#endif + +#ifndef arch_ptrace_fork +/* + * Do machine-specific work to initialize a new task. + * + * This is called from copy_process(). 
+ */ +#define arch_ptrace_fork(child, clone_flags) do { } while (0) +#endif + extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/kernel/fork.c b/kernel/fork.c index 7b93da72d4a2..65ce60adc8e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif + if (unlikely(ptrace_reparented(current))) + ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4c8bcd7dd8e0..100a71cfdaba 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -25,6 +25,17 @@ #include #include + +/* + * Initialize a new task whose father had been ptraced. + * + * Called from copy_process(). + */ +void ptrace_fork(struct task_struct *child, unsigned long clone_flags) +{ + arch_ptrace_fork(child, clone_flags); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } -- cgit v1.2.3 From 0ca59dd948a51c95d5a366d35f897bc5ef9df55d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 23:30:02 +0100 Subject: tracing/ftrace: don't trace on early stage of a secondary cpu boot, v3 Impact: fix a crash/hard-reboot on certain configs while enabling a cpu at runtime On some archs, the boot of a secondary cpu goes through an early, fragile state. On x86-64, the pda is not initialized during the first stage of a cpu boot, but it is needed to get the cpu number and the current task pointer, and both are needed during tracing. Because they were dereferenced at this early stage, we got a crash while tracing a cpu that was being enabled at runtime. Other archs, such as ia64, can have the same kind of issue. Changes in v2: we dropped the previous solution of a per-arch function called to guess the current state of a cpu, since that could slow down tracing. This patch instead removes the -pg flag for arch/x86/kernel/cpu/common.c, where the low-level cpu boot functions live, and disables tracing of start_secondary() and of a helper function used at this stage. 
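For illustration, a minimal sketch of the function-level mechanism (not part of this patch; the function name is hypothetical). The file-level variant is the CFLAGS_REMOVE_common.o = -pg hunk in the Makefile below:

	#include <linux/compiler.h>	/* for notrace */

	/*
	 * Hypothetical helper that runs during early secondary-cpu bringup.
	 * notrace suppresses the mcount hook the compiler emits under -pg,
	 * so the function tracer cannot fire before the pda is initialized.
	 */
	notrace static void early_secondary_setup(void)
	{
		/* per-cpu data is not usable yet; keep the tracer out */
	}
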
Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 3 ++- arch/x86/kernel/cpu/Makefile | 5 +++++ arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde89..b8a1799ea871 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr, asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } -static inline int native_write_msr_safe(unsigned int msr, +/* Can be uninlined because referenced by paravirt */ +notrace static inline int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int err; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c057..4ae495a313f3 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,6 +2,11 @@ # Makefile for x86-compatible CPU details and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f71f96fc9e62..f6174d229024 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -287,7 +287,7 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too -- cgit v1.2.3 From fc5243d98ac2575ad14a974b3c097e9ba874c03d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 25 Dec 2008 13:38:35 +0100 Subject: [S390] arch_setup_additional_pages arguments arch_setup_additional_pages currently gets two arguments: the binary format description and an indication whether the process uses an executable stack. The second argument is not used by anybody; it could be removed without replacement. What actually does make sense is to pass an indication whether the process uses the ELF interpreter. The glibc code will not use anything from the vdso if the process does not use the dynamic linker, so for statically linked binaries the architecture backend can choose not to map the vdso. 
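As a sketch of what the new argument enables in an architecture backend (illustration only, not one of the hunks below):

	int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
	{
		/*
		 * Statically linked binary: no dynamic linker, so nothing
		 * in userspace will ever look at the vdso -- skip mapping it.
		 */
		if (!uses_interp)
			return 0;

		/* ... map the vdso as the architecture did before ... */
		return 0;
	}
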
Acked-by: Ingo Molnar Signed-off-by: Martin Schwidefsky --- arch/powerpc/include/asm/elf.h | 2 +- arch/powerpc/kernel/vdso.c | 3 +-- arch/sh/include/asm/elf.h | 2 +- arch/sh/kernel/vsyscall/vsyscall.c | 3 +-- arch/x86/include/asm/elf.h | 2 +- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- fs/binfmt_elf.c | 2 +- 8 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index d812929390e4..cd46f023ec6d 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -267,7 +267,7 @@ extern int ucache_bsize; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 65639a43e644..f7ec7d0888fe 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -184,8 +184,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma) * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ -int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; struct page **vdso_pagelist; diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h index 9eb9036a1bdc..9381397ebeb8 100644 --- a/arch/sh/include/asm/elf.h +++ b/arch/sh/include/asm/elf.h @@ -204,7 +204,7 @@ do { \ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern unsigned int vdso_enabled; extern void __kernel_vsyscall; diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c index 95f4de0800ec..3f7e415be86a 100644 --- a/arch/sh/kernel/vsyscall/vsyscall.c +++ b/arch/sh/kernel/vsyscall/vsyscall.c @@ -59,8 +59,7 @@ int __init vsyscall_init(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 40ca1bea7916..f51a3ddde01a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -325,7 +325,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 513f330c5832..1241f118ab56 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -310,7 +310,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 257ba4a10abf..9c98cc6ba978 
100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8fcfa398d350..95a76ff9e01b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -949,7 +949,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = arch_setup_additional_pages(bprm, executable_stack); + retval = arch_setup_additional_pages(bprm, !!elf_interpreter); if (retval < 0) { send_sig(SIGKILL, current, 0); goto out; -- cgit v1.2.3