diff options
Diffstat (limited to 'arch')
67 files changed, 1120 insertions, 585 deletions
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fe698b5045e9..376f22130791 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0; } -static int -do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, - unsigned long bufsiz) +SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, + struct osf_statfs __user *, buffer, unsigned long, bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(path, &linux_stat); + int error = user_statfs(pathname, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; } -SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname, - struct osf_statfs __user *, buffer, unsigned long, bufsiz) -{ - struct path path; - int retval; - - retval = user_path(pathname, &path); - if (!retval) { - retval = do_osf_statfs(&path, buffer, bufsiz); - path_put(&path); - } - return retval; -} - SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd, struct osf_statfs __user *, buffer, unsigned long, bufsiz) { - struct file *file; - int retval; - - retval = -EBADF; - file = fget(fd); - if (file) { - retval = do_osf_statfs(&file->f_path, buffer, bufsiz); - fput(file); - } - return retval; + struct kstatfs linux_stat; + int error = fd_statfs(fd, &linux_stat); + if (!error) + error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); + return error; } /* diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index f6c108a3d673..8c13a0c77830 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -149,6 +149,7 @@ static int titan_set_irq_affinity(struct irq_data *d, const struct cpumask *affinity, bool force) { + unsigned int irq = d->irq; spin_lock(&titan_irq_lock); titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); diff --git a/arch/ia64/include/asm/xen/hypercall.h b/arch/ia64/include/asm/xen/hypercall.h index 96fc62366aa4..ed28bcd5bb85 100644 --- a/arch/ia64/include/asm/xen/hypercall.h +++ b/arch/ia64/include/asm/xen/hypercall.h @@ -107,7 +107,7 @@ extern unsigned long __hypercall(unsigned long a1, unsigned long a2, static inline int xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long diff --git a/arch/ia64/xen/suspend.c b/arch/ia64/xen/suspend.c index fd66b048c6fa..419c8620945a 100644 --- a/arch/ia64/xen/suspend.c +++ b/arch/ia64/xen/suspend.c @@ -37,19 +37,14 @@ xen_mm_unpin_all(void) /* nothing */ } -void xen_pre_device_suspend(void) -{ - /* nothing */ -} - void -xen_pre_suspend() +xen_arch_pre_suspend() { /* nothing */ } void -xen_post_suspend(int suspend_cancelled) +xen_arch_post_suspend(int suspend_cancelled) { if (suspend_cancelled) return; diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f5ecc0566bc2..d88983516e26 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -4,6 +4,7 @@ config MIPS select HAVE_GENERIC_DMA_COHERENT select HAVE_IDE select HAVE_OPROFILE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_ARCH_KGDB @@ -208,6 +209,7 @@ config MACH_JZ4740 select ARCH_REQUIRE_GPIOLIB select SYS_HAS_EARLY_PRINTK select HAVE_PWM + select HAVE_CLK config LASAT bool "LASAT Networks platforms" @@ -333,6 +335,8 @@ config PNX8550_STB810 config PMC_MSP bool "PMC-Sierra MSP chipsets" depends on EXPERIMENTAL + select CEVT_R4K + select CSRC_R4K select DMA_NONCOHERENT select SWAP_IO_SPACE select NO_EXCEPT_FILL diff --git a/arch/mips/alchemy/mtx-1/board_setup.c b/arch/mips/alchemy/mtx-1/board_setup.c index 6398fa95905c..40b84b991191 100644 --- a/arch/mips/alchemy/mtx-1/board_setup.c +++ b/arch/mips/alchemy/mtx-1/board_setup.c @@ -54,8 +54,8 @@ int mtx1_pci_idsel(unsigned int devsel, int assert); static void mtx1_reset(char *c) { - /* Hit BCSR.SYSTEM_CONTROL[SW_RST] */ - au_writel(0x00000000, 0xAE00001C); + /* Jump to the reset vector */ + __asm__ __volatile__("jr\t%0"::"r"(0xbfc00000)); } static void mtx1_power_off(void) diff --git a/arch/mips/alchemy/mtx-1/platform.c b/arch/mips/alchemy/mtx-1/platform.c index e30e42add697..956f946218c5 100644 --- a/arch/mips/alchemy/mtx-1/platform.c +++ b/arch/mips/alchemy/mtx-1/platform.c @@ -28,6 +28,8 @@ #include <linux/mtd/physmap.h> #include <mtd/mtd-abi.h> +#include <asm/mach-au1x00/au1xxx_eth.h> + static struct gpio_keys_button mtx1_gpio_button[] = { { .gpio = 207, @@ -140,10 +142,17 @@ static struct __initdata platform_device * mtx1_devs[] = { &mtx1_mtd, }; +static struct au1000_eth_platform_data mtx1_au1000_eth0_pdata = { + .phy_search_highest_addr = 1, + .phy1_search_mac0 = 1, +}; + static int __init mtx1_register_devices(void) { int rc; + au1xxx_override_eth_cfg(0, &mtx1_au1000_eth0_pdata); + rc = gpio_request(mtx1_gpio_button[0].gpio, mtx1_gpio_button[0].desc); if (rc < 0) { diff --git a/arch/mips/alchemy/xxs1500/board_setup.c b/arch/mips/alchemy/xxs1500/board_setup.c index b43c918925d3..80c521e5290d 100644 --- a/arch/mips/alchemy/xxs1500/board_setup.c +++ b/arch/mips/alchemy/xxs1500/board_setup.c @@ -36,8 +36,8 @@ static void xxs1500_reset(char *c) { - /* Hit BCSR.SYSTEM_CONTROL[SW_RST] */ - au_writel(0x00000000, 0xAE00001C); + /* Jump to the reset vector */ + __asm__ __volatile__("jr\t%0"::"r"(0xbfc00000)); } static void xxs1500_power_off(void) diff --git a/arch/mips/include/asm/perf_event.h b/arch/mips/include/asm/perf_event.h index e00007cf8162..d0c77496c728 100644 --- a/arch/mips/include/asm/perf_event.h +++ b/arch/mips/include/asm/perf_event.h @@ -11,15 +11,5 @@ #ifndef __MIPS_PERF_EVENT_H__ #define __MIPS_PERF_EVENT_H__ - -/* - * MIPS performance counters do not raise NMI upon overflow, a regular - * interrupt will be signaled. Hence we can do the pending perf event - * work at the tail of the irq handler. - */ -static inline void -set_perf_event_pending(void) -{ -} - +/* Leave it empty here. The file is required by linux/perf_event.h */ #endif /* __MIPS_PERF_EVENT_H__ */ diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 5a84a1f11231..94ca2b018af7 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -17,29 +17,13 @@ #include <asm/cacheflush.h> #include <asm/uasm.h> -/* - * If the Instruction Pointer is in module space (0xc0000000), return true; - * otherwise, it is in kernel space (0x80000000), return false. - * - * FIXME: This will not work when the kernel space and module space are the - * same. If they are the same, we need to modify scripts/recordmcount.pl, - * ftrace_make_nop/call() and the other related parts to ensure the - * enabling/disabling of the calling site to _mcount is right for both kernel - * and module. - */ - -static inline int in_module(unsigned long ip) -{ - return ip & 0x40000000; -} +#include <asm-generic/sections.h> #ifdef CONFIG_DYNAMIC_FTRACE #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ #define ADDR_MASK 0x03ffffff /* op_code|addr : 31...26|25 ....0 */ -#define INSN_B_1F_4 0x10000004 /* b 1f; offset = 4 */ -#define INSN_B_1F_5 0x10000005 /* b 1f; offset = 5 */ #define INSN_NOP 0x00000000 /* nop */ #define INSN_JAL(addr) \ ((unsigned int)(JAL | (((addr) >> 2) & ADDR_MASK))) @@ -69,6 +53,20 @@ static inline void ftrace_dyn_arch_init_insns(void) #endif } +/* + * Check if the address is in kernel space + * + * Clone core_kernel_text() from kernel/extable.c, but doesn't call + * init_kernel_text() for Ftrace doesn't trace functions in init sections. + */ +static inline int in_kernel_space(unsigned long ip) +{ + if (ip >= (unsigned long)_stext && + ip <= (unsigned long)_etext) + return 1; + return 0; +} + static int ftrace_modify_code(unsigned long ip, unsigned int new_code) { int faulted; @@ -84,6 +82,42 @@ static int ftrace_modify_code(unsigned long ip, unsigned int new_code) return 0; } +/* + * The details about the calling site of mcount on MIPS + * + * 1. For kernel: + * + * move at, ra + * jal _mcount --> nop + * + * 2. For modules: + * + * 2.1 For KBUILD_MCOUNT_RA_ADDRESS and CONFIG_32BIT + * + * lui v1, hi_16bit_of_mcount --> b 1f (0x10000005) + * addiu v1, v1, low_16bit_of_mcount + * move at, ra + * move $12, ra_address + * jalr v1 + * sub sp, sp, 8 + * 1: offset = 5 instructions + * 2.2 For the Other situations + * + * lui v1, hi_16bit_of_mcount --> b 1f (0x10000004) + * addiu v1, v1, low_16bit_of_mcount + * move at, ra + * jalr v1 + * nop | move $12, ra_address | sub sp, sp, 8 + * 1: offset = 4 instructions + */ + +#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) +#define MCOUNT_OFFSET_INSNS 5 +#else +#define MCOUNT_OFFSET_INSNS 4 +#endif +#define INSN_B_1F (0x10000000 | MCOUNT_OFFSET_INSNS) + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -91,39 +125,11 @@ int ftrace_make_nop(struct module *mod, unsigned long ip = rec->ip; /* - * We have compiled module with -mlong-calls, but compiled the kernel - * without it, we need to cope with them respectively. + * If ip is in kernel space, no long call, otherwise, long call is + * needed. */ - if (in_module(ip)) { -#if defined(KBUILD_MCOUNT_RA_ADDRESS) && defined(CONFIG_32BIT) - /* - * lui v1, hi_16bit_of_mcount --> b 1f (0x10000005) - * addiu v1, v1, low_16bit_of_mcount - * move at, ra - * move $12, ra_address - * jalr v1 - * sub sp, sp, 8 - * 1: offset = 5 instructions - */ - new = INSN_B_1F_5; -#else - /* - * lui v1, hi_16bit_of_mcount --> b 1f (0x10000004) - * addiu v1, v1, low_16bit_of_mcount - * move at, ra - * jalr v1 - * nop | move $12, ra_address | sub sp, sp, 8 - * 1: offset = 4 instructions - */ - new = INSN_B_1F_4; -#endif - } else { - /* - * move at, ra - * jal _mcount --> nop - */ - new = INSN_NOP; - } + new = in_kernel_space(ip) ? INSN_NOP : INSN_B_1F; + return ftrace_modify_code(ip, new); } @@ -132,8 +138,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) unsigned int new; unsigned long ip = rec->ip; - /* ip, module: 0xc0000000, kernel: 0x80000000 */ - new = in_module(ip) ? insn_lui_v1_hi16_mcount : insn_jal_ftrace_caller; + new = in_kernel_space(ip) ? insn_jal_ftrace_caller : + insn_lui_v1_hi16_mcount; return ftrace_modify_code(ip, new); } @@ -190,29 +196,25 @@ int ftrace_disable_ftrace_graph_caller(void) #define S_R_SP (0xafb0 << 16) /* s{d,w} R, offset(sp) */ #define OFFSET_MASK 0xffff /* stack offset range: 0 ~ PT_SIZE */ -unsigned long ftrace_get_parent_addr(unsigned long self_addr, - unsigned long parent, - unsigned long parent_addr, - unsigned long fp) +unsigned long ftrace_get_parent_ra_addr(unsigned long self_ra, unsigned long + old_parent_ra, unsigned long parent_ra_addr, unsigned long fp) { - unsigned long sp, ip, ra; + unsigned long sp, ip, tmp; unsigned int code; int faulted; /* - * For module, move the ip from calling site of mcount to the - * instruction "lui v1, hi_16bit_of_mcount"(offset is 20), but for - * kernel, move to the instruction "move ra, at"(offset is 12) + * For module, move the ip from the return address after the + * instruction "lui v1, hi_16bit_of_mcount"(offset is 24), but for + * kernel, move after the instruction "move ra, at"(offset is 16) */ - ip = self_addr - (in_module(self_addr) ? 20 : 12); + ip = self_ra - (in_kernel_space(self_ra) ? 16 : 24); /* * search the text until finding the non-store instruction or "s{d,w} * ra, offset(sp)" instruction */ do { - ip -= 4; - /* get the code at "ip": code = *(unsigned int *)ip; */ safe_load_code(code, ip, faulted); @@ -224,18 +226,20 @@ unsigned long ftrace_get_parent_addr(unsigned long self_addr, * store the ra on the stack */ if ((code & S_R_SP) != S_R_SP) - return parent_addr; + return parent_ra_addr; - } while (((code & S_RA_SP) != S_RA_SP)); + /* Move to the next instruction */ + ip -= 4; + } while ((code & S_RA_SP) != S_RA_SP); sp = fp + (code & OFFSET_MASK); - /* ra = *(unsigned long *)sp; */ - safe_load_stack(ra, sp, faulted); + /* tmp = *(unsigned long *)sp; */ + safe_load_stack(tmp, sp, faulted); if (unlikely(faulted)) return 0; - if (ra == parent) + if (tmp == old_parent_ra) return sp; return 0; } @@ -246,21 +250,21 @@ unsigned long ftrace_get_parent_addr(unsigned long self_addr, * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra, unsigned long fp) { - unsigned long old; + unsigned long old_parent_ra; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) &return_to_handler; - int faulted; + int faulted, insns; if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; /* - * "parent" is the stack address saved the return address of the caller - * of _mcount. + * "parent_ra_addr" is the stack address saved the return address of + * the caller of _mcount. * * if the gcc < 4.5, a leaf function does not save the return address * in the stack address, so, we "emulate" one in _mcount's stack space, @@ -275,37 +279,44 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * do it in ftrace_graph_caller of mcount.S. */ - /* old = *parent; */ - safe_load_stack(old, parent, faulted); + /* old_parent_ra = *parent_ra_addr; */ + safe_load_stack(old_parent_ra, parent_ra_addr, faulted); if (unlikely(faulted)) goto out; #ifndef KBUILD_MCOUNT_RA_ADDRESS - parent = (unsigned long *)ftrace_get_parent_addr(self_addr, old, - (unsigned long)parent, fp); + parent_ra_addr = (unsigned long *)ftrace_get_parent_ra_addr(self_ra, + old_parent_ra, (unsigned long)parent_ra_addr, fp); /* * If fails when getting the stack address of the non-leaf function's * ra, stop function graph tracer and return */ - if (parent == 0) + if (parent_ra_addr == 0) goto out; #endif - /* *parent = return_hooker; */ - safe_store_stack(return_hooker, parent, faulted); + /* *parent_ra_addr = return_hooker; */ + safe_store_stack(return_hooker, parent_ra_addr, faulted); if (unlikely(faulted)) goto out; - if (ftrace_push_return_trace(old, self_addr, &trace.depth, fp) == - -EBUSY) { - *parent = old; + if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp) + == -EBUSY) { + *parent_ra_addr = old_parent_ra; return; } - trace.func = self_addr; + /* + * Get the recorded ip of the current mcount calling site in the + * __mcount_loc section, which will be used to filter the function + * entries configured through the tracing/set_graph_function interface. + */ + + insns = in_kernel_space(self_ra) ? 2 : MCOUNT_OFFSET_INSNS + 1; + trace.func = self_ra - (MCOUNT_INSN_SIZE * insns); /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) { current->curr_ret_stack--; - *parent = old; + *parent_ra_addr = old_parent_ra; } return; out: diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 2b7f3f703b83..a8244854d3dc 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -161,41 +161,6 @@ mipspmu_event_set_period(struct perf_event *event, return ret; } -static int mipspmu_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; - int err = 0; - - /* To look for a free counter for this event. */ - idx = mipspmu->alloc_counter(cpuc, hwc); - if (idx < 0) { - err = idx; - goto out; - } - - /* - * If there is an event in the counter we are going to use then - * make sure it is disabled. - */ - event->hw.idx = idx; - mipspmu->disable_event(idx); - cpuc->events[idx] = event; - - /* Set the period for the event. */ - mipspmu_event_set_period(event, hwc, idx); - - /* Enable the event. */ - mipspmu->enable_event(hwc, idx); - - /* Propagate our changes to the userspace mapping. */ - perf_event_update_userpage(event); - -out: - return err; -} - static void mipspmu_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) @@ -204,7 +169,7 @@ static void mipspmu_event_update(struct perf_event *event, unsigned long flags; int shift = 64 - TOTAL_BITS; s64 prev_raw_count, new_raw_count; - s64 delta; + u64 delta; again: prev_raw_count = local64_read(&hwc->prev_count); @@ -231,32 +196,90 @@ again: return; } -static void mipspmu_disable(struct perf_event *event) +static void mipspmu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!mipspmu) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* Set the period for the event. */ + mipspmu_event_set_period(event, hwc, hwc->idx); + + /* Enable the event. */ + mipspmu->enable_event(hwc, hwc->idx); +} + +static void mipspmu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!mipspmu) + return; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* We are working on a local event. */ + mipspmu->disable_event(hwc->idx); + barrier(); + mipspmu_event_update(event, hwc, hwc->idx); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static int mipspmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; + int idx; + int err = 0; + perf_pmu_disable(event->pmu); - WARN_ON(idx < 0 || idx >= mipspmu->num_counters); + /* To look for a free counter for this event. */ + idx = mipspmu->alloc_counter(cpuc, hwc); + if (idx < 0) { + err = idx; + goto out; + } - /* We are working on a local event. */ + /* + * If there is an event in the counter we are going to use then + * make sure it is disabled. + */ + event->hw.idx = idx; mipspmu->disable_event(idx); + cpuc->events[idx] = event; - barrier(); - - mipspmu_event_update(event, hwc, idx); - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + mipspmu_start(event, PERF_EF_RELOAD); + /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); + +out: + perf_pmu_enable(event->pmu); + return err; } -static void mipspmu_unthrottle(struct perf_event *event) +static void mipspmu_del(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; - mipspmu->enable_event(hwc, hwc->idx); + WARN_ON(idx < 0 || idx >= mipspmu->num_counters); + + mipspmu_stop(event, PERF_EF_UPDATE); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); } static void mipspmu_read(struct perf_event *event) @@ -270,12 +293,17 @@ static void mipspmu_read(struct perf_event *event) mipspmu_event_update(event, hwc, hwc->idx); } -static struct pmu pmu = { - .enable = mipspmu_enable, - .disable = mipspmu_disable, - .unthrottle = mipspmu_unthrottle, - .read = mipspmu_read, -}; +static void mipspmu_enable(struct pmu *pmu) +{ + if (mipspmu) + mipspmu->start(); +} + +static void mipspmu_disable(struct pmu *pmu) +{ + if (mipspmu) + mipspmu->stop(); +} static atomic_t active_events = ATOMIC_INIT(0); static DEFINE_MUTEX(pmu_reserve_mutex); @@ -318,6 +346,82 @@ static void mipspmu_free_irq(void) perf_irq = save_perf_irq; } +/* + * mipsxx/rm9000/loongson2 have different performance counters, they have + * specific low-level init routines. + */ +static void reset_counters(void *arg); +static int __hw_perf_event_init(struct perf_event *event); + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, + &pmu_reserve_mutex)) { + /* + * We must not call the destroy function with interrupts + * disabled. + */ + on_each_cpu(reset_counters, + (void *)(long)mipspmu->num_counters, 1); + mipspmu_free_irq(); + mutex_unlock(&pmu_reserve_mutex); + } +} + +static int mipspmu_event_init(struct perf_event *event) +{ + int err = 0; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + if (!mipspmu || event->cpu >= nr_cpumask_bits || + (event->cpu >= 0 && !cpu_online(event->cpu))) + return -ENODEV; + + if (!atomic_inc_not_zero(&active_events)) { + if (atomic_read(&active_events) > MIPS_MAX_HWEVENTS) { + atomic_dec(&active_events); + return -ENOSPC; + } + + mutex_lock(&pmu_reserve_mutex); + if (atomic_read(&active_events) == 0) + err = mipspmu_get_irq(); + + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmu_reserve_mutex); + } + + if (err) + return err; + + err = __hw_perf_event_init(event); + if (err) + hw_perf_event_destroy(event); + + return err; +} + +static struct pmu pmu = { + .pmu_enable = mipspmu_enable, + .pmu_disable = mipspmu_disable, + .event_init = mipspmu_event_init, + .add = mipspmu_add, + .del = mipspmu_del, + .start = mipspmu_start, + .stop = mipspmu_stop, + .read = mipspmu_read, +}; + static inline unsigned int mipspmu_perf_event_encode(const struct mips_perf_event *pev) { @@ -382,8 +486,9 @@ static int validate_event(struct cpu_hw_events *cpuc, { struct hw_perf_event fake_hwc = event->hw; - if (event->pmu && event->pmu != &pmu) - return 0; + /* Allow mixed event group. So return 1 to pass validation. */ + if (event->pmu != &pmu || event->state <= PERF_EVENT_STATE_OFF) + return 1; return mipspmu->alloc_counter(cpuc, &fake_hwc) >= 0; } @@ -409,73 +514,6 @@ static int validate_group(struct perf_event *event) return 0; } -/* - * mipsxx/rm9000/loongson2 have different performance counters, they have - * specific low-level init routines. - */ -static void reset_counters(void *arg); -static int __hw_perf_event_init(struct perf_event *event); - -static void hw_perf_event_destroy(struct perf_event *event) -{ - if (atomic_dec_and_mutex_lock(&active_events, - &pmu_reserve_mutex)) { - /* - * We must not call the destroy function with interrupts - * disabled. - */ - on_each_cpu(reset_counters, - (void *)(long)mipspmu->num_counters, 1); - mipspmu_free_irq(); - mutex_unlock(&pmu_reserve_mutex); - } -} - -const struct pmu *hw_perf_event_init(struct perf_event *event) -{ - int err = 0; - - if (!mipspmu || event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) - return ERR_PTR(-ENODEV); - - if (!atomic_inc_not_zero(&active_events)) { - if (atomic_read(&active_events) > MIPS_MAX_HWEVENTS) { - atomic_dec(&active_events); - return ERR_PTR(-ENOSPC); - } - - mutex_lock(&pmu_reserve_mutex); - if (atomic_read(&active_events) == 0) - err = mipspmu_get_irq(); - - if (!err) - atomic_inc(&active_events); - mutex_unlock(&pmu_reserve_mutex); - } - - if (err) - return ERR_PTR(err); - - err = __hw_perf_event_init(event); - if (err) - hw_perf_event_destroy(event); - - return err ? ERR_PTR(err) : &pmu; -} - -void hw_perf_enable(void) -{ - if (mipspmu) - mipspmu->start(); -} - -void hw_perf_disable(void) -{ - if (mipspmu) - mipspmu->stop(); -} - /* This is needed by specific irq handlers in perf_event_*.c */ static void handle_associated_event(struct cpu_hw_events *cpuc, @@ -496,21 +534,13 @@ handle_associated_event(struct cpu_hw_events *cpuc, #include "perf_event_mipsxx.c" /* Callchain handling code. */ -static inline void -callchain_store(struct perf_callchain_entry *entry, - u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} /* * Leave userspace callchain empty for now. When we find a way to trace * the user stack callchains, we add here. */ -static void -perf_callchain_user(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -523,23 +553,21 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, while (!kstack_end(sp)) { addr = *sp++; if (__kernel_text_address(addr)) { - callchain_store(entry, addr); + perf_callchain_store(entry, addr); if (entry->nr >= PERF_MAX_STACK_DEPTH) break; } } } -static void -perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long sp = regs->regs[29]; #ifdef CONFIG_KALLSYMS unsigned long ra = regs->regs[31]; unsigned long pc = regs->cp0_epc; - callchain_store(entry, PERF_CONTEXT_KERNEL); if (raw_show_trace || !__kernel_text_address(pc)) { unsigned long stack_page = (unsigned long)task_stack_page(current); @@ -549,53 +577,12 @@ perf_callchain_kernel(struct pt_regs *regs, return; } do { - callchain_store(entry, pc); + perf_callchain_store(entry, pc); if (entry->nr >= PERF_MAX_STACK_DEPTH) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); #else - callchain_store(entry, PERF_CONTEXT_KERNEL); save_raw_perf_callchain(entry, sp); #endif } - -static void -perf_do_callchain(struct pt_regs *regs, - struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || !current->pid) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - if (regs) - perf_callchain_user(regs, entry); -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); - -struct perf_callchain_entry * -perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - perf_do_callchain(regs, entry); - return entry; -} diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 183e0d226669..d9a7db78ed62 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -696,7 +696,7 @@ static int mipsxx_pmu_handle_shared_irq(void) * interrupt, not NMI. */ if (handled == IRQ_HANDLED) - perf_event_do_pending(); + irq_work_run(); #ifdef CONFIG_MIPS_MT_SMP read_unlock(&pmuint_rwlock); @@ -1045,6 +1045,8 @@ init_hw_perf_events(void) "CPU, irq %d%s\n", mipspmu->name, counters, irq, irq < 0 ? " (share with timer interrupt)" : ""); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + return 0; } early_initcall(init_hw_perf_events); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 5922342bca39..dbbe0ce48d89 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -84,7 +84,7 @@ static int protected_save_fp_context(struct sigcontext __user *sc) static int protected_restore_fp_context(struct sigcontext __user *sc) { - int err, tmp; + int err, tmp __maybe_unused; while (1) { lock_fpu_owner(); own_fpu_inatomic(0); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index a0ed0e052b2e..aae986613795 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -115,7 +115,7 @@ static int protected_save_fp_context32(struct sigcontext32 __user *sc) static int protected_restore_fp_context32(struct sigcontext32 __user *sc) { - int err, tmp; + int err, tmp __maybe_unused; while (1) { lock_fpu_owner(); own_fpu_inatomic(0); diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 383aeb95cb49..32a256101082 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -193,6 +193,22 @@ void __devinit smp_prepare_boot_cpu(void) */ static struct task_struct *cpu_idle_thread[NR_CPUS]; +struct create_idle { + struct work_struct work; + struct task_struct *idle; + struct completion done; + int cpu; +}; + +static void __cpuinit do_fork_idle(struct work_struct *work) +{ + struct create_idle *c_idle = + container_of(work, struct create_idle, work); + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + int __cpuinit __cpu_up(unsigned int cpu) { struct task_struct *idle; @@ -203,8 +219,19 @@ int __cpuinit __cpu_up(unsigned int cpu) * Linux can schedule processes on this slave. */ if (!cpu_idle_thread[cpu]) { - idle = fork_idle(cpu); - cpu_idle_thread[cpu] = idle; + /* + * Schedule work item to avoid forking user task + * Ported from arch/x86/kernel/smpboot.c + */ + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), + }; + + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); + idle = cpu_idle_thread[cpu] = c_idle.idle; if (IS_ERR(idle)) panic(KERN_ERR "Fork failed for CPU %d", cpu); diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 1dc6edff45e0..58beabf50b3c 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -383,12 +383,11 @@ save_static_function(sys_sysmips); static int __used noinline _sys_sysmips(nabi_no_regargs struct pt_regs regs) { - long cmd, arg1, arg2, arg3; + long cmd, arg1, arg2; cmd = regs.regs[4]; arg1 = regs.regs[5]; arg2 = regs.regs[6]; - arg3 = regs.regs[7]; switch (cmd) { case MIPS_ATOMIC_SET: @@ -405,7 +404,7 @@ _sys_sysmips(nabi_no_regargs struct pt_regs regs) if (arg1 & 2) set_thread_flag(TIF_LOGADE); else - clear_thread_flag(TIF_FIXADE); + clear_thread_flag(TIF_LOGADE); return 0; diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 6a1fdfef8fde..ab52b7cf3b6b 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -148,9 +148,9 @@ struct { spinlock_t tc_list_lock; struct list_head tc_list; /* Thread contexts */ } vpecontrol = { - .vpe_list_lock = SPIN_LOCK_UNLOCKED, + .vpe_list_lock = __SPIN_LOCK_UNLOCKED(vpe_list_lock), .vpe_list = LIST_HEAD_INIT(vpecontrol.vpe_list), - .tc_list_lock = SPIN_LOCK_UNLOCKED, + .tc_list_lock = __SPIN_LOCK_UNLOCKED(tc_list_lock), .tc_list = LIST_HEAD_INIT(vpecontrol.tc_list) }; diff --git a/arch/mips/loongson/Kconfig b/arch/mips/loongson/Kconfig index 6e1b77fec7ea..aca93eed8779 100644 --- a/arch/mips/loongson/Kconfig +++ b/arch/mips/loongson/Kconfig @@ -1,6 +1,7 @@ +if MACH_LOONGSON + choice prompt "Machine Type" - depends on MACH_LOONGSON config LEMOTE_FULOONG2E bool "Lemote Fuloong(2e) mini-PC" @@ -87,3 +88,5 @@ config LOONGSON_UART_BASE config LOONGSON_MC146818 bool default n + +endif # MACH_LOONGSON diff --git a/arch/mips/loongson/common/cmdline.c b/arch/mips/loongson/common/cmdline.c index 1a06defc4f7f..353e1d2e41a5 100644 --- a/arch/mips/loongson/common/cmdline.c +++ b/arch/mips/loongson/common/cmdline.c @@ -44,10 +44,5 @@ void __init prom_init_cmdline(void) strcat(arcs_cmdline, " "); } - if ((strstr(arcs_cmdline, "console=")) == NULL) - strcat(arcs_cmdline, " console=ttyS0,115200"); - if ((strstr(arcs_cmdline, "root=")) == NULL) - strcat(arcs_cmdline, " root=/dev/hda1"); - prom_init_machtype(); } diff --git a/arch/mips/loongson/common/machtype.c b/arch/mips/loongson/common/machtype.c index 81fbe6b73f91..2efd5d9dee27 100644 --- a/arch/mips/loongson/common/machtype.c +++ b/arch/mips/loongson/common/machtype.c @@ -41,7 +41,7 @@ void __weak __init mach_prom_init_machtype(void) void __init prom_init_machtype(void) { - char *p, str[MACHTYPE_LEN]; + char *p, str[MACHTYPE_LEN + 1]; int machtype = MACH_LEMOTE_FL2E; mips_machtype = LOONGSON_MACHTYPE; @@ -53,6 +53,7 @@ void __init prom_init_machtype(void) } p += strlen("machtype="); strncpy(str, p, MACHTYPE_LEN); + str[MACHTYPE_LEN] = '\0'; p = strstr(str, " "); if (p) *p = '\0'; diff --git a/arch/mips/math-emu/ieee754int.h b/arch/mips/math-emu/ieee754int.h index 2701d9500959..2a7d43f4f161 100644 --- a/arch/mips/math-emu/ieee754int.h +++ b/arch/mips/math-emu/ieee754int.h @@ -70,7 +70,7 @@ #define COMPXSP \ - unsigned xm; int xe; int xs; int xc + unsigned xm; int xe; int xs __maybe_unused; int xc #define COMPYSP \ unsigned ym; int ye; int ys; int yc @@ -104,7 +104,7 @@ #define COMPXDP \ -u64 xm; int xe; int xs; int xc +u64 xm; int xe; int xs __maybe_unused; int xc #define COMPYDP \ u64 ym; int ye; int ys; int yc diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 2efcbd24c82f..279599e9a779 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -324,7 +324,7 @@ int page_is_ram(unsigned long pagenr) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long lastpfn; + unsigned long lastpfn __maybe_unused; pagetable_init(); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 083d3412d0bc..04f9e17db9d0 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -109,6 +109,8 @@ static bool scratchpad_available(void) static int scratchpad_offset(int i) { BUG(); + /* Really unreachable, but evidently some GCC want this. */ + return 0; } #endif /* diff --git a/arch/mips/pci/ops-pmcmsp.c b/arch/mips/pci/ops-pmcmsp.c index b7c03d80c88c..68798f869c0f 100644 --- a/arch/mips/pci/ops-pmcmsp.c +++ b/arch/mips/pci/ops-pmcmsp.c @@ -308,7 +308,7 @@ static struct resource pci_mem_resource = { * RETURNS: PCIBIOS_SUCCESSFUL - success * ****************************************************************************/ -static int bpci_interrupt(int irq, void *dev_id) +static irqreturn_t bpci_interrupt(int irq, void *dev_id) { struct msp_pci_regs *preg = (void *)PCI_BASE_REG; unsigned int stat = preg->if_status; @@ -326,7 +326,7 @@ static int bpci_interrupt(int irq, void *dev_id) /* write to clear all asserted interrupts */ preg->if_status = stat; - return PCIBIOS_SUCCESSFUL; + return IRQ_HANDLED; } /***************************************************************************** diff --git a/arch/mips/pmc-sierra/Kconfig b/arch/mips/pmc-sierra/Kconfig index c139988bb85d..8d798497c614 100644 --- a/arch/mips/pmc-sierra/Kconfig +++ b/arch/mips/pmc-sierra/Kconfig @@ -4,15 +4,11 @@ choice config PMC_MSP4200_EVAL bool "PMC-Sierra MSP4200 Eval Board" - select CEVT_R4K - select CSRC_R4K select IRQ_MSP_SLP select HW_HAS_PCI config PMC_MSP4200_GW bool "PMC-Sierra MSP4200 VoIP Gateway" - select CEVT_R4K - select CSRC_R4K select IRQ_MSP_SLP select HW_HAS_PCI diff --git a/arch/mips/pmc-sierra/msp71xx/msp_time.c b/arch/mips/pmc-sierra/msp71xx/msp_time.c index cca64e15f57f..01df84ce31e2 100644 --- a/arch/mips/pmc-sierra/msp71xx/msp_time.c +++ b/arch/mips/pmc-sierra/msp71xx/msp_time.c @@ -81,7 +81,7 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_rate/2; } -unsigned int __init get_c0_compare_int(void) +unsigned int __cpuinit get_c0_compare_int(void) { return MSP_INT_VPE0_TIMER; } diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 92d2f9298e38..9d773a639513 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -139,7 +139,7 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m, * Atomically reads the value of @v. Note that the guaranteed * useful range of an atomic_t is only 24 bits. */ -#define atomic_read(v) ((v)->counter) +#define atomic_read(v) (ACCESS_ONCE((v)->counter)) /** * atomic_set - set atomic variable diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h index 679dee0bbd08..3d6e60dad9d9 100644 --- a/arch/mn10300/include/asm/uaccess.h +++ b/arch/mn10300/include/asm/uaccess.h @@ -160,9 +160,10 @@ struct __large_struct { unsigned long buf[100]; }; #define __get_user_check(x, ptr, size) \ ({ \ + const __typeof__(ptr) __guc_ptr = (ptr); \ int _e; \ - if (likely(__access_ok((unsigned long) (ptr), (size)))) \ - _e = __get_user_nocheck((x), (ptr), (size)); \ + if (likely(__access_ok((unsigned long) __guc_ptr, (size)))) \ + _e = __get_user_nocheck((x), __guc_ptr, (size)); \ else { \ _e = -EFAULT; \ (x) = (__typeof__(x))0; \ diff --git a/arch/mn10300/mm/cache-inv-icache.c b/arch/mn10300/mm/cache-inv-icache.c index a8933a60b2d4..a6b63dde603d 100644 --- a/arch/mn10300/mm/cache-inv-icache.c +++ b/arch/mn10300/mm/cache-inv-icache.c @@ -69,7 +69,7 @@ static void flush_icache_page_range(unsigned long start, unsigned long end) /* invalidate the icache coverage on that region */ mn10300_local_icache_inv_range2(addr + off, size); - smp_cache_call(SMP_ICACHE_INV_FLUSH_RANGE, start, end); + smp_cache_call(SMP_ICACHE_INV_RANGE, start, end); } /** @@ -101,7 +101,7 @@ void flush_icache_range(unsigned long start, unsigned long end) * directly */ start_page = (start >= 0x80000000UL) ? start : 0x80000000UL; mn10300_icache_inv_range(start_page, end); - smp_cache_call(SMP_ICACHE_INV_FLUSH_RANGE, start, end); + smp_cache_call(SMP_ICACHE_INV_RANGE, start, end); if (start_page == start) goto done; end = start_page; diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 30394081d9b6..6ab9580b0b00 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -185,26 +185,21 @@ struct hpux_statfs { int16_t f_pad; }; -static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) +static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - memset(buf, 0, sizeof(*buf)); - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid[0] = st.f_fsid.val[0]; - buf->f_fsid[1] = st.f_fsid.val[1]; - + struct hpux_statfs buf; + memset(&buf, 0, sizeof(buf)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid[0] = st->f_fsid.val[0]; + buf.f_fsid[1] = st->f_fsid.val[1]; + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } @@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) asmlinkage long hpux_statfs(const char __user *pathname, struct hpux_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct hpux_statfs tmp; - error = do_statfs_hpux(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) { - struct file *file; - struct hpux_statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_hpux(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); - out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_hpux(&st, buf); return error; } diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 380d48bacd16..26b8c807f8f1 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -33,9 +33,25 @@ // //---------------------------------------------------------------------------- #include <linux/cache.h> +#include <linux/threads.h> #include <asm/types.h> #include <asm/mmu.h> +/* + * We only have to have statically allocated lppaca structs on + * legacy iSeries, which supports at most 64 cpus. + */ +#ifdef CONFIG_PPC_ISERIES +#if NR_CPUS < 64 +#define NR_LPPACAS NR_CPUS +#else +#define NR_LPPACAS 64 +#endif +#else /* not iSeries */ +#define NR_LPPACAS 1 +#endif + + /* The Hypervisor barfs if the lppaca crosses a page boundary. A 1k * alignment is sufficient to prevent this */ struct lppaca { diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index ebf9846f3c3b..f4adf89d7614 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -27,20 +27,6 @@ extern unsigned long __toc_start; #ifdef CONFIG_PPC_BOOK3S /* - * We only have to have statically allocated lppaca structs on - * legacy iSeries, which supports at most 64 cpus. - */ -#ifdef CONFIG_PPC_ISERIES -#if NR_CPUS < 64 -#define NR_LPPACAS NR_CPUS -#else -#define NR_LPPACAS 64 -#endif -#else /* not iSeries */ -#define NR_LPPACAS 1 -#endif - -/* * The structure which the hypervisor knows about - this structure * should not cross a page boundary. The vpa_init/register_vpa call * is now known to fail if the lppaca structure crosses a page diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fd4812329570..0dc95c0aa3be 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1516,7 +1516,8 @@ int start_topology_update(void) { int rc = 0; - if (firmware_has_feature(FW_FEATURE_VPHN) && + /* Disabled until races with load balancing are fixed */ + if (0 && firmware_has_feature(FW_FEATURE_VPHN) && get_lppaca()->shared_proc) { vphn_enabled = 1; setup_cpu_associativity_change_counters(); diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 187a7d32f86a..a3d2ce54ea2e 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, if (!IS_ERR(tmp)) { struct nameidata nd; - ret = path_lookup(tmp, LOOKUP_PARENT, &nd); + ret = kern_path_parent(tmp, &nd); if (!ret) { nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; ret = spufs_create(&nd, flags, mode, neighbor); diff --git a/arch/powerpc/platforms/iseries/dt.c b/arch/powerpc/platforms/iseries/dt.c index fdb7384c0c4f..f0491cc28900 100644 --- a/arch/powerpc/platforms/iseries/dt.c +++ b/arch/powerpc/platforms/iseries/dt.c @@ -242,8 +242,8 @@ static void __init dt_cpus(struct iseries_flat_dt *dt) pft_size[0] = 0; /* NUMA CEC cookie, 0 for non NUMA */ pft_size[1] = __ilog2(HvCallHpt_getHptPages() * HW_PAGE_SIZE); - for (i = 0; i < NR_CPUS; i++) { - if (lppaca_of(i).dyn_proc_status >= 2) + for (i = 0; i < NR_LPPACAS; i++) { + if (lppaca[i].dyn_proc_status >= 2) continue; snprintf(p, 32 - (p - buf), "@%d", i); @@ -251,7 +251,7 @@ static void __init dt_cpus(struct iseries_flat_dt *dt) dt_prop_str(dt, "device_type", device_type_cpu); - index = lppaca_of(i).dyn_hv_phys_proc_index; + index = lppaca[i].dyn_hv_phys_proc_index; d = &xIoHriProcessorVpd[index]; dt_prop_u32(dt, "i-cache-size", d->xInstCacheSize * 1024); diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index b0863410517f..2946ae10fbfd 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -680,6 +680,7 @@ void * __init iSeries_early_setup(void) * on but calling this function multiple times is fine. */ identify_cpu(0, mfspr(SPRN_PVR)); + initialise_paca(&boot_paca, 0); powerpc_firmware_features |= FW_FEATURE_ISERIES; powerpc_firmware_features |= FW_FEATURE_LPAR; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 975613b23dcf..c70e047eed72 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req) #if 0 void mconsole_proc(struct mc_request *req) { - struct nameidata nd; struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; struct file *file; - int n, err; + int n; char *ptr = req->request.data, *buf; mm_segment_t old_fs = get_fs(); ptr += strlen("proc"); ptr = skip_spaces(ptr); - err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd); - if (err) { - mconsole_reply(req, "Failed to look up file", 1, 0); - goto out; - } - - err = may_open(&nd.path, MAY_READ, O_RDONLY); - if (result) { - mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); - goto out; - } - - file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, - current_cred()); - err = PTR_ERR(file); + file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); - path_put(&nd.path); goto out; } diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 646aa78ba5fd..46a823882437 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -62,7 +62,12 @@ int main(int argc, char *argv[]) if (fseek(f, -4L, SEEK_END)) { perror(argv[1]); } - fread(&olen, sizeof olen, 1, f); + + if (fread(&olen, sizeof(olen), 1, f) != 1) { + perror(argv[1]); + return 1; + } + ilen = ftell(f); olen = getle32(&olen); fclose(f); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 518bb99c3394..98d353edfff3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -851,4 +851,6 @@ ia32_sys_call_table: .quad sys_fanotify_init .quad sys32_fanotify_mark .quad sys_prlimit64 /* 340 */ + .quad sys_name_to_handle_at + .quad compat_sys_open_by_handle_at ia32_syscall_end: diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h new file mode 100644 index 000000000000..e656ad8c0a2e --- /dev/null +++ b/arch/x86/include/asm/ce4100.h @@ -0,0 +1,6 @@ +#ifndef _ASM_CE4100_H_ +#define _ASM_CE4100_H_ + +int ce4100_pci_init(void); + +#endif diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..f4c4973fc2ac 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,12 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 343 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..81a3d5b70235 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -669,6 +669,10 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init) __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_name_to_handle_at 303 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 304 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index ce1d54c8a433..3e094af443c3 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -176,7 +176,7 @@ struct bau_msg_payload { struct bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ + unsigned int base_dest_nodeid:15; /* nasid of the */ /* bits 20:6 */ /* first bit in uvhub map */ unsigned int command:8; /* message type */ /* bits 28:21 */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a3c28ae4025b..8508bfe52296 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -287,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set) static inline int HYPERVISOR_sched_op(int cmd, void *arg) { - return _hypercall2(int, sched_op_new, cmd, arg); + return _hypercall2(int, sched_op, cmd, arg); } static inline long @@ -422,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value) #endif static inline int -HYPERVISOR_suspend(unsigned long srec) +HYPERVISOR_suspend(unsigned long start_info_mfn) { - return _hypercall3(int, sched_op, SCHEDOP_shutdown, - SHUTDOWN_suspend, srec); + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; + + /* + * For a PV guest the tools require that the start_info mfn be + * present in rdx/edx when the hypercall is made. Per the + * hypercall calling convention this is the third hypercall + * argument, which is start_info_mfn here. + */ + return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } static inline int diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f25bdf238a33..c61934fbf22a 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -29,8 +29,10 @@ typedef struct xpaddr { /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ #define INVALID_P2M_ENTRY (~0UL) -#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) +#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) /* Maximum amount of memory we can handle in a domain in pages */ #define MAX_DOMAIN_PAGES \ @@ -41,12 +43,18 @@ extern unsigned int machine_to_phys_order; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int m2p_add_override(unsigned long mfn, struct page *page); extern int m2p_remove_override(struct page *page); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); +#ifdef CONFIG_XEN_DEBUG_FS +extern int p2m_dump_show(struct seq_file *m, void *v); +#endif static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; @@ -57,7 +65,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) mfn = get_phys_to_machine(pfn); if (mfn != INVALID_P2M_ENTRY) - mfn &= ~FOREIGN_FRAME_BIT; + mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); return mfn; } @@ -73,25 +81,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + int ret = 0; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; + if (unlikely((mfn >> machine_to_phys_order) != 0)) { + pfn = ~0; + goto try_override; + } pfn = 0; /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ - __get_user(pfn, &machine_to_phys_mapping[mfn]); - - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); +try_override: + /* ret might be < 0 if there are no entries in the m2p for mfn */ + if (ret < 0) + pfn = ~0; + else if (get_phys_to_machine(pfn) != mfn) + /* + * If this appears to be a foreign mfn (because the pfn + * doesn't map back to the mfn), then check the local override + * table to see if there's a better pfn to use. + * + * m2p_find_override_pfn returns ~0 if it doesn't find anything. + */ + pfn = m2p_find_override_pfn(mfn, ~0); + + /* + * pfn is ~0 if there are no entries in the m2p for mfn or if the + * entry doesn't map back to the mfn and m2p_override doesn't have a + * valid entry for it. */ - if (get_phys_to_machine(pfn) != mfn) - pfn = m2p_find_override_pfn(mfn, pfn); + if (pfn == ~0 && + get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) + pfn = mfn; return pfn; } diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 2329b3eaf8d3..aa8620989162 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -27,16 +27,16 @@ static inline void __init xen_setup_pirqs(void) * its own functions. */ struct xen_pci_frontend_ops { - int (*enable_msi)(struct pci_dev *dev, int **vectors); + int (*enable_msi)(struct pci_dev *dev, int vectors[]); void (*disable_msi)(struct pci_dev *dev); - int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); + int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec); void (*disable_msix)(struct pci_dev *dev); }; extern struct xen_pci_frontend_ops *xen_pci_frontend; static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, - int **vectors) + int vectors[]) { if (xen_pci_frontend && xen_pci_frontend->enable_msi) return xen_pci_frontend->enable_msi(dev, vectors); @@ -48,7 +48,7 @@ static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) xen_pci_frontend->disable_msi(dev); } static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, - int **vectors, int nvec) + int vectors[], int nvec) { if (xen_pci_frontend && xen_pci_frontend->enable_msix) return xen_pci_frontend->enable_msix(dev, vectors, nvec); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 13a389179514..452932d34730 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -106,8 +106,8 @@ void __init setup_bios_corruption_check(void) addr += size; } - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); + if (num_scan_areas) + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); } @@ -143,12 +143,12 @@ static void check_corruption(struct work_struct *dummy) { check_for_bios_corruption(); schedule_delayed_work(&bios_check_work, - round_jiffies_relative(corruption_check_period*HZ)); + round_jiffies_relative(corruption_check_period*HZ)); } static int start_periodic_check_for_corruption(void) { - if (!memory_corruption_check || corruption_check_period == 0) + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) return 0; printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 4f6f679f2799..4a5a42b842ad 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -195,7 +195,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) cmd_incomplete: iowrite16(0, &pcch_hdr->status); spin_unlock(&pcc_lock); - return -EINVAL; + return 0; } static int pcc_cpufreq_target(struct cpufreq_policy *policy, diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..c314b2199efd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,5 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d90ceb882a4..20e3f8702d1e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,15 +229,14 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { - - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; pmd_t *ret; + /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -247,7 +246,7 @@ void vmalloc_sync_all(void) if (!ret) break; } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -828,6 +827,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + out_of_memory(regs, error_code, address); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a59296af80..c14a5422e152 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -105,18 +105,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) for (address = start; address <= end; address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); @@ -128,7 +128,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(pgt_lock); } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 95ea1551eebc..1337c51b07d7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -780,11 +780,7 @@ void __cpuinit numa_add_cpu(int cpu) int physnid; int nid = NUMA_NO_NODE; - apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - if (apicid != BAD_APICID) - nid = apicid_to_node[apicid]; - if (nid == NUMA_NO_NODE) - nid = early_cpu_to_node(cpu); + nid = early_cpu_to_node(cpu); BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index d343b3c81f3c..90825f2eb0f4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -57,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { - unsigned long flags; - /* Protect against CPA */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); direct_pages_count[level] += pages; - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } static void split_page_count(int level) @@ -394,7 +392,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -403,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, if (cpa->force_split) return 1; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: @@ -498,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, } out_unlock: - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return do_split; } static int split_large_page(pte_t *kpte, unsigned long address) { - unsigned long flags, pfn, pfninc = 1; + unsigned long pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; @@ -519,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!base) return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: @@ -591,7 +589,7 @@ out_unlock: */ if (base) __free_page(base); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return 0; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 500242d3c96d..0113d19c8aa6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) static void pgd_dtor(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ - if (SHARED_KERNEL_PMD) return; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -260,7 +258,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - unsigned long flags; pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); @@ -280,12 +277,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return pgd; diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 85b68ef5e809..9260b3eb18d4 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -34,6 +34,7 @@ #include <linux/pci.h> #include <linux/init.h> +#include <asm/ce4100.h> #include <asm/pci_x86.h> struct sim_reg { @@ -306,10 +307,10 @@ struct pci_raw_ops ce4100_pci_conf = { .write = ce4100_conf_write, }; -static int __init ce4100_pci_init(void) +int __init ce4100_pci_init(void) { init_sim_regs(); raw_pci_ops = &ce4100_pci_conf; - return 0; + /* Indicate caller that it should invoke pci_legacy_init() */ + return 1; } -subsys_initcall(ce4100_pci_init); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 25cd4a07d09f..8c4085a95ef1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -20,7 +20,8 @@ #include <asm/xen/pci.h> #ifdef CONFIG_ACPI -static int xen_hvm_register_pirq(u32 gsi, int triggering) +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) { int rc, irq; struct physdev_map_pirq map_irq; @@ -41,7 +42,7 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return -1; } - if (triggering == ACPI_EDGE_SENSITIVE) { + if (trigger == ACPI_EDGE_SENSITIVE) { shareable = 0; name = "ioapic-edge"; } else { @@ -55,12 +56,6 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return irq; } - -static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, - int trigger, int polarity) -{ - return xen_hvm_register_pirq(gsi, trigger); -} #endif #if defined(CONFIG_PCI_MSI) @@ -91,7 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, pirq, ret = 0; + int irq, pirq; struct msi_desc *msidesc; struct msi_msg msg; @@ -99,39 +94,32 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) __read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); - if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); - if (irq < 0) + if (msg.data != XEN_PIRQ_MSI_DATA || + xen_irq_from_pirq(pirq) < 0) { + pirq = xen_allocate_pirq_msi(dev, msidesc); + if (pirq < 0) goto error; - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" - " pirq=%d\n", irq, pirq); - return 0; + xen_msi_compose_msg(dev, pirq, &msg); + __write_msi_msg(msidesc, &msg); + dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq); + } else { + dev_dbg(&dev->dev, + "xen: msi already bound to pirq=%d\n", pirq); } - xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); - if (irq < 0 || pirq < 0) + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (irq < 0) goto error; - printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); - xen_msi_compose_msg(dev, pirq, &msg); - ret = set_irq_msi(irq, msidesc); - if (ret < 0) - goto error_while; - write_msi_msg(irq, &msg); + dev_dbg(&dev->dev, + "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); - - return ret; + dev_err(&dev->dev, + "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + return -ENODEV; } /* @@ -150,35 +138,26 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return -ENOMEM; if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + ret = xen_pci_frontend_enable_msix(dev, v, nvec); else - ret = xen_pci_frontend_enable_msi(dev, &v); + ret = xen_pci_frontend_enable_msi(dev, v); if (ret) goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_allocate_pirq(v[i], 0, /* not sharable */ - (type == PCI_CAP_ID_MSIX) ? - "pcifront-msi-x" : "pcifront-msi"); - if (irq < 0) { - ret = -1; + irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : + "pcifront-msi"); + if (irq < 0) goto free; - } - - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error_while; i++; } kfree(v); return 0; -error_while: - unbind_from_irqhandler(irq, NULL); error: - if (ret == -ENODEV) - dev_err(&dev->dev, "Xen PCI frontend has not registered" \ - " MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n"); free: kfree(v); return ret; @@ -193,6 +172,9 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); + + /* Free the IRQ's and the msidesc using the generic code. */ + default_teardown_msi_irqs(dev); } static void xen_teardown_msi_irq(unsigned int irq) @@ -200,47 +182,82 @@ static void xen_teardown_msi_irq(unsigned int irq) xen_destroy_irq(irq); } +#ifdef CONFIG_XEN_DOM0 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret; + int ret = 0; struct msi_desc *msidesc; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = xen_create_msi_irq(dev, msidesc, type); - if (irq < 0) - return -1; + struct physdev_map_pirq map_irq; - ret = set_irq_msi(irq, msidesc); - if (ret) - goto error; - } - return 0; + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; -error: - xen_destroy_irq(irq); + if (type == PCI_CAP_ID_MSIX) { + int pos; + u32 table_offset, bir; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (ret) { + dev_warn(&dev->dev, "xen map irq failed %d\n", ret); + goto out; + } + + ret = xen_bind_pirq_msi_to_irq(dev, msidesc, + map_irq.pirq, map_irq.index, + (type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi"); + if (ret < 0) + goto out; + } + ret = 0; +out: return ret; } #endif +#endif static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + u8 gsi; - dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); - - if (dev->irq < 0) - return -EINVAL; + rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n", + rc); + return rc; + } - if (dev->irq < NR_IRQS_LEGACY) + if (gsi < NR_IRQS_LEGACY) share = 0; - rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + rc = xen_allocate_pirq(gsi, share, "pcifront"); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", - dev->irq, rc); + dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n", + gsi, rc); return rc; } + + dev->irq = rc; + dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq); return 0; } diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index d2c0d51a7178..cd6f184c3b3f 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -15,6 +15,7 @@ #include <linux/serial_reg.h> #include <linux/serial_8250.h> +#include <asm/ce4100.h> #include <asm/setup.h> #include <asm/io.h> @@ -129,4 +130,5 @@ void __init x86_ce4100_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; x86_init.mpparse.find_smp_config = sdv_find_smp_config; + x86_init.pci.init = ce4100_pci_init; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index df58e9cad96a..a7b38d35c29a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1364,11 +1364,11 @@ uv_activation_descriptor_init(int node, int pnode) memset(bd2, 0, sizeof(struct bau_desc)); bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub + * base_dest_nodeid is the nasid of the first uvhub * in the partition. The bit map will indicate uvhub numbers, * which are 0-N in a partition. Pnodes are unique system-wide. */ - bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); bd2->header.dest_subnodeid = 0x10; /* the LB */ bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.int_both = 1; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 5b54892e4bc3..e4343fe488ed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -48,3 +48,11 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + +config XEN_DEBUG + bool "Enable Xen debug checks" + depends on XEN + default n + help + Enable various WARN_ON checks in the Xen MMU code. + Enabling this option WILL incur a significant performance overhead. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50542efe45fb..49dbd78ec3cb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1284,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor) xen_setup_features(); - pv_info = xen_info; - pv_info.kernel_rpl = 0; + pv_info.name = "Xen HVM"; xen_domain_type = XEN_HVM_DOMAIN; return 0; } -void xen_hvm_init_shared_info(void) +void __ref xen_hvm_init_shared_info(void) { int cpu; struct xen_add_to_physmap xatp; @@ -1331,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + if (xen_have_vector_callback) + xen_init_lock_cpu(cpu); break; default: break; @@ -1355,6 +1356,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; + xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad574..832765c0fb8c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/seq_file.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -416,8 +417,12 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - unsigned long mfn = pfn_to_mfn(pfn); + unsigned long mfn; + if (!xen_feature(XENFEAT_auto_translated_physmap)) + mfn = get_phys_to_machine(pfn); + else + mfn = pfn; /* * If there's no mfn for the pfn, then just create an * empty non-present pte. Unfortunately this loses @@ -427,8 +432,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (unlikely(mfn == INVALID_P2M_ENTRY)) { mfn = 0; flags = 0; + } else { + /* + * Paramount to do this test _after_ the + * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY & + * IDENTITY_FRAME_BIT resolves to true. + */ + mfn &= ~FOREIGN_FRAME_BIT; + if (mfn & IDENTITY_FRAME_BIT) { + mfn &= ~IDENTITY_FRAME_BIT; + flags |= _PAGE_IOMAP; + } } - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } @@ -532,6 +547,41 @@ pte_t xen_make_pte(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); +#ifdef CONFIG_XEN_DEBUG +pte_t xen_make_pte_debug(pteval_t pte) +{ + phys_addr_t addr = (pte & PTE_PFN_MASK); + phys_addr_t other_addr; + bool io_page = false; + pte_t _pte; + + if (pte & _PAGE_IOMAP) + io_page = true; + + _pte = xen_make_pte(pte); + + if (!addr) + return _pte; + + if (io_page && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { + other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; + WARN(addr != other_addr, + "0x%lx is using VM_IO, but it is 0x%lx!\n", + (unsigned long)addr, (unsigned long)other_addr); + } else { + pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; + other_addr = (_pte.pte & PTE_PFN_MASK); + WARN((addr == other_addr) && (!io_page) && (!iomap_set), + "0x%lx is missing VM_IO (and wasn't fixed)!\n", + (unsigned long)addr); + } + + return _pte; +} +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); +#endif + pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); @@ -986,10 +1036,9 @@ static void xen_pgd_pin(struct mm_struct *mm) */ void xen_mm_pin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { @@ -998,7 +1047,7 @@ void xen_mm_pin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -1099,10 +1148,9 @@ static void xen_pgd_unpin(struct mm_struct *mm) */ void xen_mm_unpin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { @@ -1112,7 +1160,7 @@ void xen_mm_unpin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -1942,6 +1990,9 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { +#ifdef CONFIG_XEN_DEBUG + pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); +#endif pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; @@ -2074,7 +2125,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, in_frames[i] = virt_to_mfn(vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); if (out_frames) out_frames[i] = virt_to_pfn(vaddr); @@ -2353,6 +2404,18 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); #ifdef CONFIG_XEN_DEBUG_FS +static int p2m_dump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, p2m_dump_show, NULL); +} + +static const struct file_operations p2m_dump_fops = { + .open = p2m_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *d_mmu_debug; static int __init xen_mmu_debugfs(void) @@ -2408,6 +2471,7 @@ static int __init xen_mmu_debugfs(void) debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, &mmu_stats.prot_commit_batched); + debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); return 0; } fs_initcall(xen_mmu_debugfs); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fd12d7ce7ff9..215a3ce61068 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -23,6 +23,129 @@ * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to * 512 and 1024 entries respectively. + * + * In short, these structures contain the Machine Frame Number (MFN) of the PFN. + * + * However not all entries are filled with MFNs. Specifically for all other + * leaf entries, or for the top root, or middle one, for which there is a void + * entry, we assume it is "missing". So (for example) + * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. + * + * We also have the possibility of setting 1-1 mappings on certain regions, so + * that: + * pfn_to_mfn(0xc0000)=0xc0000 + * + * The benefit of this is, that we can assume for non-RAM regions (think + * PCI BARs, or ACPI spaces), we can create mappings easily b/c we + * get the PFN value to match the MFN. + * + * For this to work efficiently we have one new page p2m_identity and + * allocate (via reserved_brk) any other pages we need to cover the sides + * (1GB or 4MB boundary violations). All entries in p2m_identity are set to + * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, + * no other fancy value). + * + * On lookup we spot that the entry points to p2m_identity and return the + * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. + * If the entry points to an allocated page, we just proceed as before and + * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in + * appropriate functions (pfn_to_mfn). + * + * The reason for having the IDENTITY_FRAME_BIT instead of just returning the + * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a + * non-identity pfn. To protect ourselves against we elect to set (and get) the + * IDENTITY_FRAME_BIT on all identity mapped PFNs. + * + * This simplistic diagram is used to explain the more subtle piece of code. + * There is also a digram of the P2M at the end that can help. + * Imagine your E820 looking as so: + * + * 1GB 2GB + * /-------------------+---------\/----\ /----------\ /---+-----\ + * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | + * \-------------------+---------/\----/ \----------/ \---+-----/ + * ^- 1029MB ^- 2001MB + * + * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), + * 2048MB = 524288 (0x80000)] + * + * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB + * is actually not present (would have to kick the balloon driver to put it in). + * + * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: + * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start + * of the PFN and the end PFN (263424 and 512256 respectively). The first step + * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page + * covers 512^2 of page estate (1GB) and in case the start or end PFN is not + * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn + * to end pfn. We reserve_brk top leaf pages if they are missing (means they + * point to p2m_mid_missing). + * + * With the E820 example above, 263424 is not 1GB aligned so we allocate a + * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. + * Each entry in the allocate page is "missing" (points to p2m_missing). + * + * Next stage is to determine if we need to do a more granular boundary check + * on the 4MB (or 2MB depending on architecture) off the start and end pfn's. + * We check if the start pfn and end pfn violate that boundary check, and if + * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer + * granularity of setting which PFNs are missing and which ones are identity. + * In our example 263424 and 512256 both fail the check so we reserve_brk two + * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" + * values) and assign them to p2m[1][2] and p2m[1][488] respectively. + * + * At this point we would at minimum reserve_brk one page, but could be up to + * three. Each call to set_phys_range_identity has at maximum a three page + * cost. If we were to query the P2M at this stage, all those entries from + * start PFN through end PFN (so 1029MB -> 2001MB) would return + * INVALID_P2M_ENTRY ("missing"). + * + * The next step is to walk from the start pfn to the end pfn setting + * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. + * If we find that the middle leaf is pointing to p2m_missing we can swap it + * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this + * point we do not need to worry about boundary aligment (so no need to + * reserve_brk a middle page, figure out which PFNs are "missing" and which + * ones are identity), as that has been done earlier. If we find that the + * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference + * that page (which covers 512 PFNs) and set the appropriate PFN with + * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we + * set from p2m[1][2][256->511] and p2m[1][488][0->256] with + * IDENTITY_FRAME_BIT set. + * + * All other regions that are void (or not filled) either point to p2m_missing + * (considered missing) or have the default value of INVALID_P2M_ENTRY (also + * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] + * contain the INVALID_P2M_ENTRY value and are considered "missing." + * + * This is what the p2m ends up looking (for the E820 above) with this + * fabulous drawing: + * + * p2m /--------------\ + * /-----\ | &mfn_list[0],| /-----------------\ + * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | + * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | + * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | + * |-----| \ | [p2m_identity]+\\ | .... | + * | 2 |--\ \-------------------->| ... | \\ \----------------/ + * |-----| \ \---------------/ \\ + * | 3 |\ \ \\ p2m_identity + * |-----| \ \-------------------->/---------------\ /-----------------\ + * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... | + * \-----/ / | [p2m_identity]+-->| ..., ~0 | + * / /---------------\ | .... | \-----------------/ + * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. | + * / | IDENTITY[@256]|<----/ \---------------/ + * / | ~0, ~0, .... | + * | \---------------/ + * | + * p2m_missing p2m_missing + * /------------------\ /------------\ + * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | + * | [p2m_mid_missing]+---->| ..., ~0 | + * \------------------/ \------------/ + * + * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ #include <linux/init.h> @@ -30,6 +153,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/sched.h> +#include <linux/seq_file.h> #include <asm/cache.h> #include <asm/setup.h> @@ -59,9 +183,15 @@ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); + RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +/* We might hit two boundary violations at the start and end, at max each + * boundary violation will require three middle nodes. */ +RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -136,7 +266,7 @@ static void p2m_init(unsigned long *p2m) * - After resume we're called from within stop_machine, but the mfn * tree should alreay be completely allocated. */ -void xen_build_mfn_list_list(void) +void __ref xen_build_mfn_list_list(void) { unsigned long pfn; @@ -221,6 +351,9 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_init(p2m_top); + p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_identity); + /* * The domain builder gives us a pre-constructed p2m array in * mfn_list for all the pages initially given to us, so we just @@ -266,6 +399,14 @@ unsigned long get_phys_to_machine(unsigned long pfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* + * The INVALID_P2M_ENTRY is filled in both p2m_*identity + * and in p2m_*missing, so returning the INVALID_P2M_ENTRY + * would be wrong. + */ + if (p2m_top[topidx][mididx] == p2m_identity) + return IDENTITY_FRAME(pfn); + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); @@ -335,9 +476,11 @@ static bool alloc_p2m(unsigned long pfn) p2m_top_mfn_p[topidx] = mid_mfn; } - if (p2m_top[topidx][mididx] == p2m_missing) { + if (p2m_top[topidx][mididx] == p2m_identity || + p2m_top[topidx][mididx] == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; + unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) @@ -345,7 +488,7 @@ static bool alloc_p2m(unsigned long pfn) p2m_init(p2m); - if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) free_p2m_page(p2m); else mid_mfn[mididx] = virt_to_mfn(p2m); @@ -354,11 +497,91 @@ static bool alloc_p2m(unsigned long pfn) return true; } +bool __early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx, idx; + + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + idx = p2m_index(pfn); + + /* Pfff.. No boundary cross-over, lets get out. */ + if (!idx) + return false; + + WARN(p2m_top[topidx][mididx] == p2m_identity, + "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", + topidx, mididx); + + /* + * Could be done by xen_build_dynamic_phys_to_machine.. + */ + if (p2m_top[topidx][mididx] != p2m_missing) + return false; + + /* Boundary cross-over for the edges: */ + if (idx) { + unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_init(p2m); + + p2m_top[topidx][mididx] = p2m; + + } + return idx != 0; +} +unsigned long set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e) +{ + unsigned long pfn; + + if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN)) + return 0; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn_e - pfn_s; + + if (pfn_s > pfn_e) + return 0; + + for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1)); + pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); + pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) + { + unsigned topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } + } + + __early_alloc_p2m(pfn_s); + __early_alloc_p2m(pfn_e); + + for (pfn = pfn_s; pfn < pfn_e; pfn++) + if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) + break; + + if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + "Identity mapping failed. We are %ld short of 1-1 mappings!\n", + (pfn_e - pfn_s) - (pfn - pfn_s))) + printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + + return pfn - pfn_s; +} + /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, mididx, idx; + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -368,6 +591,21 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); + /* For sparse holes were the p2m leaf has real PFN along with + * PCI holes, stick in the PFN as the MFN value. + */ + if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { + if (p2m_top[topidx][mididx] == p2m_identity) + return true; + + /* Swap over from MISSING to IDENTITY if needed. */ + if (p2m_top[topidx][mididx] == p2m_missing) { + WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, + p2m_identity) != p2m_missing); + return true; + } + } + if (p2m_top[topidx][mididx] == p2m_missing) return mfn == INVALID_P2M_ENTRY; @@ -378,11 +616,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return true; - } - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!alloc_p2m(pfn)) return false; @@ -421,7 +654,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) { unsigned long flags; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -455,7 +688,7 @@ int m2p_remove_override(struct page *page) unsigned long flags; unsigned long mfn; unsigned long pfn; - unsigned long address; + unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; @@ -520,3 +753,80 @@ unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) return ret; } EXPORT_SYMBOL_GPL(m2p_find_override_pfn); + +#ifdef CONFIG_XEN_DEBUG_FS + +int p2m_dump_show(struct seq_file *m, void *v) +{ + static const char * const level_name[] = { "top", "middle", + "entry", "abnormal" }; + static const char * const type_name[] = { "identity", "missing", + "pfn", "abnormal"}; +#define TYPE_IDENTITY 0 +#define TYPE_MISSING 1 +#define TYPE_PFN 2 +#define TYPE_UNKNOWN 3 + unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0; + unsigned int uninitialized_var(prev_level); + unsigned int uninitialized_var(prev_type); + + if (!p2m_top) + return 0; + + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned idx = p2m_index(pfn); + unsigned lvl, type; + + lvl = 4; + type = TYPE_UNKNOWN; + if (p2m_top[topidx] == p2m_mid_missing) { + lvl = 0; type = TYPE_MISSING; + } else if (p2m_top[topidx] == NULL) { + lvl = 0; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == NULL) { + lvl = 1; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx] == p2m_identity) { + lvl = 1; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx] == p2m_missing) { + lvl = 1; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == 0) { + lvl = 2; type = TYPE_UNKNOWN; + } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) { + lvl = 2; type = TYPE_IDENTITY; + } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) { + lvl = 2; type = TYPE_MISSING; + } else if (p2m_top[topidx][mididx][idx] == pfn) { + lvl = 2; type = TYPE_PFN; + } else if (p2m_top[topidx][mididx][idx] != pfn) { + lvl = 2; type = TYPE_PFN; + } + if (pfn == 0) { + prev_level = lvl; + prev_type = type; + } + if (pfn == MAX_DOMAIN_PAGES-1) { + lvl = 3; + type = TYPE_UNKNOWN; + } + if (prev_type != type) { + seq_printf(m, " [0x%lx->0x%lx] %s\n", + prev_pfn_type, pfn, type_name[prev_type]); + prev_pfn_type = pfn; + prev_type = type; + } + if (prev_level != lvl) { + seq_printf(m, " [0x%lx->0x%lx] level %s\n", + prev_pfn_level, pfn, level_name[prev_level]); + prev_pfn_level = pfn; + prev_level = lvl; + } + } + return 0; +#undef TYPE_IDENTITY +#undef TYPE_MISSING +#undef TYPE_PFN +#undef TYPE_UNKNOWN +} +#endif diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a8a66a50d446..fa0269a99377 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,6 +52,8 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { + unsigned long pfn; + u64 size = (u64)pages * PAGE_SIZE; u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; @@ -66,6 +68,9 @@ static __init void xen_add_extra_mem(unsigned long pages) xen_extra_mem_size += size; xen_max_p2m_pfn = PFN_DOWN(extra_start + size); + + for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -104,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr, WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", start, end, ret); if (ret == 1) { - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); len++; } } @@ -138,12 +143,55 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, return released; } +static unsigned long __init xen_set_identity(const struct e820entry *list, + ssize_t map_size) +{ + phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS; + phys_addr_t start_pci = last; + const struct e820entry *entry; + unsigned long identity = 0; + int i; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + phys_addr_t start = entry->addr; + phys_addr_t end = start + entry->size; + + if (start < last) + start = last; + + if (end <= start) + continue; + + /* Skip over the 1MB region. */ + if (last > end) + continue; + + if (entry->type == E820_RAM) { + if (start > start_pci) + identity += set_phys_range_identity( + PFN_UP(start_pci), PFN_DOWN(start)); + + /* Without saving 'last' we would gooble RAM too + * at the end of the loop. */ + last = end; + start_pci = end; + continue; + } + start_pci = min(start, start_pci); + last = end; + } + if (last > start_pci) + identity += set_phys_range_identity( + PFN_UP(start_pci), PFN_DOWN(last)); + return identity; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; + static struct e820entry map_raw[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; unsigned long long mem_end; @@ -151,6 +199,7 @@ char * __init xen_memory_setup(void) struct xen_memory_map memmap; unsigned long extra_pages = 0; unsigned long extra_limit; + unsigned long identity_pages = 0; int i; int op; @@ -176,6 +225,7 @@ char * __init xen_memory_setup(void) } BUG_ON(rc); + memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { @@ -194,6 +244,15 @@ char * __init xen_memory_setup(void) end -= delta; extra_pages += PFN_DOWN(delta); + /* + * Set RAM below 4GB that is not for us to be unusable. + * This prevents "System RAM" address space from being + * used as potential resource for I/O address (happens + * when 'allocate_resource' is called). + */ + if (delta && + (xen_initial_domain() && end < 0x100000000ULL)) + e820_add_region(end, delta, E820_UNUSABLE); } if (map[i].size > 0 && end > xen_extra_mem_start) @@ -251,6 +310,13 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(extra_pages); + /* + * Set P2M for all non-RAM pages and E820 gaps to be identity + * type PFNs. We supply it with the non-sanitized version + * of the E820. + */ + identity_pages = xen_set_identity(map_raw, memmap.nr_entries); + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages); return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 72a4c7959045..30612441ed99 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -509,3 +509,41 @@ void __init xen_smp_init(void) xen_fill_possible_map(); xen_init_spinlocks(); } + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + if (!xen_have_vector_callback) + return; + xen_init_lock_cpu(0); + xen_init_spinlocks(); +} + +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +{ + int rc; + rc = native_cpu_up(cpu); + WARN_ON (xen_smp_intr_init(cpu)); + return rc; +} + +static void xen_hvm_cpu_die(unsigned int cpu) +{ + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + native_cpu_die(cpu); +} + +void __init xen_hvm_smp_init(void) +{ + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_up = xen_hvm_cpu_up; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; +} diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 9bbd63a129b5..45329c8c226e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -12,7 +12,7 @@ #include "xen-ops.h" #include "mmu.h" -void xen_pre_suspend(void) +void xen_arch_pre_suspend(void) { xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = @@ -26,8 +26,9 @@ void xen_pre_suspend(void) BUG(); } -void xen_hvm_post_suspend(int suspend_cancelled) +void xen_arch_hvm_post_suspend(int suspend_cancelled) { +#ifdef CONFIG_XEN_PVHVM int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); @@ -37,9 +38,10 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_setup_runstate_info(cpu); } } +#endif } -void xen_post_suspend(int suspend_cancelled) +void xen_arch_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 067759e3d6a5..2e2d370a47b1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -397,7 +397,9 @@ void xen_setup_timer(int cpu) name = "<timer kasprintf failed>"; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, + IRQF_DISABLED|IRQF_PERCPU| + IRQF_NOBALANCING|IRQF_TIMER| + IRQF_FORCE_RESUME, name, NULL); evt = &per_cpu(xen_clock_events, cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9d41bf985757..3112f55638c4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -64,10 +64,12 @@ void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_hvm_smp_init(void); extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {} #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS |