From 3f3aaea29ff7ee2d43b430338427f30ba7f60ff9 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 30 Mar 2012 11:45:01 -0400
Subject: xen/p2m: Move code around to allow for better re-usage.

We are going to be using the early_alloc_p2m (and early_alloc_p2m_middle) code in follow-up patches which are not related to setting identity pages. Hence let's move the code out into its own functions and rename them as appropriate.

Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/p2m.c | 62 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 28 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 1b267e75158d..3cc3afeb09a1 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init __early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx, mididx, idx;
@@ -541,6 +541,36 @@ static bool __init __early_alloc_p2m(unsigned long pfn) } return idx != 0; } + +static bool __init early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in early_alloc_p2m_middle */ + } + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) {
@@ -559,35 +589,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; - unsigned long **mid; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in __early_alloc_p2m */ - } + WARN_ON(!early_alloc_p2m(pfn)); } - __early_alloc_p2m(pfn_s); - __early_alloc_p2m(pfn_e); + early_alloc_p2m_middle(pfn_s); + early_alloc_p2m_middle(pfn_e); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
-- cgit v1.2.1

From cef4cca551d652b7f69c9d76337c5fae24e069dc Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 30 Mar 2012 14:15:14 -0400
Subject: xen/p2m: Allow alloc_p2m_middle to call reserve_brk depending on argument

For identity cases we want to call reserve_brk only on the boundary conditions of the middle P2M (so P2M[x][y][0] = extend_brk). This is to work around identity regions (PCI spaces, gaps in E820) which are not aligned on 2MB regions.
However, for the case where we want to allocate P2M middle leaves at the early bootup stage, regardless of this alignment check, we need some means of doing that. For that we provide the new argument.

Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/xen/p2m.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 3cc3afeb09a1..8b3a3958d120 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx;
@@ -508,7 +508,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) idx = p2m_index(pfn); /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx) + if (!idx && check_boundary) return false; WARN(p2m_top[topidx][mididx] == p2m_identity,
@@ -531,7 +531,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn p2m_top[topidx][mididx] = p2m; /* For save/restore we need to MFN of the P2M saved */ - + mid_mfn_p = p2m_top_mfn_p[topidx]; WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
@@ -592,8 +592,8 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, WARN_ON(!early_alloc_p2m(pfn)); } - early_alloc_p2m_middle(pfn_s); - early_alloc_p2m_middle(pfn_e); + early_alloc_p2m_middle(pfn_s, true); + early_alloc_p2m_middle(pfn_e, true); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
-- cgit v1.2.1

From d5096850b47424fb0f1c6a75b8f7184f7169319a Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 30 Mar 2012 14:16:49 -0400
Subject: xen/p2m: Collapse early_alloc_p2m_middle redundant checks.

At the start of the function we were checking for idx != 0 and bailing out, and later calling extend_brk only if idx != 0. That is unnecessary, so remove those checks.
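(A note for readers following this series: the topidx/mididx/idx triple above is just a pfn split across the three P2M levels. The standalone sketch below models that arithmetic for the 64-bit layout, assuming 512 entries per page; the helper names mirror p2m.c, but the program itself is only an illustration.)

#include <stdio.h>

/* Illustrative x86-64 values: PAGE_SIZE / sizeof(unsigned long) = 512. */
#define P2M_PER_PAGE     512UL
#define P2M_MID_PER_PAGE 512UL

static unsigned long p2m_top_index(unsigned long pfn)
{
        return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static unsigned long p2m_mid_index(unsigned long pfn)
{
        return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}

static unsigned long p2m_index(unsigned long pfn)
{
        return pfn % P2M_PER_PAGE;
}

int main(void)
{
        /* A pfn inside a typical PCI hole: not aligned to a 512-pfn leaf. */
        unsigned long pfn = 0xa1234;

        printf("pfn %#lx -> top %lu, mid %lu, idx %lu\n", pfn,
               p2m_top_index(pfn), p2m_mid_index(pfn), p2m_index(pfn));
        /* idx != 0 is what early_alloc_p2m_middle() treats as a boundary
         * cross-over: the range starts or ends inside a leaf page. */
        return 0;
}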
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b3a3958d120..952edefcedb3 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -502,6 +502,8 @@ static bool alloc_p2m(unsigned long pfn) static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; + unsigned long *p2m; + unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -522,24 +524,21 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return false; /* Boundary cross-over for the edges: */ - if (idx) { - unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - unsigned long *mid_mfn_p; + p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m); + p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; + p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ + /* For save/restore we need to MFN of the P2M saved */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); - } - return idx != 0; + return true; } static bool __init early_alloc_p2m(unsigned long pfn) -- cgit v1.2.1 From 940713bb2ce3033f468a220094a07250a2f69bdd Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 14:33:14 -0400 Subject: xen/p2m: An early bootup variant of set_phys_to_machine During early bootup we can't use alloc_page, so to allocate leaf pages in the P2M we need to use extend_brk. For that we are utilizing the early_alloc_p2m and early_alloc_p2m_middle functions to do the job for us. This function follows the same logic as set_phys_to_machine. 
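(extend_brk() is usable this early because it is essentially a bump allocator over a region reserved in the kernel image's brk section. A toy user-space model of the idea, with invented names and sizes, purely to make the allocation pattern concrete:)

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Toy stand-in for the brk region reserved at link time. */
static char brk_area[16 * 4096] __attribute__((aligned(4096)));
static char *brk_end = brk_area;

static void *toy_extend_brk(size_t size, size_t align)
{
        /* Round the current end up to the requested alignment... */
        uintptr_t p = ((uintptr_t)brk_end + align - 1) & ~(align - 1);

        /* ...and bump it; there is no free(), exactly like the real one. */
        assert(p + size <= (uintptr_t)brk_area + sizeof(brk_area));
        brk_end = (char *)(p + size);
        memset((void *)p, 0, size);     /* the kernel's extend_brk() also zeroes */
        return (void *)p;
}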
Signed-off-by: Konrad Rzeszutek Wilk
---
 arch/x86/include/asm/xen/page.h | 1 +
 arch/x86/xen/p2m.c | 15 +++++++++++++++
 2 files changed, 16 insertions(+)
(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c34f96c2f7a0..93971e841dd5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -44,6 +44,7 @@ extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e);

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 952edefcedb3..ffd08c414e91 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -570,6 +570,21 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!early_alloc_p2m(pfn)) + return false; + + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) {
-- cgit v1.2.1

From 83c529151ab0d4a813e3f6a3e293fff75d468519 Mon Sep 17 00:00:00 2001
From: "Liu, Jinsong"
Date: Tue, 28 Feb 2012 05:15:46 +0000
Subject: KVM: x86: expose Intel cpu new features (HLE, RTM) to guest

Intel recently released two new features, HLE and RTM. Refer to http://software.intel.com/file/41417. This patch exposes them to the guest.

Signed-off-by: Liu, Jinsong
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad6..c2134b881033 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu();
-- cgit v1.2.1

From 675acb758ab2381c72fe3ceb5c091cbd0879d4dd Mon Sep 17 00:00:00 2001
From: Jason Wang
Date: Thu, 8 Mar 2012 18:07:56 +0800
Subject: KVM: SVM: count all irq windows exit

Also count the exits of the fast path.
Signed-off-by: Jason Wang
Acked-by: Joerg Roedel
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c75..f3167208562e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3240,6 +3240,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); + ++svm->vcpu.stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as * possible
@@ -3247,7 +3248,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) if (!irqchip_in_kernel(svm->vcpu.kvm) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { - ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; }
-- cgit v1.2.1

From b6d33834bd4e8bdf4a199812e31b3e36da53c794 Mon Sep 17 00:00:00 2001
From: Christoffer Dall
Date: Thu, 8 Mar 2012 16:44:24 -0500
Subject: KVM: Factor out kvm_vcpu_kick to arch-generic code

The kvm_vcpu_kick function performs roughly the same functionality on almost all architectures, so we shouldn't have separate copies. PowerPC keeps a pointer to interchanging waitqueues on the vcpu_arch structure and to accommodate this special need a __KVM_HAVE_ARCH_VCPU_GET_WQ define and accompanying function kvm_arch_vcpu_wq have been defined. For all other architectures this is a generic inline that just returns &vcpu->wq.

Acked-by: Scott Wood
Signed-off-by: Christoffer Dall
Signed-off-by: Marcelo Tosatti
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/x86.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4044ce0bf7c1..511031dcb9cc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6403,21 +6403,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) kvm_cpu_has_interrupt(vcpu)); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) - smp_send_reschedule(cpu); - put_cpu(); + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-- cgit v1.2.1

From eae3ee7d8a7c59cf63441dedf28674889f5fc477 Mon Sep 17 00:00:00 2001
From: Eric B Munson
Date: Sat, 10 Mar 2012 14:37:25 -0500
Subject: x86: pvclock: Add flag to indicate that a vm was stopped by the host

This flag will be used to check if the vm was stopped by the host when a soft lockup was detected. The host will set the flag when it stops the guest. On resume, the guest will check this flag if a soft lockup is detected and skip issuing the warning.
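(A minimal model of the protocol this flag enables, assuming the simplest reading of the commit message: the host sets the bit in the shared pvclock page, and the guest's softlockup path tests and clears it instead of warning. This is a single-threaded toy, not the kernel's per-cpu version:)

#include <stdio.h>

#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
#define PVCLOCK_GUEST_STOPPED  (1 << 1)

/* Toy stand-in for the guest's pvclock flags byte. */
static unsigned char hv_clock_flags;

/* Mirrors the semantics of the later kvm_check_and_clear_guest_paused():
 * report whether the host set GUEST_STOPPED, clearing the bit if so. */
static int check_and_clear_guest_paused(void)
{
        if (hv_clock_flags & PVCLOCK_GUEST_STOPPED) {
                hv_clock_flags &= ~PVCLOCK_GUEST_STOPPED;
                return 1;
        }
        return 0;
}

int main(void)
{
        hv_clock_flags |= PVCLOCK_GUEST_STOPPED;  /* host pauses the VM */
        if (check_and_clear_guest_paused())
                printf("stall was a host-side stop; skip the softlockup warning\n");
        return 0;
}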
Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock-abi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 35f2d1948ada..6167fd798188 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -40,5 +40,6 @@ struct pvclock_wall_clock { } __attribute__((__packed__)); #define PVCLOCK_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_GUEST_STOPPED (1 << 1) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ -- cgit v1.2.1 From 3b5d56b9317fa7b5407dff1aa7b115bf6cdbd494 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Sat, 10 Mar 2012 14:37:26 -0500 Subject: kvmclock: Add functions to check if the host has stopped the vm When a host stops or suspends a VM it will set a flag to show this. The watchdog will use these functions to determine if a softlockup is real, or the result of a suspended VM. Signed-off-by: Eric B Munson asm-generic changes Acked-by: Arnd Bergmann Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 8 ++++++++ arch/x86/kernel/kvmclock.c | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 734c3767cfac..99c4bbe0cca2 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -95,6 +95,14 @@ struct kvm_vcpu_pv_apf_data { extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); +#ifdef CONFIG_KVM_CLOCK +bool kvm_check_and_clear_guest_paused(void); +#else +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} +#endif /* CONFIG_KVMCLOCK */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f8492da65bfc..4ba090ca689d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -114,6 +115,26 @@ static void kvm_get_preset_lpj(void) preset_lpj = lpj; } +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + + /* + * per_cpu() is safe here because this function is only called from + * timer functions where preemption is already disabled. + */ + WARN_ON(!in_atomic()); + src = &__get_cpu_var(hv_clock); + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + ret = true; + } + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused); + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, -- cgit v1.2.1 From 1c0b28c2a46d98cd258d96b8c222144b22876c46 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Sat, 10 Mar 2012 14:37:27 -0500 Subject: KVM: x86: Add ioctl for KVM_KVMCLOCK_CTRL Now that we have a flag that will tell the guest it was suspended, create an interface for that communication using a KVM ioctl. 
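(From user space the new interface is a fire-and-forget vcpu ioctl. A hedged sketch of the VMM side; notify_guest_paused() is an invented name and all fd setup and error handling is elided:)

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Called by the VMM on each vcpu fd right after it pauses the guest, so
 * the guest's watchdog can attribute the coming "stall" to the host.
 * Fails with errno == EINVAL when the guest registered no pvclock page. */
static int notify_guest_paused(int vcpu_fd)
{
        return ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
}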
Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 511031dcb9cc..99b738028fc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: + case KVM_CAP_KVMCLOCK_CTRL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } +/* + * kvm_set_guest_paused() indicates to the guest kernel that it has been + * stopped by the hypervisor. This function will be called from the host only. + * EINVAL is returned when the host attempts to set the flag for a guest that + * does not support pv clocks. + */ +static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; + if (!vcpu->arch.time_page) + return -EINVAL; + src->flags |= PVCLOCK_GUEST_STOPPED; + mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = vcpu->arch.virtual_tsc_khz; goto out; } + case KVM_KVMCLOCK_CTRL: { + r = kvm_set_guest_paused(vcpu); + goto out; + } default: r = -EINVAL; } -- cgit v1.2.1 From 248997095d652576f1213028a95ca5fff85d089f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 15 Mar 2012 18:16:49 -0400 Subject: kvmclock: remove unneeded EXPORT macro check_and_clear_guest_paused does not need to be exported as it isn't used by any modules, remove the export. Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4ba090ca689d..086eb58c6e80 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -133,7 +133,6 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused); static struct clocksource kvm_clock = { .name = "kvm-clock", -- cgit v1.2.1 From a0ed46073c14f66dbf0707aaa7588b78da83d7c6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 1 Mar 2012 19:31:22 +0900 Subject: KVM: MMU: Split the main body of rmap_write_protect() off from others We will use this in the following patch to implement another function which needs to write protect pages using the rmap information. Note that there is a small change in debug printing for large pages: we do not differentiate them from others to avoid duplicating code. 
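(For the last-level case, write protection is nothing more than clearing the writable bit of the spte, while large-page sptes are dropped so they can be rebuilt as small pages. The bit operation in isolation, using the x86 PTE layout where the writable bit is bit 1:)

#include <stdint.h>

#define PT_WRITABLE_MASK (1ULL << 1)    /* bit 1 of an x86 (s)pte */

/* What mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK) boils down to. */
static inline uint64_t wrprotect(uint64_t spte)
{
        return spte & ~PT_WRITABLE_MASK;
}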
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4cb164268846..c8b5694d1a48 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; + u64 *spte = NULL; + int write_protected = 0; - rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { + while ((spte = rmap_next(rmapp, spte))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { + + if (!is_writable_pte(*spte)) + continue; + + if (level == PT_PAGE_TABLE_LEVEL) { mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; + } else { + BUG_ON(!is_large_pte(*spte)); + drop_spte(kvm, spte); + --kvm->stat.lpages; + spte = NULL; } - spte = rmap_next(rmapp, spte); + + write_protected = 1; } - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; + return write_protected; +} + +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot) +{ + unsigned long *rmapp; + int i, write_protected = 0; + + for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - --kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(rmapp, spte); - } + write_protected |= __rmap_write_protect(kvm, rmapp, i); } return write_protected; -- cgit v1.2.1 From 5dc99b2380d59b8aeafa98791f92b96400ed3187 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 1 Mar 2012 19:32:16 +0900 Subject: KVM: Avoid checking huge page mappings in get_dirty_log() Dropped such mappings when we enabled dirty logging and we will never create new ones until we stop the logging. For this we introduce a new function which can be used to write protect a range of PT level pages: although we do not need to care about a range of pages at this point, the following patch will need this feature to optimize the write protection of many pages. 
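(The mask-based interface lets callers hand over one bitmap word at a time; walking its set bits then uses the classic lowest-set-bit idiom that the patch's "mask &= mask - 1" line implements. A standalone sketch:)

#include <stdio.h>

/* __ffs() equivalent: index of the lowest set bit (mask must be non-zero). */
static unsigned long lowest_bit(unsigned long mask)
{
        return (unsigned long)__builtin_ctzl(mask);
}

int main(void)
{
        unsigned long gfn_offset = 64;  /* start of this bitmap word */
        unsigned long mask = 0xa1;      /* dirty pages within the word */

        while (mask) {
                printf("write-protect gfn %lu\n", gfn_offset + lowest_bit(mask));
                mask &= mask - 1;       /* clear the lowest set bit */
        }
        return 0;
}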
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu.c | 40 ++++++++++++++++++++++++++++++---------- arch/x86/kvm/x86.c | 8 +++----- 3 files changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e216ba066e79..f624ca72ea24 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c8b5694d1a48..dc5f2459db6c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1037,27 +1037,47 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level return write_protected; } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +/** + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * @kvm: kvm instance + * @slot: slot to protect + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should protect + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) { unsigned long *rmapp; - int i, write_protected = 0; - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); - } + while (mask) { + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); - return write_protected; + /* clear the first set bit */ + mask &= mask - 1; + } } static int rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; + unsigned long *rmapp; + int i; + int write_protected = 0; slot = gfn_to_memslot(kvm, gfn); - return kvm_mmu_rmap_write_protect(kvm, gfn, slot); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmapp, i); + } + + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99b738028fc0..813ebf1e55a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3095,13 +3095,11 @@ static void write_protect_slot(struct kvm *kvm, /* Not many dirty pages compared to # of shadow pages. 
*/ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; + gfn_t offset; - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; + for_each_set_bit(offset, dirty_bitmap, memslot->npages) + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - } kvm_flush_remote_tlbs(kvm); } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); -- cgit v1.2.1 From 60c34612b70711fb14a8dcbc6a79509902450d2e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 3 Mar 2012 14:21:48 +0900 Subject: KVM: Switch to srcu-less get_dirty_log() We have seen some problems of the current implementation of get_dirty_log() which uses synchronize_srcu_expedited() for updating dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms order of latency when we use VGA displays. Furthermore the recent discussion on the following thread "srcu: Implement call_srcu()" http://lkml.org/lkml/2012/1/31/211 also motivated us to implement get_dirty_log() without SRCU. This patch achieves this goal without sacrificing the performance of both VGA and live migration: in practice the new code is much faster than the old one unless we have too many dirty pages. Implementation: The key part of the implementation is the use of xchg() operation for clearing dirty bits atomically. Since this allows us to update only BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap until every dirty bit is cleared again for the next call. Although some people may worry about the problem of using the atomic memory instruction many times to the concurrently accessible bitmap, it is usually accessed with mmu_lock held and we rarely see concurrent accesses: so what we need to care about is the pure xchg() overheads. Another point to note is that we do not use for_each_set_bit() to check which ones in each BITS_PER_LONG pages are actually dirty. Instead we simply use __ffs() in a loop. This is much faster than repeatedly call find_next_bit(). Performance: The dirty-log-perf unit test showed nice improvements, some times faster than before, except for some extreme cases; for such cases the speed of getting dirty page information is much faster than we process it in the userspace. For real workloads, both VGA and live migration, we have observed pure improvements: when the guest was reading a file during live migration, we originally saw a few ms of latency, but with the new method the latency was less than 200us. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 116 ++++++++++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 813ebf1e55a0..0d9a57875f0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3067,55 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, } /** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log * - * We have two ways to find all sptes to protect: - * 1. 
Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - * checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - spin_lock(&kvm->mmu_lock); - - /* Not many dirty pages compared to # of shadow pages. */ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - gfn_t offset; - - for_each_set_bit(offset, dirty_bitmap, memslot->npages) - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - - kvm_flush_remote_tlbs(kvm); - } else - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - - spin_unlock(&kvm->mmu_lock); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; struct kvm_memory_slot *memslot; - unsigned long n, nr_dirty_pages; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; mutex_lock(&kvm->slots_lock); @@ -3124,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, goto out; memslot = id_to_memslot(kvm->memslots, log->slot); + + dirty_bitmap = memslot->dirty_bitmap; r = -ENOENT; - if (!memslot->dirty_bitmap) + if (!dirty_bitmap) goto out; n = kvm_dirty_bitmap_bytes(memslot); - nr_dirty_pages = memslot->nr_dirty_pages; - /* If nothing is dirty, don't bother messing with page tables. 
*/ - if (nr_dirty_pages) { - struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap, *dirty_bitmap_head; + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap = memslot->dirty_bitmap; - dirty_bitmap_head = memslot->dirty_bitmap_head; - if (dirty_bitmap == dirty_bitmap_head) - dirty_bitmap_head += n / sizeof(long); - memset(dirty_bitmap_head, 0, n); + spin_lock(&kvm->mmu_lock); - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); - if (!slots) - goto out; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - memslot = id_to_memslot(slots, log->slot); - memslot->nr_dirty_pages = 0; - memslot->dirty_bitmap = dirty_bitmap_head; - update_memslots(slots, NULL); + if (!dirty_bitmap[i]) + continue; - old_slots = kvm->memslots; - rcu_assign_pointer(kvm->memslots, slots); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_slots); + is_dirty = true; - write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; r = 0; out: -- cgit v1.2.1 From e9bda3b3d0ce775afe15eaf71922d342cc74991c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 20 Mar 2012 23:33:51 -0700 Subject: KVM: VMX: Auto-load on CPUs with VMX Enable x86 feature-based autoloading for the kvm-intel module on CPUs with X86_FEATURE_VMX. Signed-off-by: Josh Triplett Acked-By: Kay Sievers Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad85adfef843..52f685635766 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -- cgit v1.2.1 From c36fc04ef558c95cff46a8c89d2f804f217335f5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 8 Mar 2012 12:45:54 +0100 Subject: KVM: x86: add paging gcc optimization Since most guests will have paging enabled for memory management, add likely() optimization around CR0.PG checks. 
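(likely() and unlikely() are thin wrappers around __builtin_expect(); they only influence block placement so the expected path falls through without a taken branch. The same pattern outside the kernel, as a sketch:)

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* Mirrors the patched is_paging(): the hot, predicted case is CR0.PG set. */
static inline int is_paging_demo(unsigned long cr0)
{
        return likely(cr0 & (1UL << 31));       /* X86_CR0_PG is bit 31 */
}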
Signed-off-by: Davidlohr Bueso
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/x86.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd8..3d1134ddb885 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); + return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno)
-- cgit v1.2.1

From 220f773a0013bf6fe2eefd9718ac7471f368fd8e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Wed, 21 Mar 2012 23:49:39 +0900
Subject: KVM: MMU: Make pte_list_desc fit cache lines well

We have PTE_LIST_EXT + 1 pointers in this structure and these 40/20 bytes do not fit cache lines well. Furthermore, some allocators may use 64/32-byte objects for the pte_list_desc cache. This patch solves this problem by changing PTE_LIST_EXT from 4 to 3. For shadow paging, the new size is still large enough to hold both the kernel and process mappings for usual anonymous pages. For file mappings, there may be a slight change in the cache usage. Note: with EPT/NPT we almost always have a single spte in each reverse mapping and we will not see any change by this.

Signed-off-by: Takuya Yoshikawa
Signed-off-by: Avi Kivity
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc5f2459db6c..3213348e3a93 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PTE_LIST_EXT 4 - #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK
@@ -151,6 +149,9 @@ module_param(dbg, bool, 0644); #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* make pte_list_desc fit well in cache line */ +#define PTE_LIST_EXT 3 + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more;
-- cgit v1.2.1

From 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Wed, 21 Mar 2012 23:50:34 +0900
Subject: KVM: MMU: Improve iteration through sptes from rmap

Iteration using rmap_next(), whose actual body is pte_list_next(), is inefficient: every time we call it we start from checking whether rmap holds a single spte or points to a descriptor which links more sptes. In the case of shadow paging, this quadratic total iteration cost is a problem. Even for two dimensional paging, with EPT/NPT on, in which we almost always have a single mapping, the extra checks at the end of the iteration should be eliminated. This patch fixes this by introducing rmap_iterator which keeps the iteration context for the next search. Furthermore the implementation of rmap_next() is split into two functions, rmap_get_first() and rmap_get_next(), to avoid repeatedly checking whether the rmap being iterated on has only one spte. Although there seemed to be only a slight change for EPT/NPT, the actual improvement was significant: we observed that GET_DIRTY_LOG for 1GB dirty memory became 15% faster than before. This is probably because the new code makes branch prediction easier. Note: we just remove pte_list_next() because we can think of parent_ptes as a reverse mapping.
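(A self-contained model of the new iterator, with the kernel types shrunk and the demo_* names invented here. It keeps the tagged-pointer encoding, where a clear low bit means the rmap word is itself the lone spte pointer, and shows why the common single-mapping case now costs one check per walk. Note that the kernel restarts via rmap_get_first() after dropping an spte, since removal invalidates the cached position:)

#include <stdio.h>

#define PTE_LIST_EXT 3

/* An rmap is either a directly stored spte pointer, or (low bit set)
 * a pointer to a chain of descriptors of up to PTE_LIST_EXT sptes. */
struct pte_list_desc {
        unsigned long *sptes[PTE_LIST_EXT];
        struct pte_list_desc *more;
};

struct rmap_iterator {
        struct pte_list_desc *desc;     /* NULL in the single-spte case */
        int pos;                        /* index within desc */
};

static unsigned long *demo_get_first(unsigned long rmap,
                                     struct rmap_iterator *iter)
{
        if (!rmap)
                return NULL;
        if (!(rmap & 1)) {
                iter->desc = NULL;      /* single spte: nothing to remember */
                return (unsigned long *)rmap;
        }
        iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
        iter->pos = 0;
        return iter->desc->sptes[0];
}

static unsigned long *demo_get_next(struct rmap_iterator *iter)
{
        if (iter->desc) {
                if (iter->pos < PTE_LIST_EXT - 1) {
                        ++iter->pos;
                        if (iter->desc->sptes[iter->pos])
                                return iter->desc->sptes[iter->pos];
                }
                iter->desc = iter->desc->more;
                if (iter->desc) {
                        iter->pos = 0;
                        return iter->desc->sptes[0];    /* never NULL */
                }
        }
        return NULL;
}

int main(void)
{
        unsigned long spte[4] = { 10, 20, 30, 40 };
        struct pte_list_desc d2 = { { &spte[3], NULL, NULL }, NULL };
        struct pte_list_desc d1 = { { &spte[0], &spte[1], &spte[2] }, &d2 };
        unsigned long rmap = (unsigned long)&d1 | 1;    /* tagged pointer */
        struct rmap_iterator iter;
        unsigned long *sptep;

        for (sptep = demo_get_first(rmap, &iter); sptep;
             sptep = demo_get_next(&iter))
                printf("visited spte %lu\n", *sptep);
        return 0;
}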
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 196 ++++++++++++++++++++++++++++------------------- arch/x86/kvm/mmu_audit.c | 10 +-- 2 files changed, 124 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3213348e3a93..29ad6f9c58a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, return count; } -static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) -{ - struct pte_list_desc *desc; - u64 *prev_spte; - int i; - - if (!*pte_list) - return NULL; - else if (!(*pte_list & 1)) { - if (!spte) - return (u64 *)*pte_list; - return NULL; - } - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - static void pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) @@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(unsigned long *rmapp, u64 *spte) -{ - return pte_list_next(rmapp, spte); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; @@ -1005,6 +974,67 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pte_list_remove(spte, rmapp); } +/* + * Used by the following functions to iterate through the sptes linked by a + * rmap. All fields are private and not assumed to be used outside. + */ +struct rmap_iterator { + /* private fields */ + struct pte_list_desc *desc; /* holds the sptep if not NULL */ + int pos; /* index of the sptep */ +}; + +/* + * Iteration must be started by this function. This should also be used after + * removing/dropping sptes from the rmap link because in such cases the + * information in the itererator may not be valid. + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +{ + if (!rmap) + return NULL; + + if (!(rmap & 1)) { + iter->desc = NULL; + return (u64 *)rmap; + } + + iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->pos = 0; + return iter->desc->sptes[iter->pos]; +} + +/* + * Must be used with a valid iterator: e.g. after rmap_get_first(). + * + * Returns sptep if found, NULL otherwise. 
+ */ +static u64 *rmap_get_next(struct rmap_iterator *iter) +{ + if (iter->desc) { + if (iter->pos < PTE_LIST_EXT - 1) { + u64 *sptep; + + ++iter->pos; + sptep = iter->desc->sptes[iter->pos]; + if (sptep) + return sptep; + } + + iter->desc = iter->desc->more; + + if (iter->desc) { + iter->pos = 0; + /* desc->sptes[0] cannot be NULL */ + return iter->desc->sptes[iter->pos]; + } + } + + return NULL; +} + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1013,23 +1043,27 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - u64 *spte = NULL; + u64 *sptep; + struct rmap_iterator iter; int write_protected = 0; - while ((spte = rmap_next(rmapp, spte))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - if (!is_writable_pte(*spte)) + if (!is_writable_pte(*sptep)) { + sptep = rmap_get_next(&iter); continue; + } if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); + mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); + sptep = rmap_get_next(&iter); } else { - BUG_ON(!is_large_pte(*spte)); - drop_spte(kvm, spte); + BUG_ON(!is_large_pte(*sptep)); + drop_spte(kvm, sptep); --kvm->stat.lpages; - spte = NULL; + sptep = rmap_get_first(*rmapp, &iter); } write_protected = 1; @@ -1084,48 +1118,57 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int need_tlb_flush = 0; - while ((spte = rmap_next(rmapp, NULL))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte); + while ((sptep = rmap_get_first(*rmapp, &iter))) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + + drop_spte(kvm, sptep); need_tlb_flush = 1; } + return need_tlb_flush; } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { + u64 *sptep; + struct rmap_iterator iter; int need_flush = 0; - u64 *spte, new_spte; + u64 new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!is_shadow_present_pte(*spte)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!is_shadow_present_pte(*sptep)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + need_flush = 1; + if (pte_write(*ptep)) { - drop_spte(kvm, spte); - spte = rmap_next(rmapp, NULL); + drop_spte(kvm, sptep); + sptep = rmap_get_first(*rmapp, &iter); } else { - new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - mmu_spte_clear_track_bits(spte); - mmu_spte_set(spte, new_spte); - spte = rmap_next(rmapp, spte); + + mmu_spte_clear_track_bits(sptep); + mmu_spte_set(sptep, new_spte); + sptep = rmap_get_next(&iter); } } + if (need_flush) kvm_flush_remote_tlbs(kvm); @@ -1184,7 +1227,8 @@ void kvm_set_spte_hva(struct kvm *kvm, 
unsigned long hva, pte_t pte) static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1197,25 +1241,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(rmapp, NULL); - while (spte) { - int _young; - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; - if (_young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); } - spte = rmap_next(rmapp, spte); } + return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1226,16 +1269,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; break; } - spte = rmap_next(rmapp, spte); } out: return young; @@ -1887,10 +1928,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 *parent_pte; + u64 *sptep; + struct rmap_iterator iter; - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) - drop_parent_pte(sp, parent_pte); + while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + drop_parent_pte(sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 715da5a19a5b..7d7d0b9e23eb 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; unsigned long *rmapp; - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; if (sp->role.direct || sp->unsync || sp->role.invalid) return; @@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(rmapp, spte); } } -- cgit v1.2.1 From ae75954457eee0a608072368c5b477e40f378d7b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 28 Mar 2012 11:32:28 -0700 Subject: KVM: SVM: Auto-load on CPUs with SVM Enable x86 feature-based autoloading for the kvm-amd module on CPUs with X86_FEATURE_SVM. 
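(What the one-liner buys: X86_FEATURE_MATCH() builds an x86_cpu_id entry and MODULE_DEVICE_TABLE() emits a modalias, so udev can autoload the module on CPUs advertising the feature. A skeletal module using the same pattern; the demo_* names are placeholders, and the x86_match_cpu() init-time check is an addition this patch itself does not make:)

#include <linux/module.h>
#include <asm/cpu_device_id.h>

static const struct x86_cpu_id demo_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_SVM),     /* match on a CPUID flag */
        {}                                      /* terminator */
};
MODULE_DEVICE_TABLE(x86cpu, demo_cpu_id);

static int __init demo_init(void)
{
        if (!x86_match_cpu(demo_cpu_id))        /* no SVM: refuse to load */
                return -ENODEV;
        return 0;
}
module_init(demo_init);

static void __exit demo_exit(void) { }
module_exit(demo_exit);

MODULE_LICENSE("GPL");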
Signed-off-by: Josh Triplett Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f3167208562e..f75af406b268 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -22,6 +22,7 @@ #include "x86.h" #include +#include #include #include #include @@ -42,6 +43,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id svm_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_SVM), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); + #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -- cgit v1.2.1 From 1c11b37669a5209bd11fb857a103634afef971e8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:39:59 +0300 Subject: KVM: x86 emulator: add support for vector alignment x86 defines three classes of vector instructions: explicitly aligned (#GP(0) if unaligned, explicitly unaligned, and default (which depends on the encoding: AVX is unaligned, SSE is aligned). Add support for marking an instruction as explicitly aligned or unaligned, and mark MOVDQU as unaligned. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 83756223f8aa..6302e5c74341 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,9 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ +#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -557,6 +560,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } +/* + * x86 defines three classes of vector instructions: explicitly + * aligned, explicitly unaligned, and the rest, which change behaviour + * depending on whether they're AVX encoded or not. + * + * Also included is CMPXCHG16B which is not a vector instruction, yet it is + * subject to the same check. + */ +static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +{ + if (likely(size < 16)) + return false; + + if (ctxt->d & Aligned) + return true; + else if (ctxt->d & Unaligned) + return false; + else if (ctxt->d & Avx) + return false; + else + return true; +} + static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, bool fetch, @@ -621,6 +647,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } if (fetch ? 
ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; + if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + return emulate_gp(ctxt, 0); *linear = la; return X86EMUL_CONTINUE; bad: @@ -3415,7 +3443,7 @@ static struct opcode group11[] = { }; static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse, em_movdqu), + N, N, N, I(Sse | Unaligned, em_movdqu), }; static struct opcode opcode_table[256] = { -- cgit v1.2.1 From 49597d8116ad70aabb598e606b218ddd9315b0af Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 9 Apr 2012 18:40:00 +0300 Subject: KVM: x86: emulate movdqa An Ubuntu 9.10 Karmic Koala guest is unable to boot or install due to missing movdqa emulation: kvm_exit: reason EXCEPTION_NMI rip 0x7fef3e025a7b info 7fef3e799000 80000b0e kvm_page_fault: address 7fef3e799000 error_code f kvm_emulate_insn: 0:7fef3e025a7b: 66 0f 7f 07 (prot64) movdqa %xmm0,(%rdi) [avi: mark it explicitly aligned] Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6302e5c74341..b160fb1fc68b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2818,7 +2818,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.val = ctxt->src.val; + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); return X86EMUL_CONTINUE; } @@ -2898,12 +2898,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } -static int em_movdqu(struct x86_emulate_ctxt *ctxt) -{ - memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); - return X86EMUL_CONTINUE; -} - static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3443,7 +3437,7 @@ static struct opcode group11[] = { }; static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse | Unaligned, em_movdqu), + N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; static struct opcode opcode_table[256] = { -- cgit v1.2.1 From 3e114eb4db3a33141b8c91bb53dae9ba6b015a32 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:01 +0300 Subject: KVM: x86 emulator: implement movntps Used to write to framebuffers (by at least Icaros). 
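(Context for the 0f 6f/0f 7f entries reworked in this series: a gprefix group selects the handler by the instruction's mandatory prefix, in the order none, 66, F2, F3, which is why movdqu sits in the last slot of the diffs above. Restated with comments, as the table looks after the later MMX MOVQ patch:)

static struct gprefix pfx_0f_6f_0f_7f = {
        I(Mmx, em_mov),                 /* no prefix: movq mm, mm/m64 */
        I(Sse | Aligned, em_mov),       /* 66: movdqa, #GP(0) if unaligned */
        N,                              /* F2: no instruction at 0f 6f/7f */
        I(Sse | Unaligned, em_mov),     /* F3: movdqu, unaligned access OK */
};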
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b160fb1fc68b..fb39e0b32ed1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3440,6 +3440,10 @@ static struct gprefix pfx_0f_6f_0f_7f = { N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; +static struct gprefix pfx_vmovntpx = { + I(0, em_mov), N, N, N, +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), @@ -3571,7 +3575,8 @@ static struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, N, N, N, N, N, + N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), -- cgit v1.2.1 From cbe2c9d30aa69b0551247ddb0fb450b6e8080ec4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:02 +0300 Subject: KVM: x86 emulator: MMX support General support for the MMX instruction set. Special care is taken to trap pending x87 exceptions so that they are properly reflected to the guest. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 4 +- arch/x86/kvm/emulate.c | 103 +++++++++++++++++++++++++++++++++++-- 2 files changed, 102 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c222e1a1b12a..1ac46c22dd50 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -200,7 +200,7 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -213,12 +213,14 @@ struct operand { unsigned seg; } mem; unsigned xmm; + unsigned mm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; sse128_t vec_val; + u64 mm_val; }; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fb39e0b32ed1..0011b4ad44b5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,7 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Mmx ((u64)1 << 40) /* MMX Vector instruction */ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. 
MOVDQU) */ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ @@ -887,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, ctxt->ops->put_fpu(ctxt); } +static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -903,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, reg); return; } + if (ctxt->d & Mmx) { + reg &= 7; + op->type = OP_MM; + op->bytes = 8; + op->addr.mm = reg; + return; + } op->type = OP_REG; if (ctxt->d & ByteOp) { @@ -948,6 +990,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); return rc; } + if (ctxt->d & Mmx) { + op->type = OP_MM; + op->bytes = 8; + op->addr.xmm = ctxt->modrm_rm & 7; + return rc; + } fetch_register_operand(op); return rc; } @@ -1415,6 +1463,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt) case OP_XMM: write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; + case OP_MM: + write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + break; case OP_NONE: /* no writeback */ break; @@ -3987,6 +4038,8 @@ done_prefixes: if (ctxt->d & Sse) ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; /* ModRM and SIB bytes. 
*/ if (ctxt->d & ModRM) { @@ -4057,6 +4110,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) return false; } +static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) +{ + bool fault = false; + + ctxt->ops->get_fpu(ctxt); + asm volatile("1: fwait \n\t" + "2: \n\t" + ".pushsection .fixup,\"ax\" \n\t" + "3: \n\t" + "movb $1, %[fault] \n\t" + "jmp 2b \n\t" + ".popsection \n\t" + _ASM_EXTABLE(1b, 3b) + : [fault]"+rm"(fault)); + ctxt->ops->put_fpu(ctxt); + + if (unlikely(fault)) + return emulate_exception(ctxt, MF_VECTOR, 0, false); + + return X86EMUL_CONTINUE; +} + +static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + if (op->type == OP_MM) + read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -4081,18 +4163,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if ((ctxt->d & Sse) - && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) - || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); -- cgit v1.2.1 From e59717550e5cf0e7159c5b7af1d1ead35fef49dd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:03 +0300 Subject: KVM: x86 emulator: implement MMX MOVQ (opcodes 0f 6f, 0f 7f) Needed by some framebuffer drivers. See https://bugzilla.kernel.org/show_bug.cgi?id=42779 Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0011b4ad44b5..d5729a91d08d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3488,7 +3488,7 @@ static struct opcode group11[] = { }; static struct gprefix pfx_0f_6f_0f_7f = { - N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), + I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; static struct gprefix pfx_vmovntpx = { -- cgit v1.2.1 From a0c9a822bf37e6282eb6006b407ec5aec22e08fb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 11 Apr 2012 18:49:55 +0300 Subject: KVM: don't clear TMR on EOI Intel spec says that TMR needs to be set/cleared when IRR is set, but kvm also clears it on EOI. I did some tests on a real (AMD based) system, and I see the same TMR values both before and after EOI, so I think it's a minor bug in kvm. This patch fixes TMR to be set/cleared on IRR set only, as per spec. And now that we don't clear TMR, we can save an atomic read of TMR on EOI that's not propagated to ioapic, by checking whether ioapic needs a specific vector first and calculating the mode afterwards. Signed-off-by: Michael S.
Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 858432287ab6..992b4eaae684 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap) return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int apic_test_vector(int vec, void *bitmap) +{ + return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_clear_vector(vector, apic->regs + APIC_ISR); apic_update_ppr(apic); - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } -- cgit v1.2.1 From 9fe2a7015393dc0203ac39242ae9c89038994f3c Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 23 Mar 2012 13:36:28 +0530 Subject: debugfs: Add support to print u32 array in debugfs Move the code from Xen to debugfs to make the code common for other users as well. Acked-by: Greg Kroah-Hartman Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Suzuki Poulose [v1: Fixed rebase issues] [v2: Fixed PPC compile issues] Signed-off-by: Raghavendra K T Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/debugfs.c | 104 ------------------------------------------------ arch/x86/xen/debugfs.h | 4 -- arch/x86/xen/spinlock.c | 12 +++--- 3 files changed, 6 insertions(+), 114 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index ef1db1900d86..c8377fb26cdf 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void) return d_xen_debug; } -struct array_data -{ - void *array; - unsigned elements; -}; - -static int u32_array_open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - return nonseekable_open(inode, file); -} - -static size_t format_array(char *buf, size_t bufsize, const char *fmt, - u32 *array, unsigned array_size) -{ - size_t ret = 0; - unsigned i; - - for(i = 0; i < array_size; i++) { - size_t len; - - len = snprintf(buf, bufsize, fmt, array[i]); - len++; /* ' ' or '\n' */ - ret += len; - - if (buf) { - buf += len; - bufsize -= len; - buf[-1] = (i == array_size-1) ?
'\n' : ' '; - } - } - - ret++; /* \0 */ - if (buf) - *buf = '\0'; - - return ret; -} - -static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) -{ - size_t len = format_array(NULL, 0, fmt, array, array_size); - char *ret; - - ret = kmalloc(len, GFP_KERNEL); - if (ret == NULL) - return NULL; - - format_array(ret, len, fmt, array, array_size); - return ret; -} - -static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, - loff_t *ppos) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct array_data *data = inode->i_private; - size_t size; - - if (*ppos == 0) { - if (file->private_data) { - kfree(file->private_data); - file->private_data = NULL; - } - - file->private_data = format_array_alloc("%u", data->array, data->elements); - } - - size = 0; - if (file->private_data) - size = strlen(file->private_data); - - return simple_read_from_buffer(buf, len, ppos, file->private_data, size); -} - -static int xen_array_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - - return 0; -} - -static const struct file_operations u32_array_fops = { - .owner = THIS_MODULE, - .open = u32_array_open, - .release= xen_array_release, - .read = u32_array_read, - .llseek = no_llseek, -}; - -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned elements) -{ - struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); - - if (data == NULL) - return NULL; - - data->array = array; - data->elements = elements; - - return debugfs_create_file(name, mode, parent, data, &u32_array_fops); -} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index 78d25499be5b..12ebf3325c7b 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,8 +3,4 @@ struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned elements); - #endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d69cc6c3f808..83e866d714ce 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u64("time_total", 0444, d_spin_debug, &spinlock_stats.time_total); - xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } -- cgit v1.2.1 From f71fa31f9f7ac33cba12b8897983f950ad2c7a5b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 18 Apr 2012 12:24:29 +0200 Subject: KVM: MMU: use page table level macro It's much cleaner to use PT_PAGE_TABLE_LEVEL than its numeric value.
Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 29ad6f9c58a5..07424cf60434 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3618,7 +3618,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == 1) + if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; return ++sp->write_flooding_count >= 3; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df5a70311be8..34f970937ef1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != 1); + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; -- cgit v1.2.1 From 95022b8cf6ed7f3292b60c8e85fe59a12bfb1c9e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 18 Apr 2012 15:19:40 -0700 Subject: x86/mce: Avoid reading every machine check bank register twice. Reading machine check bank registers is slow. There is a trend of increasing the number of banks, and the number of cores. The main section of do_machine_check() is a serialized section where each cpu in turn checks every bank. Even on a little two socket SandyBridge-EP system that multiplies out as: 2 sockets * 8 cores * 2 hyperthreads * 20 banks = 640 MSRs We already scan the banks in parallel in mce_no_way_out() to see if there is a fatal error anywhere in the system. If we build a cache of VALID bits during this scan, we can avoid uselessly re-reading banks that have no data. Note that this cache is only a hint. If the valid bit is set in a shared bank, all cpus that share that bank will see it during the parallel scan, but the first to find it in the sequential scan will (usually) clear the bank. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..66e1c51be084 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -641,16 +641,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. 
*/ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) { - int i; + int i, ret = 0; for (i = 0; i < banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + if (m->status & MCI_STATUS_VAL) + __set_bit(i, validp); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + ret = 1; } - return 0; + return ret; } /* @@ -1011,6 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -1025,7 +1028,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks); barrier(); @@ -1045,6 +1049,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; -- cgit v1.2.1 From f78146b0f9230765c6315b2e14f56112513389ad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Apr 2012 19:22:47 +0300 Subject: KVM: Fix page-crossing MMIO MMIOs that are split across a page boundary are currently broken - the code does not expect to be aborted by the exit to userspace for the first MMIO fragment. This patch fixes the problem by generalizing the current code for handling 16-byte MMIOs to handle a number of "fragments", and changes the MMIO code to create those fragments. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 114 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d9a57875f0b..4de705cdcafd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3718,9 +3718,8 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -3756,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - memcpy(vcpu->mmio_data, val, bytes); - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; + + memcpy(vcpu->run->mmio.data, frag->data, frag->len); return X86EMUL_CONTINUE; } @@ -3784,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, gpa_t gpa; int handled, ret; bool write = ops->write; - - if (ops->read_write_prepare && - ops->read_write_prepare(vcpu, val, bytes)) - return X86EMUL_CONTINUE; + struct kvm_mmio_fragment *frag; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -3813,15 +3810,19 @@ mmio: bytes -= handled; val += handled; - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; - vcpu->mmio_index = 0; + while (bytes) { + unsigned now =
min(bytes, 8U); - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = now; + + gpa += now; + val += now; + bytes -= now; + } + return X86EMUL_CONTINUE; } int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -3830,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + gpa_t gpa; + int rc; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; + + vcpu->mmio_nr_fragments = 0; /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; + int now; now = -addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, @@ -3846,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, bytes -= now; } - return emulator_read_write_onepage(addr, val, bytes, exception, - vcpu, ops); + rc = emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (!vcpu->mmio_nr_fragments) + return rc; + + gpa = vcpu->mmio_fragments[0].gpa; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = gpa; + + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, @@ -5446,33 +5472,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +/* + * Implements the following, as a state machine: + * + * read: + * for each fragment + * write gpa, len + * exit + * copy data + * execute insn + * + * write: + * for each fragment + * write gpa, len + * copy data + * exit + */ static int complete_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; int r; if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) return 1; if (vcpu->mmio_needed) { - vcpu->mmio_needed = 0; + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data + vcpu->mmio_index, - run->mmio.data, 8); - vcpu->mmio_index += 8; - if (vcpu->mmio_index < vcpu->mmio_size) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; - memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); - run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); - run->mmio.is_write = vcpu->mmio_is_write; - vcpu->mmio_needed = 1; - return 0; + memcpy(frag->data, run->mmio.data, frag->len); + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + goto done; } + /* Initiate next fragment */ + ++frag; + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; + memcpy(run->mmio.data, frag->data, frag->len); + run->mmio.len = frag->len; + run->mmio.is_write = vcpu->mmio_is_write; + return 0; + } +done: vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -- cgit v1.2.1 From 8571723a698dcc0ee16c1c63908aa99dd940ce5c Mon Sep 17 
00:00:00 2001 From: Chen Gong Date: Fri, 20 Apr 2012 16:02:05 -0700 Subject: x86/mce: Add validation check before GHES error is recorded When a GHES error record is logged into the mcelog kernel buffer, a validation check for the physical address is necessary, which prevents reporting an invalid physical address. [Since the physical address is the only useful element in this error record, we skip generating the record completely if we don't have a valid address] Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 507ea58688e2..cd8b166a1735 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) struct mce m; /* Only corrected MC is reported */ - if (!corrected) + if (!corrected || !(mem_err->validation_bits & + CPER_MEM_VALID_PHYSICAL_ADDRESS)) return; mce_setup(&m); -- cgit v1.2.1 From 07975ad3b30579ca27d880491ad992326b930c63 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 29 Mar 2012 21:14:12 +0200 Subject: KVM: Introduce direct MSI message injection for in-kernel irqchips Currently, MSI messages can only be injected into in-kernel irqchips by defining a corresponding IRQ route for each message. This is not only unhandy if the MSI messages are generated "on the fly" by user space; IRQ routes are also a limited resource that user space has to manage carefully. By providing a direct injection path, we can both avoid using up limited resources and simplify the necessary steps for user land. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 1a7fe868f375..a28f338843ea 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select TASKSTATS select TASK_DELAY_ACCT select PERF_EVENTS + select HAVE_KVM_MSI ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent -- cgit v1.2.1 From 413837714232b3a4c0705e915d8af75ad521d083 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 19 Apr 2012 14:06:29 +0300 Subject: KVM: Introduce bitmask for apic attention reasons The patch introduces a bitmap that will hold the reasons the apic should be checked during vmexit. This is in preparation for the vp eoi patch, which will add one more check on vmexit. With the bitmap we can do if(apic_attention) to check everything simultaneously, which will add zero overhead on the fast path. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/lapic.c | 12 +++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f624ca72ea24..69e39bc7e36f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -172,6 +172,9 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff23ff +/* apic attention bits */ +#define KVM_APIC_CHECK_VAPIC 0 + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache.
@@ -337,6 +340,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; int sipi_vector; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 992b4eaae684..93c15743f1ee 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1088,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; + vcpu->arch.apic_attention = 0; apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, @@ -1287,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) u32 data; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; vapic = kmap_atomic(vcpu->arch.apic->vapic_page); @@ -1304,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) struct kvm_lapic *apic; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; apic = vcpu->arch.apic; @@ -1324,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - if (!irqchip_in_kernel(vcpu->kvm)) - return; - vcpu->arch.apic->vapic_addr = vapic_addr; + if (vapic_addr) + __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + else + __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) -- cgit v1.2.1 From 38e8a2ddc9ada5dd1f2def95bebb733bf619bbef Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Apr 2012 15:12:50 +0300 Subject: KVM: x86 emulator: fix asm constraint in flush_pending_x87_faults 'bool' wants 8-bit registers. Reported-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d5729a91d08d..0d151e232480 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4123,7 +4123,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) "jmp 2b \n\t" ".popsection \n\t" _ASM_EXTABLE(1b, 3b) - : [fault]"+rm"(fault)); + : [fault]"+qm"(fault)); ctxt->ops->put_fpu(ctxt); if (unlikely(fault)) -- cgit v1.2.1 From b6ddf05ff68d81a7c1736717faf492b70e9bf4f9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 24 Apr 2012 16:40:17 +0200 Subject: KVM: x86: Run PIT work in own kthread We can't run PIT IRQ injection work in the interrupt context of the host timer. This would allow the user to influence the handler complexity by asking for a broadcast to a large number of VCPUs. Therefore, this work was pushed into workqueue context in 9d244caf2e. However, this prevents prioritizing the PIT injection over other tasks, as workqueues share kernel threads. This replaces the workqueue with a kthread worker and gives that thread a name in the format "kvm-pit/". That allows identifying and adjusting the kthread priority according to the VM process parameters.
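In outline, the conversion follows the standard kthread_worker pattern; a condensed sketch using the 2012-era API names that appear in the diff below (error handling elided, pid_nr as computed in kvm_create_pit()):

	/* setup: one dedicated, nameable kernel thread per PIT */
	init_kthread_worker(&pit->worker);
	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
				       "kvm-pit/%d", pid_nr);
	init_kthread_work(&pit->expired, pit_do_work);

	/* timer/ack paths: queue_work() becomes */
	queue_kthread_work(&pit->worker, &pit->expired);

	/* teardown: cancel_work_sync()/destroy_workqueue() become */
	flush_kthread_work(&pit->expired);
	kthread_stop(pit->worker_task);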
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 31 +++++++++++++++++++------------ arch/x86/kvm/i8254.h | 7 +++++-- 2 files changed, 24 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d68f99df690c..adba28f88d1a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,7 +34,6 @@ #include #include -#include #include "irq.h" #include "i8254.h" @@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. Reinject */ - queue_work(ps->pit->wq, &ps->pit->expired); + queue_kthread_work(&ps->pit->worker, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } @@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_pit *pit) { hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); + flush_kthread_work(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; -static void pit_do_work(struct work_struct *work) +static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; @@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); + queue_kthread_work(&pt->worker, &pt->expired); } if (ktimer->t_ops->is_periodic(ktimer)) { @@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); + flush_kthread_work(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; @@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + struct pid *pid; + pid_t pid_nr; int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); @@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { + pid = get_pid(task_tgid(current)); + pid_nr = pid_vnr(pid); + put_pid(pid); + + init_kthread_worker(&pit->worker); + pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, + "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker_task)) { mutex_unlock(&pit->pit_state.lock); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; } - INIT_WORK(&pit->expired, pit_do_work); + init_kthread_work(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -736,7 +743,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); + kthread_stop(pit->worker_task); kfree(pit); return NULL; } @@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); + flush_kthread_work(&kvm->arch.vpit->expired); + 
kthread_stop(kvm->arch.vpit->worker_task); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 51a97426e791..fdf40425ea1d 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -1,6 +1,8 @@ #ifndef __I8254_H #define __I8254_H +#include + #include "iodev.h" struct kvm_kpit_channel_state { @@ -39,8 +41,9 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; + struct kthread_worker worker; + struct task_struct *worker_task; + struct kthread_work expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 -- cgit v1.2.1 From 57c22e5f35aa4b9b2fe11f73f3e62bbf9ef36190 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 May 2012 17:55:56 +0300 Subject: KVM: fix cpuid eax for KVM leaf cpuid eax should return the max leaf so that guests can find out the valid range. This matches Xen et al. Update documentation to match. Tested with -cpu host. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c2134b881033..7df1c6d839fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -398,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; - entry->eax = 0; + entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; -- cgit v1.2.1 From 9b72d3b07dd99ac8ab2b84de5004a295af460536 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 30 Apr 2012 14:45:49 +0300 Subject: KVM guest: make kvm_para_available() check hypervisor bit before reading cpuid leaf This cpuid range does not exist on real HW, and the Intel spec says that "Information returned for highest basic information leaf" will be returned. Not very well defined. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 99c4bbe0cca2..a7a7a94b94ce 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -178,14 +178,16 @@ static inline int kvm_para_available(void) unsigned int eax, ebx, ecx, edx; char signature[13]; - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; - - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; + if (cpu_has_hypervisor) { + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + + if (strcmp(signature, "KVMKVMKVM") == 0) + return 1; + } return 0; } -- cgit v1.2.1 From 1c2545be05f436523cabc54087c6a60ea10110d3 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 30 Apr 2012 17:46:31 +0900 Subject: KVM: x86 emulator: Move ModRM flags for groups to top level opcode tables Needed for the following patch, which simplifies the ModRM fetching code.
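Taken together with that follow-up, the effect is roughly this (a simplified sketch; macro and field names as in the diffs):

	/* group dispatch now implies a ModRM byte */
	#define G(_f, _g)  { .flags = ((_f) | Group | ModRM),     .u.group = (_g) }
	#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }

	/* so the decoder can fetch it once, before walking the group tables */
	if (ctxt->d & ModRM)
		ctxt->modrm = insn_fetch(u8, ctxt);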
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 111 +++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0d151e232480..8d2c3d04cfec 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3359,8 +3359,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) .check_perm = (_p) } #define N D(0) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } +#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } @@ -3380,25 +3380,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static struct opcode group7_rm1[] = { - DI(SrcNone | ModRM | Priv, monitor), - DI(SrcNone | ModRM | Priv, mwait), + DI(SrcNone | Priv, monitor), + DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; static struct opcode group7_rm3[] = { - DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), - DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), + DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | Prot | Priv, stgi, check_svme), + DIP(SrcNone | Prot | Priv, clgi, check_svme), + DIP(SrcNone | Prot | Priv, skinit, check_svme), + DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; static struct opcode group7_rm7[] = { N, - DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; @@ -3414,76 +3414,77 @@ static struct opcode group1[] = { }; static struct opcode group1A[] = { - I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; static struct opcode group3[] = { - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcNone | ModRM | Lock, em_not), - I(DstMem | SrcNone | ModRM | Lock, em_neg), - I(SrcMem | ModRM, em_mul_ex), - I(SrcMem | ModRM, em_imul_ex), - I(SrcMem | ModRM, em_div_ex), - I(SrcMem | ModRM, em_idiv_ex), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcNone | Lock, em_not), + I(DstMem | SrcNone | Lock, em_neg), + I(SrcMem, em_mul_ex), + I(SrcMem, em_imul_ex), + I(SrcMem, em_div_ex), + I(SrcMem, em_idiv_ex), }; static struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), N, N, N, N, N, N, }; static struct opcode group5[] = { - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - 
I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), N, + I(DstMem | SrcNone | Lock, em_grp45), + I(DstMem | SrcNone | Lock, em_grp45), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps, em_grp45), + I(SrcMem | Stack, em_grp45), N, }; static struct opcode group6[] = { - DI(ModRM | Prot, sldt), - DI(ModRM | Prot, str), - DI(ModRM | Prot | Priv, lldt), - DI(ModRM | Prot | Priv, ltr), + DI(Prot, sldt), + DI(Prot, str), + DI(Prot | Priv, lldt), + DI(Prot | Priv, ltr), N, N, N, N, }; static struct group_dual group7 = { { - DI(ModRM | Mov | DstMem | Priv, sgdt), - DI(ModRM | Mov | DstMem | Priv, sidt), - II(ModRM | SrcMem | Priv, em_lgdt, lgdt), - II(ModRM | SrcMem | Priv, em_lidt, lidt), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), - II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), + DI(Mov | DstMem | Priv, sgdt), + DI(Mov | DstMem | Priv, sidt), + II(SrcMem | Priv, em_lgdt, lgdt), + II(SrcMem | Priv, em_lidt, lidt), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | VendorSpecific, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + EXT(0, group7_rm7), } }; static struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte | ModRM, em_bt), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | ModRM | Lock, em_btr), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), + I(DstMem | SrcImmByte, em_bt), + I(DstMem | SrcImmByte | Lock | PageTable, em_bts), + I(DstMem | SrcImmByte | Lock, em_btr), + I(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; static struct group_dual group9 = { { - N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, + N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), + I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; @@ -3541,10 +3542,10 @@ static struct opcode opcode_table[256] = { /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm, group1), + G(DstMem | SrcImm, group1), + G(ByteOp | DstMem | SrcImm | No64, group1), + G(DstMem | SrcImmByte, group1), I2bv(DstMem | SrcReg | ModRM, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ -- cgit v1.2.1 From 9f4260e73ac43aaa91eb5de95950e1de7002f467 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 30 Apr 2012 17:48:25 +0900 Subject: KVM: x86 emulator: Avoid pushing back ModRM byte fetched for group decoding Although ModRM byte is 
fetched for group decoding, it is soon pushed back to make decode_modrm() fetch it again later. Now that the ModRM flag can be found in the top-level opcode tables, fetch the ModRM byte before group decoding to make the code simpler. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8d2c3d04cfec..7fd25763b0e0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -972,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -3976,17 +3975,16 @@ done_prefixes: } ctxt->d = opcode.flags; + if (ctxt->d & ModRM) + ctxt->modrm = insn_fetch(u8, ctxt); + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; -- cgit v1.2.1 From ca1182387e57470460294ce1e39e2d5518809811 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 15:37:07 -0400 Subject: xen/setup: Only print "Freeing XXX-YYY pfn range: Z pages freed" if Z > 0 Otherwise we can get meaningless messages like: Freeing bad80-badf4 pfn range: 0 pages freed We can also do this for the summary ones - no point in printing "Set 0 page(s) to 1-1 mapping" Acked-by: David Vrabel [v1: Extended to the summary printks] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1ba8dff26753..7b0ab77b8479 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -114,8 +114,9 @@ static unsigned long __init xen_release_chunk(unsigned long start, len++; } } - printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", - start, end, len); + if (len) + printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", + start, end, len); return len; } @@ -162,8 +163,10 @@ static unsigned long __init xen_set_identity_and_release( } } - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + if (released) + printk(KERN_INFO "Released %lu pages of unused memory\n", released); + if (identity) + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); return released; } -- cgit v1.2.1 From 2e2fb75475c2fc74c98100f1468c8195fee49f3b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 6 Apr 2012 10:07:11 -0400 Subject: xen/setup: Populate freed MFNs from non-RAM E820 entries and gaps to E820 RAM When the Xen hypervisor boots a PV kernel it hands it two pieces of information: nr_pages and a made-up E820 entry. The nr_pages value defines the range from zero to nr_pages of PFNs which have a valid Machine Frame Number (MFN) underneath them.
The E820 mirrors that (with the VGA hole): BIOS-provided physical RAM map: Xen: 0000000000000000 - 00000000000a0000 (usable) Xen: 00000000000a0000 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000080800000 (usable) The fun comes when a PV guest is run with a machine E820 - that can either be the initial domain or a PCI PV guest - where the E820 looks like the normal thing: BIOS-provided physical RAM map: Xen: 0000000000000000 - 000000000009e000 (usable) Xen: 000000000009ec00 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000020000000 (usable) Xen: 0000000020000000 - 0000000020200000 (reserved) Xen: 0000000020200000 - 0000000040000000 (usable) Xen: 0000000040000000 - 0000000040200000 (reserved) Xen: 0000000040200000 - 00000000bad80000 (usable) Xen: 00000000bad80000 - 00000000badc9000 (ACPI NVS) .. With that, overlaying the nr_pages directly on the E820 does not work, as there are gaps and non-RAM regions that won't be used by the memory allocator. The 'xen_release_chunk' helps with that by punching holes in the P2M (PFN to MFN lookup tree) for those regions and tells us that: Freeing 20000-20200 pfn range: 512 pages freed Freeing 40000-40200 pfn range: 512 pages freed Freeing bad80-badf4 pfn range: 116 pages freed Freeing badf6-bae7f pfn range: 137 pages freed Freeing bb000-100000 pfn range: 282624 pages freed Released 283999 pages of unused memory Those 283999 pages are subtracted from the nr_pages and are returned to the hypervisor. The end result is that the initial domain boots with 1GB less memory, as nr_pages has been reduced by the number of pages residing within the PCI hole. It can balloon up to that if desired using 'xl mem-set 0 8092', but the balloon driver is not always compiled in for the initial domain. This patch implements the populate hypercall (XENMEM_populate_physmap), which grows the domain by the same number of pages that were released. The other solution (that did not work) was to transplant the MFNs in the P2M tree - the ones that were going to be freed were put in the E820_RAM regions past the nr_pages. But the modifications to the M2P array (the other side of creating PTEs) were not carried over. The hypervisor is the only one capable of modifying that, and the only two hypercalls that would do this are update_va_mapping (which won't work, as during initial bootup only PFNs up to nr_pages are mapped in the guest) and the populate hypercall. The end result is that the kernel can now boot with the nr_pages without having to subtract the 283999 pages. On an 8GB machine, with various dom0_mem= parameters this is what we get: no dom0_mem -Memory: 6485264k/9435136k available (5817k kernel code, 1136060k absent, 1813812k reserved, 2899k data, 696k init) +Memory: 7619036k/9435136k available (5817k kernel code, 1136060k absent, 680040k reserved, 2899k data, 696k init) dom0_mem=3G -Memory: 2616536k/9435136k available (5817k kernel code, 1136060k absent, 5682540k reserved, 2899k data, 696k init) +Memory: 2703776k/9435136k available (5817k kernel code, 1136060k absent, 5595300k reserved, 2899k data, 696k init) dom0_mem=max:3G -Memory: 2696732k/4281724k available (5817k kernel code, 1136060k absent, 448932k reserved, 2899k data, 696k init) +Memory: 2702204k/4281724k available (5817k kernel code, 1136060k absent, 443460k reserved, 2899k data, 696k init) And the 'xm list' or 'xl list' now reflect what the dom0_mem= argument is.
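Per PFN, the populate path boils down to roughly the following (a sketch condensed from the patch below; the rollback via XENMEM_decrease_reservation and the accounting are elided):

	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};
	unsigned long frame = pfn;	/* request a backing MFN for this PFN */

	set_xen_guest_handle(reservation.extent_start, &frame);
	reservation.nr_extents = 1;
	ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
	if (ret == 1)			/* one extent granted */
		early_set_phys_to_machine(pfn, frame);	/* record it in the P2M */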
Acked-by: David Vrabel [v2: Use populate hypercall] [v3: Remove debug printks] [v4: Simplify code] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7b0ab77b8479..710af36e6dfb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -26,7 +26,6 @@ #include #include #include - #include "xen-ops.h" #include "vdso.h" @@ -120,7 +119,105 @@ static unsigned long __init xen_release_chunk(unsigned long start, return len; } +static unsigned long __init xen_populate_physmap(unsigned long start, + unsigned long end) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long len = 0; + int ret; + + for (pfn = start; pfn < end; pfn++) { + unsigned long frame; + + /* Make sure pfn does not exists to start with */ + if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY) + continue; + frame = pfn; + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret); + if (ret == 1) { + if (!early_set_phys_to_machine(pfn, frame)) { + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + break; + } + len++; + } else + break; + } + if (len) + printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n", + start, end, len); + return len; +} +static unsigned long __init xen_populate_chunk( + const struct e820entry *list, size_t map_size, + unsigned long max_pfn, unsigned long *last_pfn, + unsigned long credits_left) +{ + const struct e820entry *entry; + unsigned int i; + unsigned long done = 0; + unsigned long dest_pfn; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + unsigned long credits = credits_left; + unsigned long s_pfn; + unsigned long e_pfn; + unsigned long pfns; + long capacity; + + if (credits <= 0) + break; + + if (entry->type != E820_RAM) + continue; + + e_pfn = PFN_UP(entry->addr + entry->size); + + /* We only care about E820 after the xen_start_info->nr_pages */ + if (e_pfn <= max_pfn) + continue; + + s_pfn = PFN_DOWN(entry->addr); + /* If the E820 falls within the nr_pages, we want to start + * at the nr_pages PFN. + * If that would mean going past the E820 entry, skip it + */ + if (s_pfn <= max_pfn) { + capacity = e_pfn - max_pfn; + dest_pfn = max_pfn; + } else { + /* last_pfn MUST be within E820_RAM regions */ + if (*last_pfn && e_pfn >= *last_pfn) + s_pfn = *last_pfn; + capacity = e_pfn - s_pfn; + dest_pfn = s_pfn; + } + /* If we had filled this E820_RAM entry, go to the next one. 
*/ + if (capacity <= 0) + continue; + + if (credits > capacity) + credits = capacity; + + pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits); + done += pfns; + credits_left -= pfns; + *last_pfn = (dest_pfn + pfns); + } + return done; +} static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { @@ -143,7 +240,6 @@ static unsigned long __init xen_set_identity_and_release( */ for (i = 0, entry = list; i < map_size; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -220,7 +316,9 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long max_pages; + unsigned long last_pfn = 0; unsigned long extra_pages = 0; + unsigned long populated; int i; int op; @@ -260,8 +358,19 @@ char * __init xen_memory_setup(void) */ xen_released_pages = xen_set_identity_and_release( map, memmap.nr_entries, max_pfn); - extra_pages += xen_released_pages; + /* + * Populate back the non-RAM pages and E820 gaps that had been + * released. */ + populated = xen_populate_chunk(map, memmap.nr_entries, + max_pfn, &last_pfn, xen_released_pages); + + extra_pages += (xen_released_pages - populated); + + if (last_pfn > max_pfn) { + max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); + mem_end = PFN_PHYS(max_pfn); + } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base */ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); - i = 0; while (i < memmap.nr_entries) { u64 addr = map[i].addr; -- cgit v1.2.1 From 96dc08b35c4af8cb5810450602590706f2593a5f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 6 Apr 2012 16:10:20 -0400 Subject: xen/setup: Combine the two hypercall functions - since they are quite similar. They use the same set of arguments, so it is just a matter of using the proper hypercall.
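The release and populate loops differ only in the frame they pass and in which hypercall they issue, so a single helper with a release flag covers both; the core of the combined xen_do_chunk() in the diff below reduces to roughly:

	frame = release ? mfn : pfn;
	set_xen_guest_handle(reservation.extent_start, &frame);
	reservation.nr_extents = 1;

	ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation
					   : XENMEM_populate_physmap,
				   &reservation);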
Acked-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 81 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 710af36e6dfb..30ac05a8d28f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) +static unsigned long __init xen_do_chunk(unsigned long start, + unsigned long end, bool release) { struct xen_memory_reservation reservation = { .address_bits = 0, @@ -95,60 +95,36 @@ static unsigned long __init xen_release_chunk(unsigned long start, unsigned long pfn; int ret; - for(pfn = start; pfn < end; pfn++) { - unsigned long mfn = pfn_to_mfn(pfn); - - /* Make sure pfn exists to start with */ - if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) - continue; - - set_xen_guest_handle(reservation.extent_start, &mfn); - reservation.nr_extents = 1; - - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); - if (ret == 1) { - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - len++; - } - } - if (len) - printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", - start, end, len); - - return len; -} -static unsigned long __init xen_populate_physmap(unsigned long start, - unsigned long end) -{ - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - unsigned long len = 0; - int ret; - for (pfn = start; pfn < end; pfn++) { unsigned long frame; + unsigned long mfn = pfn_to_mfn(pfn); - /* Make sure pfn does not exists to start with */ - if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY) - continue; - - frame = pfn; + if (release) { + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + frame = mfn; + } else { + if (mfn != INVALID_P2M_ENTRY) + continue; + frame = pfn; + } set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; - ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret); + ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, + &reservation); + WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", + release ? "release" : "populate", pfn, ret); + if (ret == 1) { - if (!early_set_phys_to_machine(pfn, frame)) { + if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { + if (release) + break; set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); + &reservation); break; } len++; @@ -156,8 +132,11 @@ static unsigned long __init xen_populate_physmap(unsigned long start, break; } if (len) - printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n", - start, end, len); + printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", + release ? "Freeing" : "Populating", + start, end, len, + release ? 
"freed" : "added"); + return len; } static unsigned long __init xen_populate_chunk( @@ -211,7 +190,7 @@ static unsigned long __init xen_populate_chunk( if (credits > capacity) credits = capacity; - pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits); + pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); done += pfns; credits_left -= pfns; *last_pfn = (dest_pfn + pfns); @@ -249,8 +228,8 @@ static unsigned long __init xen_set_identity_and_release( if (start_pfn < end_pfn) { if (start_pfn < nr_pages) - released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + released += xen_do_chunk( + start_pfn, min(end_pfn, nr_pages), true); identity += set_phys_range_identity( start_pfn, end_pfn); -- cgit v1.2.1 From 83d51ab473dddde7df858015070ed22b84ebe9a9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 3 May 2012 16:15:42 +0100 Subject: xen/setup: update VA mapping when releasing memory during setup In xen_memory_setup(), if a page that is being released has a VA mapping this must also be updated. Otherwise, the page will be not released completely -- it will still be referenced in Xen and won't be freed util the mapping is removed and this prevents it from being reallocated at a different PFN. This was already being done for the ISA memory region in xen_ident_map_ISA() but on many systems this was omitting a few pages as many systems marked a few pages below the ISA memory region as reserved in the e820 map. This fixes errors such as: (XEN) page_alloc.c:1148:d0 Over-allocation for domain 0: 2097153 > 2097152 (XEN) memory.c:133:d0 Could not allocate order=0 extent: id=0 memflags=0 (0 of 17) Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 - arch/x86/xen/mmu.c | 23 ----------------------- arch/x86/xen/setup.c | 41 ++++++++++++++++++++++++++++++++++------- arch/x86/xen/xen-ops.h | 1 - 4 files changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fe06bf4ef0e3..ac90e5629508 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1308,7 +1308,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); - xen_ident_map_ISA(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 91dc2871e336..c9a351925a0c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1929,29 +1929,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -void __init xen_ident_map_ISA(void) -{ - unsigned long pa; - - /* - * If we're dom0, then linear map the ISA machine addresses into - * the kernel's address space. 
- */ - if (!xen_initial_domain()) - return; - - xen_raw_printk("Xen: setup ISA identity maps\n"); - - for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { - pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); - - if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) - BUG(); - } - - xen_flush_tlb(); -} - static void __init xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 30ac05a8d28f..3ebba0753d38 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -139,6 +139,13 @@ static unsigned long __init xen_do_chunk(unsigned long start, return len; } + +static unsigned long __init xen_release_chunk(unsigned long start, + unsigned long end) +{ + return xen_do_chunk(start, end, true); +} + static unsigned long __init xen_populate_chunk( const struct e820entry *list, size_t map_size, unsigned long max_pfn, unsigned long *last_pfn, @@ -197,6 +204,29 @@ static unsigned long __init xen_populate_chunk( } return done; } + +static void __init xen_set_identity_and_release_chunk( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long *released, unsigned long *identity) +{ + unsigned long pfn; + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + if (start_pfn < nr_pages) + *released += xen_release_chunk( + start_pfn, min(end_pfn, nr_pages)); + + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { @@ -226,14 +256,11 @@ static unsigned long __init xen_set_identity_and_release( if (entry->type == E820_RAM) end_pfn = PFN_UP(entry->addr); - if (start_pfn < end_pfn) { - if (start_pfn < nr_pages) - released += xen_do_chunk( - start_pfn, min(end_pfn, nr_pages), true); + if (start_pfn < end_pfn) + xen_set_identity_and_release_chunk( + start_pfn, end_pfn, nr_pages, + &released, &identity); - identity += set_phys_range_identity( - start_pfn, end_pfn); - } start = end; } } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b095739ccd4c..506fa08d934a 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -28,7 +28,6 @@ void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); -void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; -- cgit v1.2.1 From f447d56d36af18c5104ff29dcb1327c0c0ac3634 Mon Sep 17 00:00:00 2001 From: Ben Guthro Date: Sat, 21 Apr 2012 00:11:04 +0800 Subject: xen: implement apic ipi interface Map native ipi vector to xen vector. Implement apic ipi interface with xen_send_IPI_one. 
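As a quick illustration of what this patch does, below is a minimal, self-contained userspace sketch of the vector translation, not the kernel implementation itself: the native vector values are invented placeholders (the real ones come from asm/irq_vectors.h), the Xen enum only mirrors the enum ipi_vector ordering used in this series, and xen_map_vector_model() stands in for the patch's xen_map_vector().

#include <stdio.h>

/* Placeholder values for the native APIC IPI vectors -- illustrative only. */
enum {
	RESCHEDULE_VECTOR = 0xfd,
	CALL_FUNCTION_VECTOR = 0xfb,
	CALL_FUNCTION_SINGLE_VECTOR = 0xfa,
};

/* Mirrors the enum ipi_vector ordering used by the Xen code in this series. */
enum {
	XEN_RESCHEDULE_VECTOR,
	XEN_CALL_FUNCTION_VECTOR,
	XEN_CALL_FUNCTION_SINGLE_VECTOR,
};

/* Stand-in for xen_map_vector(): translate a native IPI vector to a Xen
 * event-channel IPI, or return -1 for a vector with no Xen equivalent. */
static int xen_map_vector_model(int vector)
{
	switch (vector) {
	case RESCHEDULE_VECTOR:
		return XEN_RESCHEDULE_VECTOR;
	case CALL_FUNCTION_VECTOR:
		return XEN_CALL_FUNCTION_VECTOR;
	case CALL_FUNCTION_SINGLE_VECTOR:
		return XEN_CALL_FUNCTION_SINGLE_VECTOR;
	default:
		return -1;
	}
}

int main(void)
{
	/* In the patch, the apic->send_IPI_* hooks translate first and then
	 * deliver the Xen vector with xen_send_IPI_one() per target CPU. */
	printf("native 0x%x -> xen %d\n",
	       CALL_FUNCTION_VECTOR, xen_map_vector_model(CALL_FUNCTION_VECTOR));
	printf("native 0x%x -> xen %d\n",
	       0x99, xen_map_vector_model(0x99));
	return 0;
}

The design choice visible in the patch below is that an unmapped native vector translates to -1 and is dropped with a KERN_ERR message instead of being sent, so vectors with no event-channel equivalent never reach xen_send_IPI_one().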
Tested-by: Steven Noonan Signed-off-by: Ben Guthro Signed-off-by: Lin Ming Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 9 ++++++ arch/x86/xen/smp.c | 81 +++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/xen/smp.h | 12 +++++++ 3 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 arch/x86/xen/smp.h (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4f51bebac02c..1ed61c2bf633 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -74,6 +74,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "smp.h" #include "multicalls.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -849,6 +850,14 @@ static void set_xen_basic_apic_ops(void) apic->icr_write = xen_apic_icr_write; apic->wait_icr_idle = xen_apic_wait_icr_idle; apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; + +#ifdef CONFIG_SMP + apic->send_IPI_allbutself = xen_send_IPI_allbutself; + apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; + apic->send_IPI_mask = xen_send_IPI_mask; + apic->send_IPI_all = xen_send_IPI_all; + apic->send_IPI_self = xen_send_IPI_self; +#endif } #endif diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5fac6919b957..2dc6628c1520 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -465,8 +465,8 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(const struct cpumask *mask, - enum ipi_vector vector) +static void __xen_send_IPI_mask(const struct cpumask *mask, + int vector) { unsigned cpu; @@ -478,7 +478,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. 
*/ for_each_cpu(cpu, mask) { @@ -491,10 +491,83 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of(cpu), + __xen_send_IPI_mask(cpumask_of(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } +static inline int xen_map_vector(int vector) +{ + int xen_vector; + + switch (vector) { + case RESCHEDULE_VECTOR: + xen_vector = XEN_RESCHEDULE_VECTOR; + break; + case CALL_FUNCTION_VECTOR: + xen_vector = XEN_CALL_FUNCTION_VECTOR; + break; + case CALL_FUNCTION_SINGLE_VECTOR: + xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; + break; + default: + xen_vector = -1; + printk(KERN_ERR "xen: vector 0x%x is not implemented\n", + vector); + } + + return xen_vector; +} + +void xen_send_IPI_mask(const struct cpumask *mask, + int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(mask, xen_vector); +} + +void xen_send_IPI_all(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(cpu_online_mask, xen_vector); +} + +void xen_send_IPI_self(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_one(smp_processor_id(), xen_vector); +} + +void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned cpu; + unsigned int this_cpu = smp_processor_id(); + + if (!(num_online_cpus() > 1)) + return; + + for_each_cpu_and(cpu, mask, cpu_online_mask) { + if (this_cpu == cpu) + continue; + + xen_smp_send_call_function_single_ipi(cpu); + } +} + +void xen_send_IPI_allbutself(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h new file mode 100644 index 000000000000..8981a76d081a --- /dev/null +++ b/arch/x86/xen/smp.h @@ -0,0 +1,12 @@ +#ifndef _XEN_SMP_H + +extern void xen_send_IPI_mask(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void physflat_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +#endif -- cgit v1.2.1 From 1ff2b0c303698e486f1e0886b4d9876200ef8ca5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 21 Apr 2012 00:11:05 +0800 Subject: xen: implement IRQ_WORK_VECTOR handler Signed-off-by: Lin Ming Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/events.h | 1 + arch/x86/xen/smp.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 1df35417c412..cc146d51449e 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -6,6 +6,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_VECTOR, XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, + XEN_IRQ_WORK_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 2dc6628c1520..3ec3f8eb19fc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, xen_resched_irq); static DEFINE_PER_CPU(int, xen_callfunc_irq); static 
DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_irq_work); static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* * Reschedule call back. @@ -143,6 +146,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(xen_callfuncsingle_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_irq_work, cpu) = rc; + return 0; fail: @@ -155,6 +169,8 @@ static int xen_smp_intr_init(unsigned int cpu) if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_irq_work, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); return rc; } @@ -509,6 +525,9 @@ static inline int xen_map_vector(int vector) case CALL_FUNCTION_SINGLE_VECTOR: xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; break; + case IRQ_WORK_VECTOR: + xen_vector = XEN_IRQ_WORK_VECTOR; + break; default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -588,6 +607,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); + + return IRQ_HANDLED; +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, @@ -634,6 +663,7 @@ static void xen_hvm_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); native_cpu_die(cpu); } -- cgit v1.2.1 From 211063dc159695bd6072c5393e9bc729481c6ede Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Dec 2011 17:32:23 +0800 Subject: xen/acpi/sleep: Enable ACPI sleep via the __acpi_os_prepare_sleep Provide the registration callback to call into Xen's ACPI sleep functionality. This means that during S3/S5 we make a XENPF_enter_acpi_sleep hypercall with the proper PM1A/PM1B registers. Based on Ke Yu's initial idea. [ From http://xenbits.xensource.com/linux-2.6.18-xen.hg change c68699484a65 ] [v1: Added Copyright and license] [v2: Added a check for whether the bits above the low 16 bits of PM1A/B contain anything. The spec only uses 16 bits but might use more in the future] Signed-off-by: Liang Tang Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1ed61c2bf633..eca90e5be1e7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1373,6 +1374,8 @@ asmlinkage void __init xen_start_kernel(void) /* Make sure ACS will be enabled */ pci_request_acs(); + + xen_acpi_sleep_register(); } -- cgit v1.2.1 From 433de739bbc22a5b2c87602116566ce27e3b4cab Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 8 May 2012 21:22:24 +0300 Subject: x86, realmode: 16-bit real-mode code support for relocs tool A new option is added to the relocs tool called '--realmode'. This option causes the generation of 16-bit segment relocations and 32-bit linear relocations for the real-mode code. When the real-mode code is moved to the low-memory during kernel initialization, these relocation entries can be used to relocate the code properly. In the assembly code 16-bit segment relocations must be relative to the 'real_mode_seg' absolute symbol. Linear relocations must be relative to a symbol prefixed with 'pa_'. 16-bit segment relocation is used to load cs:ip in 16-bit code. Linear relocations are used in the 32-bit code for relocatable data references. They are declared in the linker script of the real-mode code. The relocs tool is moved to scripts/x86-relocs.c so it will be compiled before building the arch/x86 tree. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-2-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: Jarkko Sakkinen Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 11 +- arch/x86/boot/compressed/relocs.c | 678 -------------------------------------- 2 files changed, 5 insertions(+), 684 deletions(-) delete mode 100644 arch/x86/boot/compressed/relocs.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fd55a2ff3ad8..0435e8a2d20e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -40,13 +40,12 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +targets += vmlinux.bin.all vmlinux.relocs -targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs - -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE +CMD_RELOCS = scripts/x86-relocs +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c deleted file mode 100644 index fb7117a4ade1..000000000000 --- a/arch/x86/boot/compressed/relocs.c +++ /dev/null @@ -1,678 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define USE_BSD -#include -#include -#include - -static void die(char *fmt, ...); - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -static Elf32_Ehdr ehdr; -static unsigned long reloc_count, reloc_idx; -static unsigned long *relocs; - -struct section { - Elf32_Shdr shdr; - struct section *link; - Elf32_Sym *symtab; - Elf32_Rel *reltab; - char *strtab; -}; -static struct section *secs; - -/* - * Following symbols have been audited. There values are constant and do - * not change if bzImage is loaded at a different physical address than - * the address for which it has been compiled. Don't warn user about - * absolute relocations present w.r.t these symbols. 
- */ -static const char abs_sym_regex[] = - "^(xen_irq_disable_direct_reloc$|" - "xen_save_fl_direct_reloc$|" - "VDSO|" - "__crc_)"; -static regex_t abs_sym_regex_c; -static int is_abs_reloc(const char *sym_name) -{ - return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); -} - -/* - * These symbols are known to be relative, even if the linker marks them - * as absolute (typically defined outside any section in the linker script.) - */ -static const char rel_sym_regex[] = - "^_end$"; -static regex_t rel_sym_regex_c; -static int is_rel_reloc(const char *sym_name) -{ - return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); -} - -static void regex_init(void) -{ - char errbuf[128]; - int err; - - err = regcomp(&abs_sym_regex_c, abs_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } - - err = regcomp(&rel_sym_regex_c, rel_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } -} - -static void die(char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - exit(1); -} - -static const char *sym_type(unsigned type) -{ - static const char *type_name[] = { -#define SYM_TYPE(X) [X] = #X - SYM_TYPE(STT_NOTYPE), - SYM_TYPE(STT_OBJECT), - SYM_TYPE(STT_FUNC), - SYM_TYPE(STT_SECTION), - SYM_TYPE(STT_FILE), - SYM_TYPE(STT_COMMON), - SYM_TYPE(STT_TLS), -#undef SYM_TYPE - }; - const char *name = "unknown sym type name"; - if (type < ARRAY_SIZE(type_name)) { - name = type_name[type]; - } - return name; -} - -static const char *sym_bind(unsigned bind) -{ - static const char *bind_name[] = { -#define SYM_BIND(X) [X] = #X - SYM_BIND(STB_LOCAL), - SYM_BIND(STB_GLOBAL), - SYM_BIND(STB_WEAK), -#undef SYM_BIND - }; - const char *name = "unknown sym bind name"; - if (bind < ARRAY_SIZE(bind_name)) { - name = bind_name[bind]; - } - return name; -} - -static const char *sym_visibility(unsigned visibility) -{ - static const char *visibility_name[] = { -#define SYM_VISIBILITY(X) [X] = #X - SYM_VISIBILITY(STV_DEFAULT), - SYM_VISIBILITY(STV_INTERNAL), - SYM_VISIBILITY(STV_HIDDEN), - SYM_VISIBILITY(STV_PROTECTED), -#undef SYM_VISIBILITY - }; - const char *name = "unknown sym visibility name"; - if (visibility < ARRAY_SIZE(visibility_name)) { - name = visibility_name[visibility]; - } - return name; -} - -static const char *rel_type(unsigned type) -{ - static const char *type_name[] = { -#define REL_TYPE(X) [X] = #X - REL_TYPE(R_386_NONE), - REL_TYPE(R_386_32), - REL_TYPE(R_386_PC32), - REL_TYPE(R_386_GOT32), - REL_TYPE(R_386_PLT32), - REL_TYPE(R_386_COPY), - REL_TYPE(R_386_GLOB_DAT), - REL_TYPE(R_386_JMP_SLOT), - REL_TYPE(R_386_RELATIVE), - REL_TYPE(R_386_GOTOFF), - REL_TYPE(R_386_GOTPC), -#undef REL_TYPE - }; - const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name) && type_name[type]) { - name = type_name[type]; - } - return name; -} - -static const char *sec_name(unsigned shndx) -{ - const char *sec_strtab; - const char *name; - sec_strtab = secs[ehdr.e_shstrndx].strtab; - name = ""; - if (shndx < ehdr.e_shnum) { - name = sec_strtab + secs[shndx].shdr.sh_name; - } - else if (shndx == SHN_ABS) { - name = "ABSOLUTE"; - } - else if (shndx == SHN_COMMON) { - name = "COMMON"; - } - return name; -} - -static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) -{ - const char *name; - name = ""; - if (sym->st_name) { - name = sym_strtab + sym->st_name; - } - else { - name = 
sec_name(secs[sym->st_shndx].shdr.sh_name); - } - return name; -} - - - -#if BYTE_ORDER == LITTLE_ENDIAN -#define le16_to_cpu(val) (val) -#define le32_to_cpu(val) (val) -#endif -#if BYTE_ORDER == BIG_ENDIAN -#define le16_to_cpu(val) bswap_16(val) -#define le32_to_cpu(val) bswap_32(val) -#endif - -static uint16_t elf16_to_cpu(uint16_t val) -{ - return le16_to_cpu(val); -} - -static uint32_t elf32_to_cpu(uint32_t val) -{ - return le32_to_cpu(val); -} - -static void read_ehdr(FILE *fp) -{ - if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { - die("Cannot read ELF header: %s\n", - strerror(errno)); - } - if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { - die("No ELF magic\n"); - } - if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { - die("Not a 32 bit executable\n"); - } - if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - die("Not a LSB ELF executable\n"); - } - if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { - die("Unknown ELF version\n"); - } - /* Convert the fields to native endian */ - ehdr.e_type = elf16_to_cpu(ehdr.e_type); - ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); - ehdr.e_version = elf32_to_cpu(ehdr.e_version); - ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); - ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); - ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); - ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); - ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); - ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); - ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); - ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); - ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); - ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); - - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { - die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != EM_386) { - die("Not for x86\n"); - } - if (ehdr.e_version != EV_CURRENT) { - die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { - die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { - die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { - die("Bad section header entry\n"); - } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { - die("String table index out of bounds\n"); - } -} - -static void read_shdrs(FILE *fp) -{ - int i; - Elf32_Shdr shdr; - - secs = calloc(ehdr.e_shnum, sizeof(struct section)); - if (!secs) { - die("Unable to allocate %d section headers\n", - ehdr.e_shnum); - } - if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - ehdr.e_shoff, strerror(errno)); - } - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (fread(&shdr, sizeof shdr, 1, fp) != 1) - die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); - sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); - sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); - sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); - sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); - sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); - sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); - sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); - sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); - sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); - sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) - sec->link = &secs[sec->shdr.sh_link]; - } - -} - -static void read_strtabs(FILE *fp) -{ - int i; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { - continue; - } - sec->strtab = 
malloc(sec->shdr.sh_size); - if (!sec->strtab) { - die("malloc of %d bytes for strtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - } -} - -static void read_symtabs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %d bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym = &sec->symtab[j]; - sym->st_name = elf32_to_cpu(sym->st_name); - sym->st_value = elf32_to_cpu(sym->st_value); - sym->st_size = elf32_to_cpu(sym->st_size); - sym->st_shndx = elf16_to_cpu(sym->st_shndx); - } - } -} - - -static void read_relocs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec->reltab = malloc(sec->shdr.sh_size); - if (!sec->reltab) { - die("malloc of %d bytes for relocs failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel = &sec->reltab[j]; - rel->r_offset = elf32_to_cpu(rel->r_offset); - rel->r_info = elf32_to_cpu(rel->r_info); - } - } -} - - -static void print_absolute_symbols(void) -{ - int i; - printf("Absolute symbols\n"); - printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - char *sym_strtab; - int j; - - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sym_strtab = sec->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym; - const char *name; - sym = &sec->symtab[j]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - printf("%5d %08x %5d %10s %10s %12s %s\n", - j, sym->st_value, sym->st_size, - sym_type(ELF32_ST_TYPE(sym->st_info)), - sym_bind(ELF32_ST_BIND(sym->st_info)), - sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), - name); - } - } - printf("\n"); -} - -static void print_absolute_relocs(void) -{ - int i, printed = 0; - - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - struct section *sec_applies, *sec_symtab; - char *sym_strtab; - Elf32_Sym *sh_symtab; - int j; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - const char *name; - rel = &sec->reltab[j]; - sym = 
&sh_symtab[ELF32_R_SYM(rel->r_info)]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - - /* Absolute symbols are not relocated if bzImage is - * loaded at a non-compiled address. Display a warning - * to user at compile time about the absolute - * relocations present. - * - * User need to audit the code to make sure - * some symbols which should have been section - * relative have not become absolute because of some - * linker optimization or wrong programming usage. - * - * Before warning check if this absolute symbol - * relocation is harmless. - */ - if (is_abs_reloc(name) || is_rel_reloc(name)) - continue; - - if (!printed) { - printf("WARNING: Absolute relocations" - " present\n"); - printf("Offset Info Type Sym.Value " - "Sym.Name\n"); - printed = 1; - } - - printf("%08x %08x %10s %08x %s\n", - rel->r_offset, - rel->r_info, - rel_type(ELF32_R_TYPE(rel->r_info)), - sym->st_value, - name); - } - } - - if (printed) - printf("\n"); -} - -static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) -{ - int i; - /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { - char *sym_strtab; - Elf32_Sym *sh_symtab; - struct section *sec_applies, *sec_symtab; - int j; - struct section *sec = &secs[i]; - - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - unsigned r_type; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - r_type = ELF32_R_TYPE(rel->r_info); - /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS && - !is_rel_reloc(sym_name(sym_strtab, sym))) { - continue; - } - switch (r_type) { - case R_386_NONE: - case R_386_PC32: - /* - * NONE can be ignored and and PC relative - * relocations don't need to be adjusted. - */ - break; - case R_386_32: - /* Visit relocations that need to be adjusted */ - visit(rel, sym); - break; - default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); - break; - } - } - } -} - -static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - reloc_count += 1; -} - -static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - /* Remember the address that needs to be adjusted. */ - relocs[reloc_idx++] = rel->r_offset; -} - -static int cmp_relocs(const void *va, const void *vb) -{ - const unsigned long *a, *b; - a = va; b = vb; - return (*a == *b)? 0 : (*a > *b)? 1 : -1; -} - -static void emit_relocs(int as_text) -{ - int i; - /* Count how many relocations I have and allocate space for them. */ - reloc_count = 0; - walk_relocs(count_reloc); - relocs = malloc(reloc_count * sizeof(relocs[0])); - if (!relocs) { - die("malloc of %d entries for relocs failed\n", - reloc_count); - } - /* Collect up the relocations */ - reloc_idx = 0; - walk_relocs(collect_reloc); - - /* Order the relocations for more efficient processing */ - qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); - - /* Print the relocations */ - if (as_text) { - /* Print the relocations in a form suitable that - * gas will like. 
- */ - printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - for (i = 0; i < reloc_count; i++) { - printf("\t .long 0x%08lx\n", relocs[i]); - } - printf("\n"); - } - else { - unsigned char buf[4]; - /* Print a stop */ - fwrite("\0\0\0\0", 4, 1, stdout); - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) { - put_unaligned_le32(relocs[i], buf); - fwrite(buf, 4, 1, stdout); - } - } -} - -static void usage(void) -{ - die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n"); -} - -int main(int argc, char **argv) -{ - int show_absolute_syms, show_absolute_relocs; - int as_text; - const char *fname; - FILE *fp; - int i; - - regex_init(); - - show_absolute_syms = 0; - show_absolute_relocs = 0; - as_text = 0; - fname = NULL; - for (i = 1; i < argc; i++) { - char *arg = argv[i]; - if (*arg == '-') { - if (strcmp(argv[1], "--abs-syms") == 0) { - show_absolute_syms = 1; - continue; - } - - if (strcmp(argv[1], "--abs-relocs") == 0) { - show_absolute_relocs = 1; - continue; - } - else if (strcmp(argv[1], "--text") == 0) { - as_text = 1; - continue; - } - } - else if (!fname) { - fname = arg; - continue; - } - usage(); - } - if (!fname) { - usage(); - } - fp = fopen(fname, "r"); - if (!fp) { - die("Cannot open %s: %s\n", - fname, strerror(errno)); - } - read_ehdr(fp); - read_shdrs(fp); - read_strtabs(fp); - read_symtabs(fp); - read_relocs(fp); - if (show_absolute_syms) { - print_absolute_symbols(); - return 0; - } - if (show_absolute_relocs) { - print_absolute_relocs(); - return 0; - } - emit_relocs(as_text); - return 0; -} -- cgit v1.2.1 From b3266bd6ff52efb9e57c7fbfff4c8f7363dfaab3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:25 +0300 Subject: x86, realmode: realmode.bin infrastructure Create realmode.bin and realmode.relocs files. Piggy pack them into relocatable object that will be included into .init.data section of the main kernel image. The first file includes binary image of the real-mode code. The latter file includes all relocations. The layout of the binary image is specified in realmode.lds.S. The makefile generates pa_ prefixed symbols for each exported global. These are used in 32-bit code and in realmode header to define symbols that need to be relocated. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-3-git-send-email-jarkko.sakkinen@intel.com Originally-by: H. Peter Anvin Signed-off-by: H. 
Peter Anvin --- arch/x86/Kbuild | 2 +- arch/x86/realmode/Makefile | 20 +++++++++++ arch/x86/realmode/rm/.gitignore | 3 ++ arch/x86/realmode/rm/Makefile | 63 ++++++++++++++++++++++++++++++++++ arch/x86/realmode/rm/header.S | 16 +++++++++ arch/x86/realmode/rm/realmode.lds.S | 68 +++++++++++++++++++++++++++++++++++++ arch/x86/realmode/rmpiggy.S | 18 ++++++++++ 7 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 arch/x86/realmode/Makefile create mode 100644 arch/x86/realmode/rm/.gitignore create mode 100644 arch/x86/realmode/rm/Makefile create mode 100644 arch/x86/realmode/rm/header.S create mode 100644 arch/x86/realmode/rm/realmode.lds.S create mode 100644 arch/x86/realmode/rmpiggy.S (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e9dec6cadd1..e5287d8517aa 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,4 +1,3 @@ - obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support @@ -7,6 +6,7 @@ obj-$(CONFIG_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ +obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile new file mode 100644 index 000000000000..f22a4f8d99d6 --- /dev/null +++ b/arch/x86/realmode/Makefile @@ -0,0 +1,20 @@ +# +# arch/x86/realmode/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# + +subdir- := rm + +obj-y += rmpiggy.o + +$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin + +$(obj)/rm/realmode.bin: FORCE + $(Q)$(MAKE) $(build)=$(obj)/rm $@ + +$(obj)/rm/realmode.relocs: FORCE + $(Q)$(MAKE) $(build)=$(obj)/rm $@ diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore new file mode 100644 index 000000000000..b6ed3a2555cb --- /dev/null +++ b/arch/x86/realmode/rm/.gitignore @@ -0,0 +1,3 @@ +pasyms.h +realmode.lds +realmode.relocs diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile new file mode 100644 index 000000000000..7c3f202cbccf --- /dev/null +++ b/arch/x86/realmode/rm/Makefile @@ -0,0 +1,63 @@ +# +# arch/x86/realmode/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# + +subdir- := wakeup + +always := realmode.bin + +realmode-y += header.o + +targets += $(realmode-y) + +REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) + +sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' + +quiet_cmd_pasyms = PASYMS $@ + cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \ + sed $(sed-pasyms) | sort | uniq > $@ + +$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE + $(call if_changed,pasyms) + +$(obj)/realmode.lds: $(obj)/pasyms.h + +LDFLAGS_realmode.elf := --emit-relocs -T +CPPFLAGS_realmode.lds += -P -C -I$(obj) + +$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE + $(call if_changed,ld) + +OBJCOPYFLAGS_realmode.bin := -O binary + +$(obj)/realmode.bin: $(obj)/realmode.elf + $(call if_changed,objcopy) + +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = scripts/x86-relocs --realmode $< > $@ +$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE + $(call if_changed,relocs) + +# --------------------------------------------------------------------------- + +# How to compile the 16-bit code. Note we always compile for -march=i386, +# that way we can complain to the user if the CPU is insufficient. 
+KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \ + -DDISABLE_BRANCH_PROFILING \ + -Wall -Wstrict-prototypes \ + -march=i386 -mregparm=3 \ + -include $(srctree)/$(src)/../../boot/code16gcc.h \ + -fno-strict-aliasing -fomit-frame-pointer \ + $(call cc-option, -ffreestanding) \ + $(call cc-option, -fno-toplevel-reorder,\ + $(call cc-option, -fno-unit-at-a-time)) \ + $(call cc-option, -fno-stack-protector) \ + $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S new file mode 100644 index 000000000000..7be17f2c65a3 --- /dev/null +++ b/arch/x86/realmode/rm/header.S @@ -0,0 +1,16 @@ +/* + * Real-mode blob header; this should match realmode.h and be + * readonly; for mutable data instead add pointers into the .data + * or .bss sections as appropriate. + */ + +#include +#include + + .section ".header", "a" + +ENTRY(real_mode_header) + .long pa_text_start + .long pa_ro_end + .long pa_end +END(real_mode_header) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S new file mode 100644 index 000000000000..c5b8a4f31ba3 --- /dev/null +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -0,0 +1,68 @@ +/* + * realmode.lds.S + * + * Linker script for the real-mode code + */ + +#include + +#undef i386 + +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) + +SECTIONS +{ + real_mode_seg = 0; + + . = 0; + .header : { + pa_real_mode_base = .; + *(.header) + } + + . = ALIGN(4); + .rodata : { + *(.rodata) + *(.rodata.*) + } + + . = ALIGN(PAGE_SIZE); + .text : { + pa_text_start = .; + *(.text) + *(.text.*) + } + + .text32 : { + *(.text32) + *(.text32.*) + pa_ro_end = .; + } + + . = ALIGN(PAGE_SIZE); + .data : { + *(.data) + *(.data.*) + } + + . = ALIGN(128); + .bss : { + *(.bss*) + } + + /* End signature for integrity checking */ + . = ALIGN(4); + .signature : { + *(.signature) + pa_end = .; + } + + /DISCARD/ : { + *(.note*) + *(.debug*) + *(.eh_frame*) + } + +#include "pasyms.h" +} diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S new file mode 100644 index 000000000000..6047d7f604cf --- /dev/null +++ b/arch/x86/realmode/rmpiggy.S @@ -0,0 +1,18 @@ +/* + * Wrapper script for the realmode binary as a transport object + * before copying to low memory. + */ +#include +#include + + .section ".init.data","aw" + + .balign PAGE_SIZE + +ENTRY(real_mode_blob) + .incbin "arch/x86/realmode/rm/realmode.bin" +END(real_mode_blob) + +ENTRY(real_mode_relocs) + .incbin "arch/x86/realmode/rm/realmode.relocs" +END(real_mode_relocs) -- cgit v1.2.1 From 084ee1c641a068bfd1194d545f7dc9ab2043eb35 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:26 +0300 Subject: x86, realmode: Relocator for realmode code Implements relocator for real mode code that is called as part of setup_arch(). Processes segment relocations and linear relocations. Real-mode code is relocated to a free hole below 1 MB. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-4-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/realmode.h | 26 ++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/realmode.c | 79 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 2 ++ 4 files changed, 108 insertions(+) create mode 100644 arch/x86/include/asm/realmode.h create mode 100644 arch/x86/kernel/realmode.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h new file mode 100644 index 000000000000..dc1bba534c14 --- /dev/null +++ b/arch/x86/include/asm/realmode.h @@ -0,0 +1,26 @@ +#ifndef _ARCH_X86_REALMODE_H +#define _ARCH_X86_REALMODE_H + +#include +#include + +/* This must match data at realmode.S */ +struct real_mode_header { + u32 text_start; + u32 ro_end; + u32 end; +} __attribute__((__packed__)); + +extern struct real_mode_header real_mode_header; +extern unsigned char *real_mode_base; + +extern unsigned long init_rsp; +extern unsigned long initial_code; +extern unsigned long initial_gs; + +extern unsigned char real_mode_blob[]; +extern unsigned char real_mode_relocs[]; + +extern void __init setup_real_mode(void); + +#endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6f..f9e19d4eb984 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,6 +36,7 @@ obj-y += pci-iommu_table.o obj-y += resource.o obj-y += trampoline.o trampoline_$(BITS).o +obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c new file mode 100644 index 000000000000..7415c42547ac --- /dev/null +++ b/arch/x86/kernel/realmode.c @@ -0,0 +1,79 @@ +#include +#include + +#include +#include +#include + +unsigned char *real_mode_base; +struct real_mode_header real_mode_header; + +void __init setup_real_mode(void) +{ + phys_addr_t mem; + u16 real_mode_seg; + u32 *rel; + u32 count; + u32 *ptr; + u16 *seg; + int i; + + struct real_mode_header *header = + (struct real_mode_header *) real_mode_blob; + + size_t size = PAGE_ALIGN(header->end); + + /* Has to be in very low memory so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + real_mode_base = __va(mem); + memblock_reserve(mem, size); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + real_mode_base, (unsigned long long)mem, size); + + memcpy(real_mode_base, real_mode_blob, size); + + real_mode_seg = __pa(real_mode_base) >> 4; + rel = (u32 *) real_mode_relocs; + + /* 16-bit segment relocations. */ + count = rel[0]; + rel = &rel[1]; + for (i = 0; i < count; i++) { + seg = (u16 *) (real_mode_base + rel[i]); + *seg = real_mode_seg; + } + + /* 32-bit linear relocations. */ + count = rel[i]; + rel = &rel[i + 1]; + for (i = 0; i < count; i++) { + ptr = (u32 *) (real_mode_base + rel[i]); + *ptr += __pa(real_mode_base); + } + + /* Copied header will contain relocated physical addresses. */ + memcpy(&real_mode_header, real_mode_base, + sizeof(struct real_mode_header)); +} + +/* + * set_real_mode_permissions() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. 
+ */ +static int __init set_real_mode_permissions(void) +{ + size_t all_size = + PAGE_ALIGN(real_mode_header.end) - + __pa(real_mode_base); + + set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + return 0; +} + +arch_initcall(set_real_mode_permissions); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..56e41242a6b8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -918,6 +919,7 @@ void __init setup_arch(char **cmdline_p) max_pfn_mapped< Date: Tue, 8 May 2012 21:22:27 +0300 Subject: x86, realmode: Move reboot_32.S to unified realmode code Migrated reboot_32.S from x86_trampoline to the real-mode blob. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-5-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 4 ++ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/reboot.c | 25 +------- arch/x86/kernel/reboot_32.S | 135 --------------------------------------- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 3 + arch/x86/realmode/rm/reboot_32.S | 134 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 145 insertions(+), 158 deletions(-) delete mode 100644 arch/x86/kernel/reboot_32.S create mode 100644 arch/x86/realmode/rm/reboot_32.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index dc1bba534c14..bf26b0681931 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -9,6 +9,10 @@ struct real_mode_header { u32 text_start; u32 ro_end; u32 end; + /* reboot */ +#ifdef CONFIG_X86_32 + u32 machine_real_restart_asm; +#endif } __attribute__((__packed__)); extern struct real_mode_header real_mode_header; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f9e19d4eb984..b71ef35c7d77 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o -obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853c..050eff29a4bb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,7 @@ #ifdef CONFIG_X86_32 # include # include +# include #else # include #endif @@ -332,15 +333,10 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - void machine_real_restart(unsigned int type) { - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; + void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) + real_mode_header.machine_real_restart_asm; local_irq_disable(); @@ -369,21 +365,6 @@ void machine_real_restart(unsigned int type) too. 
*/ *((unsigned short *)0x472) = reboot_mode; - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - /* Jump to the identity-mapped low memory code */ restart_lowmem(type); } diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S deleted file mode 100644 index 1d5c46df0d78..000000000000 --- a/arch/x86/kernel/reboot_32.S +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include -#include - -/* - * The following code and data reboots the machine by switching to real - * mode and jumping to the BIOS reset entry point, as if the CPU has - * really been reset. The previous version asked the keyboard - * controller to pulse the CPU reset line, which is more thorough, but - * doesn't work with at least one type of 486 motherboard. It is easy - * to stop this code working; hence the copious comments. - * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. - */ - .section ".x86_trampoline","a" - .balign 16 - .code32 -ENTRY(machine_real_restart_asm) -r_base = . - /* Get our own relocated address */ - call 1f -1: popl %ebx - subl $(1b - r_base), %ebx - - /* Compute the equivalent real-mode segment */ - movl %ebx, %ecx - shrl $4, %ecx - - /* Patch post-real-mode segment jump */ - movw (dispatch_table - r_base)(%ebx,%eax,2),%ax - movw %ax, (101f - r_base)(%ebx) - movw %cx, (102f - r_base)(%ebx) - - /* Set up the IDT for real mode. */ - lidtl (machine_real_restart_idt - r_base)(%ebx) - - /* - * Set up a GDT from which we can load segment descriptors for real - * mode. The GDT is not used in real mode; it is just needed here to - * prepare the descriptors. - */ - lgdtl (machine_real_restart_gdt - r_base)(%ebx) - - /* - * Load the data segment registers with 16-bit compatible values - */ - movl $16, %ecx - movl %ecx, %ds - movl %ecx, %es - movl %ecx, %fs - movl %ecx, %gs - movl %ecx, %ss - ljmpl $8, $1f - r_base - -/* - * This is 16-bit protected mode code to disable paging and the cache, - * switch to real mode and jump to the BIOS reset code. - * - * The instruction that switches to real mode by writing to CR0 must be - * followed immediately by a far jump instruction, which set CS to a - * valid value for real mode, and flushes the prefetch queue to avoid - * running instructions that have already been decoded in protected - * mode. - * - * Clears all the flags except ET, especially PG (paging), PE - * (protected-mode enable) and TS (task switch for coprocessor state - * save). Flushes the TLB after paging has been disabled. Sets CD and - * NW, to disable the cache on a 486, and invalidates the cache. This - * is more like the state of a 486 after reset. I don't know if - * something else should be done for other chips. - * - * More could be done here to set up the registers as if a CPU reset had - * occurred; hopefully real BIOSs don't assume much. This is not the - * actual BIOS entry point, anyway (that is at 0xfffffff0). - * - * Most of this work is probably excessive, but it is what is tested. 
- */ - .code16 -1: - xorl %ecx, %ecx - movl %cr0, %eax - andl $0x00000011, %eax - orl $0x60000000, %eax - movl %eax, %cr0 - movl %ecx, %cr3 - movl %cr0, %edx - andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ - jz 2f - wbinvd -2: - andb $0x10, %al - movl %eax, %cr0 - .byte 0xea /* ljmpw */ -101: .word 0 /* Offset */ -102: .word 0 /* Segment */ - -bios: - ljmpw $0xf000, $0xfff0 - -apm: - movw $0x1000, %ax - movw %ax, %ss - movw $0xf000, %sp - movw $0x5307, %ax - movw $0x0001, %bx - movw $0x0003, %cx - int $0x15 - -END(machine_real_restart_asm) - - .balign 16 - /* These must match +#include +#include +#include + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + */ + .section ".text32", "ax" + .code32 + .globl machine_real_restart_asm + + .balign 16 +machine_real_restart_asm: + /* Set up the IDT for real mode. */ + lidtl pa_machine_real_restart_idt + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl pa_machine_real_restart_gdt + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpw $8, $1f + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. + * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .text + .code16 + + .balign 16 +machine_real_restart_asm16: +1: + xorl %ecx, %ecx + movl %cr0, %edx + andl $0x00000011, %edx + orl $0x60000000, %edx + movl %edx, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %dl + movl %edx, %cr0 + .byte 0xea /* ljmpw */ + .word 3f /* Offset */ + .word real_mode_seg /* Segment */ + +3: + testb $0, %al + jz bios + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + /* This should never return... 
*/ + +bios: + ljmpw $0xf000, $0xfff0 + + .section ".rodata", "a" + .globl machine_real_restart_idt, machine_real_restart_gdt + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ + + .balign 16 +machine_real_restart_gdt: + /* Self-pointer */ + .word 0xffff /* Length - real mode default value */ + .long pa_machine_real_restart_gdt + .word 0 + + /* + * 16-bit code segment pointing to real_mode_seg + * Selector value 8 + */ + .word 0xffff /* Limit */ + .long 0x9b000000 + pa_real_mode_base + .word 0 + + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. + */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) -- cgit v1.2.1 From 48927bbb97c7d4cf343c05827ab9ac30c60678cb Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:28 +0300 Subject: x86, realmode: Move SMP trampoline to unified realmode code Migrated SMP trampoline code to the real mode blob. SMP trampoline code is not yet removed from .x86_trampoline because it is needed by the wakeup code. [ hpa: always enable compiling startup_32_smp in head_32.S... it is only a few instructions which go into .init on UP builds, and it makes the rest of the code less #ifdef ugly. ] Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-6-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 18 ++++ arch/x86/kernel/head_32.S | 5 +- arch/x86/kernel/head_64.S | 4 - arch/x86/kernel/realmode.c | 14 +++ arch/x86/kernel/smpboot.c | 18 ++-- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 11 +++ arch/x86/realmode/rm/trampoline_32.S | 86 +++++++++++++++++ arch/x86/realmode/rm/trampoline_64.S | 175 +++++++++++++++++++++++++++++++++++ 9 files changed, 316 insertions(+), 16 deletions(-) create mode 100644 arch/x86/realmode/rm/trampoline_32.S create mode 100644 arch/x86/realmode/rm/trampoline_64.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index bf26b0681931..9b4a5da5e22e 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -12,6 +12,17 @@ struct real_mode_header { /* reboot */ #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; +#endif + /* SMP trampoline */ + u32 trampoline_data; + u32 trampoline_status; +#ifdef CONFIG_X86_32 + u32 startup_32_smp; + u32 boot_gdt; +#else + u32 startup_64_smp; + u32 level3_ident_pgt; + u32 level3_kernel_pgt; #endif } __attribute__((__packed__)); @@ -25,6 +36,13 @@ extern unsigned long initial_gs; extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; +#ifdef CONFIG_X86_32 +extern unsigned char startup_32_smp[]; +extern unsigned char boot_gdt[]; +#else +extern unsigned char secondary_startup_64[]; +#endif + extern void __init setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085e..a3c2b4ffebc6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -273,10 +273,7 @@ num_subarch_entries = (. 
- subarch_entries) / 4 * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ - __CPUINIT - -#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -287,7 +284,7 @@ ENTRY(startup_32_smp) movl pa(stack_start),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#endif /* CONFIG_SMP */ + default_entry: /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d1..d70bc2eb202b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -136,10 +136,6 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Fixup trampoline */ - addq %rbp, trampoline_level4_pgt + 0(%rip) - addq %rbp, trampoline_level4_pgt + (511*8)(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being * filled with 0x90 (nop) diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 7415c42547ac..a465775b32f2 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -58,6 +58,20 @@ void __init setup_real_mode(void) /* Copied header will contain relocated physical addresses. */ memcpy(&real_mode_header, real_mode_base, sizeof(struct real_mode_header)); + +#ifdef CONFIG_X86_32 + *((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp); + *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); +#else + *((u64 *) __va(real_mode_header.startup_64_smp)) = + (u64) __pa(secondary_startup_64); + + *((u64 *) __va(real_mode_header.level3_ident_pgt)) = + __pa(level3_ident_pgt) + _KERNPG_TABLE; + + *((u64 *) __va(real_mode_header.level3_kernel_pgt)) = + __pa(level3_kernel_pgt) + _KERNPG_TABLE; +#endif } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c2..c7971ea74bd0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include @@ -73,6 +73,8 @@ #include #include +#include + /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -662,8 +664,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid) */ static int __cpuinit do_boot_cpu(int apicid, int cpu) { + volatile u32 *trampoline_status = + (volatile u32 *) __va(real_mode_header.trampoline_status); + /* start_ip had better be page-aligned! */ + unsigned long start_ip = real_mode_header.trampoline_data; + unsigned long boot_error = 0; - unsigned long start_ip; int timeout; struct create_idle c_idle = { .cpu = cpu, @@ -713,9 +719,6 @@ do_rest: initial_code = (unsigned long)start_secondary; stack_start = c_idle.idle->thread.sp; - /* start_ip had better be page-aligned! */ - start_ip = trampoline_address(); - /* So we see what's up */ announce_cpu(cpu, apicid); @@ -778,8 +781,7 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); } else { boot_error = 1; - if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) - == 0xA5A5A5A5) + if (*trampoline_status == 0xA5A5A5A5) /* trampoline started but...? 
*/ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -805,7 +807,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; + *trampoline_status = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3f851c488593..56ec64f94e69 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -13,6 +13,7 @@ always := realmode.bin realmode-y += header.o realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-y += trampoline_$(BITS).o targets += $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index db21401c0c57..a97900409c61 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -15,5 +15,16 @@ ENTRY(real_mode_header) .long pa_end #ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm +#endif + /* SMP trampoline */ + .long pa_trampoline_data + .long pa_trampoline_status +#ifdef CONFIG_X86_32 + .long pa_startup_32_smp + .long pa_boot_gdt +#else + .long pa_startup_64_smp + .long pa_level3_ident_pgt + .long pa_level3_kernel_pgt #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S new file mode 100644 index 000000000000..18cb7fc9fad4 --- /dev/null +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -0,0 +1,86 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * + * This is only used for booting secondary CPUs in SMP machine + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * We jump into arch/x86/kernel/head_32.S. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. Thus, we load CS to the physical segment + * of the real mode code before doing anything further. + * + * The structure real_mode_header includes entries that need + * to be set up before executing this code: + * + * startup_32_smp + * boot_gdt + */ + +#include +#include +#include +#include + + .text + .code16 + .globl trampoline_data + + .balign PAGE_SIZE +trampoline_data: + wbinvd # Needed for NUMA-Q should be harmless for others + + .byte 0xea # ljmpw + .word 1f # Offset + .word real_mode_seg # Segment +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + + cli # We should be safe anyway + + movl $0xA5A5A5A5, trampoline_status + # write marker for master knows we're running + + /* GDT tables in non default location kernel can be beyond 16MB and + * lgdt will not be able to load the address as in real mode default + * operand size is 16bit. Use lgdtl instead to force operand size + * to 32 bit. 
+ */ + + lidtl boot_idt_descr # load idt with 0, 0 + lgdtl boot_gdt_descr # load gdt with whatever is appropriate + + xor %ax, %ax + inc %ax # protected mode (PE) bit + lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S + ljmpl *(startup_32_smp) + + .data + .globl startup_32_smp, boot_gdt, trampoline_status + +boot_gdt_descr: + .word __BOOT_DS + 7 # gdt limit +boot_gdt: + .long 0 # gdt base + +boot_idt_descr: + .word 0 # idt limit = 0 + .long 0 # idt base = 0L + +trampoline_status: + .long 0 + +startup_32_smp: + .long 0x00000000 + .word __BOOT_CS, 0 diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S new file mode 100644 index 000000000000..063da008d520 --- /dev/null +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -0,0 +1,175 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * 15 Sept 2005 Eric Biederman: 64bit PIC support + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. Thus, data addresses need to be absolute + * (no relocation) and are taken with regard to r_base. + * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * + * If you work on this file, check the object module with objdump + * --full-contents --reloc to make sure there are no relocation + * entries. + */ + +#include +#include +#include +#include +#include +#include +#include + + .text + .balign PAGE_SIZE + .code16 + +ENTRY(trampoline_data) + cli # We should be safe anyway + wbinvd + + .byte 0xea # ljmpw + .word 1f # Offset + .word real_mode_seg # Segment +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es + mov %ax, %ss + + movl $0xA5A5A5A5, trampoline_status + # write marker for master knows we're running + + # Setup stack + movw $trampoline_stack_end, %sp + + call verify_cpu # Verify the cpu supports long mode + testl %eax, %eax # Check for return code + jnz no_longmode + + /* + * GDT tables in non default location kernel can be beyond 16MB and + * lgdt will not be able to load the address as in real mode default + * operand size is 16bit. Use lgdtl instead to force operand size + * to 32 bit. 
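+ *
+ * (For orientation, the mode-switch sequence this file performs,
+ * condensed from the code that follows; nothing new is introduced:
+ *	lmsw	%ax		# CR0.PE=1: 16-bit protected mode
+ *	movl	%eax, %cr4	# CR4.PAE=1
+ *	movl	%eax, %cr3	# point at pa_trampoline_level4_pgt
+ *	wrmsr			# EFER.LME=1
+ *	movl	%eax, %cr0	# CR0.PG|CR0.PE: long mode activates
+ *	ljmpl			# far jump into 64-bit __KERNEL_CS
+ * with a far jump after each mode change to flush the prefetch queue.)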
+ */ + + lidtl tidt # load idt with 0, 0 + lgdtl tgdt # load gdt with whatever is appropriate + + mov $X86_CR0_PE, %ax # protected mode (PE) bit + lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector) + +no_longmode: + hlt + jmp no_longmode +#include "../kernel/verify_cpu.S" + + .code32 + .balign 4 +ENTRY(startup_32) + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + + movl $X86_CR4_PAE, %eax + movl %eax, %cr4 # Enable PAE mode + + movl pa_startup_64_smp, %esi + movl pa_startup_64_smp_high, %edi + + # Setup trampoline 4 level pagetables + leal pa_trampoline_level4_pgt, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + movl $(1 << _EFER_LME), %eax # Enable Long Mode + xorl %edx, %edx + wrmsr + + # Enable paging and in turn activate Long Mode + # Enable protected mode + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmpl *(pa_startup_64_vector) + + .code64 + .balign 4 +ENTRY(startup_64) + # Now jump into the kernel using virtual addresses + movl %edi, %eax + shlq $32, %rax + addl %esi, %eax + jmp *%rax + + # Careful these need to be in the same 64K segment as the above; +tidt: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 + .globl tgdt +tgdt: + .short tgdt_end - tgdt # gdt limit + .long pa_tgdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_end: + + .balign 4 +startup_32_vector: + .long pa_startup_32 + .word __KERNEL32_CS, 0 + + .balign 4 + .globl startup_64_vector +startup_64_vector: + .long pa_startup_64 + .word __KERNEL_CS, 0 + + .data + + .balign 4 +ENTRY(trampoline_status) + .long 0 + +trampoline_stack: + .org 0x1000 +trampoline_stack_end: + + .globl level3_ident_pgt + .globl level3_kernel_pgt +ENTRY(trampoline_level4_pgt) + level3_ident_pgt: .quad 0 + .fill 510,8,0 + level3_kernel_pgt: .quad 0 + + .globl startup_64_smp + .globl startup_64_smp_high +startup_64_smp: .long 0 +startup_64_smp_high: .long 0 -- cgit v1.2.1 From c9b77ccb52a5c77233b0e557b7d4417b00ef4012 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:29 +0300 Subject: x86, realmode: Move ACPI wakeup to unified realmode code Migrated ACPI wakeup code to the real-mode blob. Code existing in .x86_trampoline can be completely removed. Static descriptor table in wakeup_asm.S is courtesy of H. Peter Anvin. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-7-git-send-email-jarkko.sakkinen@intel.com Cc: Rafael J. Wysocki Cc: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/acpi.h | 2 - arch/x86/include/asm/realmode.h | 4 + arch/x86/include/asm/trampoline.h | 39 ------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/acpi/Makefile | 9 +- arch/x86/kernel/acpi/realmode/.gitignore | 3 - arch/x86/kernel/acpi/realmode/Makefile | 59 --------- arch/x86/kernel/acpi/realmode/bioscall.S | 1 - arch/x86/kernel/acpi/realmode/copy.S | 1 - arch/x86/kernel/acpi/realmode/regs.c | 1 - arch/x86/kernel/acpi/realmode/video-bios.c | 1 - arch/x86/kernel/acpi/realmode/video-mode.c | 1 - arch/x86/kernel/acpi/realmode/video-vesa.c | 1 - arch/x86/kernel/acpi/realmode/video-vga.c | 1 - arch/x86/kernel/acpi/realmode/wakemain.c | 81 ------------- arch/x86/kernel/acpi/realmode/wakeup.S | 170 -------------------------- arch/x86/kernel/acpi/realmode/wakeup.h | 48 -------- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 62 ---------- arch/x86/kernel/acpi/sleep.c | 33 +---- arch/x86/kernel/acpi/sleep.h | 2 +- arch/x86/kernel/acpi/wakeup_rm.S | 12 -- arch/x86/kernel/head32.c | 1 - arch/x86/kernel/head64.c | 1 - arch/x86/kernel/mpparse.c | 1 - arch/x86/kernel/setup.c | 2 - arch/x86/kernel/tboot.c | 5 +- arch/x86/kernel/trampoline.c | 42 ------- arch/x86/kernel/trampoline_32.S | 83 ------------- arch/x86/kernel/trampoline_64.S | 171 -------------------------- arch/x86/kernel/vmlinux.lds.S | 12 -- arch/x86/realmode/rm/Makefile | 4 + arch/x86/realmode/rm/header.S | 5 + arch/x86/realmode/rm/realmode.lds.S | 4 + arch/x86/realmode/rm/wakeup/.gitignore | 3 + arch/x86/realmode/rm/wakeup/Makefile | 33 +++++ arch/x86/realmode/rm/wakeup/bioscall.S | 1 + arch/x86/realmode/rm/wakeup/copy.S | 1 + arch/x86/realmode/rm/wakeup/regs.c | 1 + arch/x86/realmode/rm/wakeup/video-bios.c | 1 + arch/x86/realmode/rm/wakeup/video-mode.c | 1 + arch/x86/realmode/rm/wakeup/video-vesa.c | 1 + arch/x86/realmode/rm/wakeup/video-vga.c | 1 + arch/x86/realmode/rm/wakeup/wakemain.c | 82 +++++++++++++ arch/x86/realmode/rm/wakeup/wakeup.h | 41 +++++++ arch/x86/realmode/rm/wakeup/wakeup_asm.S | 189 +++++++++++++++++++++++++++++ 45 files changed, 381 insertions(+), 837 deletions(-) delete mode 100644 arch/x86/include/asm/trampoline.h delete mode 100644 arch/x86/kernel/acpi/realmode/.gitignore delete mode 100644 arch/x86/kernel/acpi/realmode/Makefile delete mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S delete mode 100644 arch/x86/kernel/acpi/realmode/copy.S delete mode 100644 arch/x86/kernel/acpi/realmode/regs.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-bios.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-mode.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-vesa.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-vga.c delete mode 100644 arch/x86/kernel/acpi/realmode/wakemain.c delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.S delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.h delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.lds.S delete mode 100644 arch/x86/kernel/acpi/wakeup_rm.S delete mode 100644 arch/x86/kernel/trampoline.c delete mode 100644 arch/x86/kernel/trampoline_32.S delete mode 100644 arch/x86/kernel/trampoline_64.S create mode 100644 arch/x86/realmode/rm/wakeup/.gitignore create mode 100644 arch/x86/realmode/rm/wakeup/Makefile create mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S create mode 100644 arch/x86/realmode/rm/wakeup/copy.S create mode 100644 arch/x86/realmode/rm/wakeup/regs.c create mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c create mode 100644 arch/x86/realmode/rm/wakeup/video-mode.c create mode 100644 
arch/x86/realmode/rm/wakeup/video-vesa.c create mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c create mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c create mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h create mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 610001d385dd..724aa441de7d 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,7 +29,6 @@ #include #include #include -#include #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -118,7 +117,6 @@ static inline void acpi_disable_pci(void) extern int acpi_suspend_lowlevel(void); extern const unsigned char acpi_wakeup_code[]; -#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9b4a5da5e22e..1bfc74d213a4 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -24,6 +24,10 @@ struct real_mode_header { u32 level3_ident_pgt; u32 level3_kernel_pgt; #endif +#ifdef CONFIG_ACPI_SLEEP + u32 wakeup_start; + u32 wakeup_header; +#endif } __attribute__((__packed__)); extern struct real_mode_header real_mode_header; diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h deleted file mode 100644 index feca3118a73b..000000000000 --- a/arch/x86/include/asm/trampoline.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _ASM_X86_TRAMPOLINE_H -#define _ASM_X86_TRAMPOLINE_H - -#ifndef __ASSEMBLY__ - -#include -#include - -/* - * Trampoline 80x86 program as an array. These are in the init rodata - * segment, but that's okay, because we only care about the relative - * addresses of the symbols. 
- */ -extern const unsigned char x86_trampoline_start []; -extern const unsigned char x86_trampoline_end []; -extern unsigned char *x86_trampoline_base; - -extern unsigned long init_rsp; -extern unsigned long initial_code; -extern unsigned long initial_gs; - -extern void __init setup_trampolines(void); - -extern const unsigned char trampoline_data[]; -extern const unsigned char trampoline_status[]; - -#define TRAMPOLINE_SYM(x) \ - ((void *)(x86_trampoline_base + \ - ((const unsigned char *)(x) - x86_trampoline_start))) - -/* Address of the SMP trampoline */ -static inline unsigned long trampoline_address(void) -{ - return virt_to_phys(TRAMPOLINE_SYM(trampoline_data)); -} - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_TRAMPOLINE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b71ef35c7d77..4a20f4441ffe 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-y += trampoline.o trampoline_$(BITS).o obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 6f35260bb3ef..163b22581472 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,14 +1,7 @@ -subdir- := realmode - obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o endif -$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin - -$(obj)/realmode/wakeup.bin: FORCE - $(Q)$(MAKE) $(build)=$(obj)/realmode - diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore deleted file mode 100644 index 58f1f48a58f8..000000000000 --- a/arch/x86/kernel/acpi/realmode/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -wakeup.bin -wakeup.elf -wakeup.lds diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile deleted file mode 100644 index 6a564ac67ef5..000000000000 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# -# arch/x86/kernel/acpi/realmode/Makefile -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -always := wakeup.bin -targets := wakeup.elf wakeup.lds - -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o - -# The link order of the video-*.o modules can matter. In particular, -# video-vga.o *must* be listed first, followed by video-vesa.o. -# Hardware-specific drivers should follow in the order they should be -# probed, and video-bios.o should typically be last. -wakeup-y += video-vga.o -wakeup-y += video-vesa.o -wakeup-y += video-bios.o - -targets += $(wakeup-y) - -bootsrc := $(src)/../../../boot - -# --------------------------------------------------------------------------- - -# How to compile the 16-bit code. Note we always compile for -march=i386, -# that way we can complain to the user if the CPU is insufficient. -# Compile with _SETUP since this is similar to the boot-time setup code. 
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ - -I$(srctree)/$(bootsrc) \ - $(cflags-y) \ - -Wall -Wstrict-prototypes \ - -march=i386 -mregparm=3 \ - -include $(srctree)/$(bootsrc)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) -KBUILD_CFLAGS += $(call cc-option, -m32) -KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -GCOV_PROFILE := n - -WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) - -LDFLAGS_wakeup.elf := -T - -CPPFLAGS_wakeup.lds += -P -C - -$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE - $(call if_changed,ld) - -OBJCOPYFLAGS_wakeup.bin := -O binary - -$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE - $(call if_changed,objcopy) diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S deleted file mode 100644 index f51eb0bb56ce..000000000000 --- a/arch/x86/kernel/acpi/realmode/bioscall.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S deleted file mode 100644 index dc59ebee69d8..000000000000 --- a/arch/x86/kernel/acpi/realmode/copy.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/copy.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c deleted file mode 100644 index 6206033ba202..000000000000 --- a/arch/x86/kernel/acpi/realmode/regs.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c deleted file mode 100644 index 7deabc144a27..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-bios.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-bios.c" diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c deleted file mode 100644 index 328ad209f113..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-mode.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-mode.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c deleted file mode 100644 index 9dbb9672226a..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vesa.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vesa.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c deleted file mode 100644 index bcc81255f374..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vga.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vga.c" diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c deleted file mode 100644 index 883962d9eef2..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakemain.c +++ /dev/null @@ -1,81 +0,0 @@ -#include "wakeup.h" -#include "boot.h" - -static void udelay(int loops) -{ - while (loops--) - io_delay(); /* Approximately 1 us */ -} - -static void beep(unsigned int hz) -{ - u8 enable; - - if (!hz) { - enable = 0x00; /* Turn off speaker */ - } else { - u16 div = 1193181/hz; - - outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ - io_delay(); - outb(div, 0x42); /* LSB of counter */ - io_delay(); - outb(div >> 8, 0x42); /* MSB of counter */ - io_delay(); - - enable = 0x03; /* Turn on speaker */ - } - inb(0x61); /* Dummy read of 
System Control Port B */ - io_delay(); - outb(enable, 0x61); /* Enable timer 2 output to speaker */ - io_delay(); -} - -#define DOT_HZ 880 -#define DASH_HZ 587 -#define US_PER_DOT 125000 - -/* Okay, this is totally silly, but it's kind of fun. */ -static void send_morse(const char *pattern) -{ - char s; - - while ((s = *pattern++)) { - switch (s) { - case '.': - beep(DOT_HZ); - udelay(US_PER_DOT); - beep(0); - udelay(US_PER_DOT); - break; - case '-': - beep(DASH_HZ); - udelay(US_PER_DOT * 3); - beep(0); - udelay(US_PER_DOT); - break; - default: /* Assume it's a space */ - udelay(US_PER_DOT * 3); - break; - } - } -} - -void main(void) -{ - /* Kill machine if structures are wrong */ - if (wakeup_header.real_magic != 0x12345678) - while (1); - - if (wakeup_header.realmode_flags & 4) - send_morse("...-"); - - if (wakeup_header.realmode_flags & 1) - asm volatile("lcallw $0xc000,$3"); - - if (wakeup_header.realmode_flags & 2) { - /* Need to call BIOS */ - probe_cards(0); - set_mode(wakeup_header.video_mode); - } -} diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S deleted file mode 100644 index b4fd836e4053..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ /dev/null @@ -1,170 +0,0 @@ -/* - * ACPI wakeup real mode startup stub - */ -#include -#include -#include -#include -#include -#include "wakeup.h" - - .code16 - .section ".jump", "ax" - .globl _start -_start: - cli - jmp wakeup_code - -/* This should match the structure in wakeup.h */ - .section ".header", "a" - .globl wakeup_header -wakeup_header: -video_mode: .short 0 /* Video mode number */ -pmode_return: .byte 0x66, 0xea /* ljmpl */ - .long 0 /* offset goes here */ - .short __KERNEL_CS -pmode_cr0: .long 0 /* Saved %cr0 */ -pmode_cr3: .long 0 /* Saved %cr3 */ -pmode_cr4: .long 0 /* Saved %cr4 */ -pmode_efer: .quad 0 /* Saved EFER */ -pmode_gdt: .quad 0 -pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ -pmode_behavior: .long 0 /* Wakeup behavior flags */ -realmode_flags: .long 0 -real_magic: .long 0 -trampoline_segment: .word 0 -_pad1: .byte 0 -wakeup_jmp: .byte 0xea /* ljmpw */ -wakeup_jmp_off: .word 3f -wakeup_jmp_seg: .word 0 -wakeup_gdt: .quad 0, 0, 0 -signature: .long WAKEUP_HEADER_SIGNATURE - - .text - .code16 -wakeup_code: - cld - - /* Apparently some dimwit BIOS programmers don't know how to - program a PM to RM transition, and we might end up here with - junk in the data segment descriptor registers. The only way - to repair that is to go into PM and fix it ourselves... */ - movw $16, %cx - lgdtl %cs:wakeup_gdt - movl %cr0, %eax - orb $X86_CR0_PE, %al - movl %eax, %cr0 - jmp 1f -1: ljmpw $8, $2f -2: - movw %cx, %ds - movw %cx, %es - movw %cx, %ss - movw %cx, %fs - movw %cx, %gs - - andb $~X86_CR0_PE, %al - movl %eax, %cr0 - jmp wakeup_jmp -3: - /* Set up segments */ - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - lidtl wakeup_idt - - movl $wakeup_stack_end, %esp - - /* Clear the EFLAGS */ - pushl $0 - popfl - - /* Check header signature... */ - movl signature, %eax - cmpl $WAKEUP_HEADER_SIGNATURE, %eax - jne bogus_real_magic - - /* Check we really have everything... */ - movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax - jne bogus_real_magic - - /* Call the C code */ - calll main - - /* Restore MISC_ENABLE before entering protected mode, in case - BIOS decided to clear XD_DISABLE during S3. 
*/ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax - jnc 1f - - movl pmode_misc_en, %eax - movl pmode_misc_en + 4, %edx - movl $MSR_IA32_MISC_ENABLE, %ecx - wrmsr -1: - - /* Do any other stuff... */ - -#ifndef CONFIG_64BIT - /* This could also be done in C code... */ - movl pmode_cr3, %eax - movl %eax, %cr3 - - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 -1: - movl pmode_efer, %eax - movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f - movl $MSR_EFER, %ecx - wrmsr -1: - - lgdtl pmode_gdt - - /* This really couldn't... */ - movl pmode_cr0, %eax - movl %eax, %cr0 - jmp pmode_return -#else - pushw $0 - pushw trampoline_segment - pushw $0 - lret -#endif - -bogus_real_magic: -1: - hlt - jmp 1b - - .data - .balign 8 - - /* This is the standard real-mode IDT */ -wakeup_idt: - .word 0xffff /* limit */ - .long 0 /* address */ - .word 0 - - .globl HEAP, heap_end -HEAP: - .long wakeup_heap -heap_end: - .long wakeup_stack - - .bss -wakeup_heap: - .space 2048 -wakeup_stack: - .space 2048 -wakeup_stack_end: - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h deleted file mode 100644 index 97a29e1430e3..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Definitions for the wakeup data structure at the head of the - * wakeup code. - */ - -#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H - -#ifndef __ASSEMBLY__ -#include - -/* This must match data at wakeup.S */ -struct wakeup_header { - u16 video_mode; /* Video mode number */ - u16 _jmp1; /* ljmpl opcode, 32-bit only */ - u32 pmode_entry; /* Protected mode resume point, 32-bit only */ - u16 _jmp2; /* CS value, 32-bit only */ - u32 pmode_cr0; /* Protected mode cr0 */ - u32 pmode_cr3; /* Protected mode cr3 */ - u32 pmode_cr4; /* Protected mode cr4 */ - u32 pmode_efer_low; /* Protected mode EFER */ - u32 pmode_efer_high; - u64 pmode_gdt; - u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ - u32 pmode_misc_en_high; - u32 pmode_behavior; /* Wakeup routine behavior flags */ - u32 realmode_flags; - u32 real_magic; - u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ - u8 _pad1; - u8 wakeup_jmp; - u16 wakeup_jmp_off; - u16 wakeup_jmp_seg; - u64 wakeup_gdt[3]; - u32 signature; /* To check we have correct structure */ -} __attribute__((__packed__)); - -extern struct wakeup_header wakeup_header; -#endif - -#define WAKEUP_HEADER_OFFSET 8 -#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 - -/* Wakeup behavior bits */ -#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 - -#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S deleted file mode 100644 index d4f8010a5b1b..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ /dev/null @@ -1,62 +0,0 @@ -/* - * wakeup.ld - * - * Linker script for the real-mode wakeup code - */ -#undef i386 -#include "wakeup.h" - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) - -SECTIONS -{ - . = 0; - .jump : { - *(.jump) - } = 0x90909090 - - . = WAKEUP_HEADER_OFFSET; - .header : { - *(.header) - } - - . = ALIGN(16); - .text : { - *(.text*) - } = 0x90909090 - - . = ALIGN(16); - .rodata : { - *(.rodata*) - } - - .videocards : { - video_cards = .; - *(.videocards) - video_cards_end = .; - } - - . 
= ALIGN(16); - .data : { - *(.data*) - } - - . = ALIGN(16); - .bss : { - __bss_start = .; - *(.bss) - __bss_end = .; - } - - .signature : { - *(.signature) - } - - _end = .; - - /DISCARD/ : { - *(.note*) - } -} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 146a49c763a4..d941b62da4b6 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -14,8 +14,9 @@ #include #include #include +#include -#include "realmode/wakeup.h" +#include "../../realmode/rm/wakeup/wakeup.h" #include "sleep.h" unsigned long acpi_realmode_flags; @@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void) */ int acpi_suspend_lowlevel(void) { - struct wakeup_header *header; - /* address in low memory of the wakeup routine. */ - char *acpi_realmode; + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header.wakeup_header); - acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - - header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; @@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void) header->video_mode = saved_video_mode; - header->wakeup_jmp_seg = acpi_wakeup_address >> 4; - - /* - * Set up the wakeup GDT. We set these up as Big Real Mode, - * that is, with limits set to 4 GB. At least the Lenovo - * Thinkpad X61 is known to need this for the video BIOS - * initialization quirk to work; this is likely to also - * be the case for other laptops or integrated video devices. - */ - - /* GDT[0]: GDT self-pointer */ - header->wakeup_gdt[0] = - (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)__pa(&header->wakeup_gdt) << 16); - /* GDT[1]: big real mode-like code segment */ - header->wakeup_gdt[1] = - GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); - /* GDT[2]: big real mode-like data segment */ - header->wakeup_gdt[2] = - GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); - #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d68677a2a010..5653a5791ec9 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -2,8 +2,8 @@ * Variables and functions used by the code in sleep.c */ -#include #include +#include extern unsigned long saved_video_mode; extern long saved_magic; diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S deleted file mode 100644 index 63b8ab524f2c..000000000000 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Wrapper script for the realmode binary as a transport object - * before copying to low memory. 
- */ -#include - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .globl acpi_wakeup_code -acpi_wakeup_code: - .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" - .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 51ff18616d50..c18f59d10101 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 3a3b779f41d3..037df57a99ac 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,7 +24,6 @@ #include #include #include -#include #include static void __init zap_identity_mappings(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ca470e4c92dc..f44d31157353 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 56e41242a6b8..7a14fece9cfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -73,7 +73,6 @@ #include #include -#include #include #include #include @@ -918,7 +917,6 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped< #include -#include +#include #include #include #include @@ -201,7 +201,8 @@ static int tboot_setup_sleep(void) add_mac_region(e820.map[i].addr, e820.map[i].size); } - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + tboot->acpi_sinfo.kernel_s3_resume_vector = + real_mode_header.wakeup_start; return 0; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c deleted file mode 100644 index a73b61055ad6..000000000000 --- a/arch/x86/kernel/trampoline.c +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -#include -#include -#include - -unsigned char *x86_trampoline_base; - -void __init setup_trampolines(void) -{ - phys_addr_t mem; - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - x86_trampoline_base = __va(mem); - memblock_reserve(mem, size); - - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - x86_trampoline_base, (unsigned long long)mem, size); - - memcpy(x86_trampoline_base, x86_trampoline_start, size); -} - -/* - * setup_trampolines() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page - * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. - */ -static int __init configure_trampolines(void) -{ - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); - return 0; -} -arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S deleted file mode 100644 index 451c0a7ef7fd..000000000000 --- a/arch/x86/kernel/trampoline_32.S +++ /dev/null @@ -1,83 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. 
- * - * This is only used for booting secondary CPUs in SMP machine - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * We jump into arch/x86/kernel/head_32.S. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * If you work on this file, check the object module with - * objdump --reloc to make sure there are no relocation - * entries except for: - * - * TYPE VALUE - * R_386_32 startup_32_smp - * R_386_32 boot_gdt - */ - -#include -#include -#include -#include - -#ifdef CONFIG_SMP - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . - wbinvd # Needed for NUMA-Q should be harmless for others - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - - cli # We should be safe anyway - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - /* GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. - */ - - lidtl boot_idt_descr - r_base # load idt with 0, 0 - lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate - - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode - # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S - ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) - - # These need to be in the same 64K segment as the above; - # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit - .long boot_gdt - __PAGE_OFFSET # gdt base - -boot_idt_descr: - .word 0 # idt limit = 0 - .long 0 # idt base = 0L - -ENTRY(trampoline_status) - .long 0 - -.globl trampoline_end -trampoline_end: - -#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S deleted file mode 100644 index 09ff51799e96..000000000000 --- a/arch/x86/kernel/trampoline_64.S +++ /dev/null @@ -1,171 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * 15 Sept 2005 Eric Biederman: 64bit PIC support - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * With the addition of trampoline_level4_pgt this code can - * now enter a 64bit kernel that lives at arbitrary 64bit - * physical addresses. - * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries. - */ - -#include -#include -#include -#include -#include -#include -#include - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . 
- cli # We should be safe anyway - wbinvd - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - mov %ax, %es - mov %ax, %ss - - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - # Setup stack - movw $(trampoline_stack_end - r_base), %sp - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - mov %cs, %ax - movzx %ax, %esi # Find the 32bit trampoline location - shll $4, %esi - - # Fixup the absolute vectors - leal (startup_32 - r_base)(%esi), %eax - movl %eax, startup_32_vector - r_base - leal (startup_64 - r_base)(%esi), %eax - movl %eax, startup_64_vector - r_base - leal (tgdt - r_base)(%esi), %eax - movl %eax, (tgdt + 2 - r_base) - - /* - * GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. - */ - - lidtl tidt - r_base # load idt with 0, 0 - lgdtl tgdt - r_base # load gdt with whatever is appropriate - - mov $X86_CR0_PE, %ax # protected mode (PE) bit - lmsw %ax # into protected mode - - # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector - r_base) - - .code32 - .balign 4 -startup_32: - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds - - movl $X86_CR4_PAE, %eax - movl %eax, %cr4 # Enable PAE mode - - # Setup trampoline 4 level pagetables - leal (trampoline_level4_pgt - r_base)(%esi), %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode - xorl %edx, %edx - wrmsr - - # Enable paging and in turn activate Long Mode - # Enable protected mode - movl $(X86_CR0_PG | X86_CR0_PE), %eax - movl %eax, %cr0 - - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 
- */ - ljmp *(startup_64_vector - r_base)(%esi) - - .code64 - .balign 4 -startup_64: - # Now jump into the kernel using virtual addresses - movq $secondary_startup_64, %rax - jmp *%rax - - .code16 -no_longmode: - hlt - jmp no_longmode -#include "verify_cpu.S" - - .balign 4 - # Careful these need to be in the same 64K segment as the above; -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 4 -tgdt: - .short tgdt_end - tgdt # gdt limit - .long tgdt - r_base - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - - .balign 4 -startup_32_vector: - .long startup_32 - r_base - .word __KERNEL32_CS, 0 - - .balign 4 -startup_64_vector: - .long startup_64 - r_base - .word __KERNEL_CS, 0 - - .balign 4 -ENTRY(trampoline_status) - .long 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: -ENTRY(trampoline_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - -ENTRY(trampoline_end) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0f703f10901a..22a1530146a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -197,18 +197,6 @@ SECTIONS INIT_DATA_SECTION(16) - /* - * Code and data for a variety of lowlevel trampolines, to be - * copied into base memory (< 1 MiB) during initialization. - * Since it is copied early, the main copy can be discarded - * afterwards. - */ - .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { - x86_trampoline_start = .; - *(.x86_trampoline) - x86_trampoline_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 56ec64f94e69..2432acb6b04f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -14,9 +14,13 @@ always := realmode.bin realmode-y += header.o realmode-$(CONFIG_X86_32) += reboot_32.o realmode-y += trampoline_$(BITS).o +realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o targets += $(realmode-y) +$(obj)/wakeup/wakeup.o: FORCE + $(Q)$(MAKE) $(build)=$(obj)/wakeup $@ + REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index a97900409c61..730b1316c099 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -26,5 +26,10 @@ ENTRY(real_mode_header) .long pa_startup_64_smp .long pa_level3_ident_pgt .long pa_level3_kernel_pgt +#endif + /* ACPI sleep */ +#ifdef CONFIG_ACPI_SLEEP + .long pa_wakeup_start + .long pa_wakeup_header #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index c5b8a4f31ba3..91b83ea55c37 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -25,6 +25,10 @@ SECTIONS .rodata : { *(.rodata) *(.rodata.*) + . = ALIGN(16); + video_cards = .; + *(.videocards) + video_cards_end = .; } . 
= ALIGN(PAGE_SIZE); diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore new file mode 100644 index 000000000000..58f1f48a58f8 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/.gitignore @@ -0,0 +1,3 @@ +wakeup.bin +wakeup.elf +wakeup.lds diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile new file mode 100644 index 000000000000..4c8533240cdd --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/Makefile @@ -0,0 +1,33 @@ +# +# arch/x86/kernel/acpi/realmode/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# + +always := wakeup.o + +wakeup-y += wakeup_asm.o wakemain.o video-mode.o +wakeup-y += copy.o bioscall.o regs.o + +# The link order of the video-*.o modules can matter. In particular, +# video-vga.o *must* be listed first, followed by video-vesa.o. +# Hardware-specific drivers should follow in the order they should be +# probed, and video-bios.o should typically be last. +wakeup-y += video-vga.o +wakeup-y += video-vesa.o +wakeup-y += video-bios.o + +targets += $(wakeup-y) + +WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) + +LDFLAGS_wakeup.o := -m elf_i386 -r +$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE + $(call if_changed,ld) + +bootsrc := $(src)/../../../boot + +ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) +asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S new file mode 100644 index 000000000000..f51eb0bb56ce --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S new file mode 100644 index 000000000000..dc59ebee69d8 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/copy.S @@ -0,0 +1 @@ +#include "../../../boot/copy.S" diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c new file mode 100644 index 000000000000..6206033ba202 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c new file mode 100644 index 000000000000..7deabc144a27 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-bios.c @@ -0,0 +1 @@ +#include "../../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c new file mode 100644 index 000000000000..328ad209f113 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-mode.c @@ -0,0 +1 @@ +#include "../../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c new file mode 100644 index 000000000000..9dbb9672226a --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-vesa.c @@ -0,0 +1 @@ +#include "../../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c b/arch/x86/realmode/rm/wakeup/video-vga.c new file mode 100644 index 000000000000..bcc81255f374 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-vga.c @@ -0,0 +1 @@ +#include "../../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c new file mode 100644 index 000000000000..91405d515ec6 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakemain.c @@ -0,0 +1,82 @@ +#include "wakeup.h" +#include "boot.h" + +static void 
udelay(int loops) +{ + while (loops--) + io_delay(); /* Approximately 1 us */ +} + +static void beep(unsigned int hz) +{ + u8 enable; + + if (!hz) { + enable = 0x00; /* Turn off speaker */ + } else { + u16 div = 1193181/hz; + + outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ + io_delay(); + outb(div, 0x42); /* LSB of counter */ + io_delay(); + outb(div >> 8, 0x42); /* MSB of counter */ + io_delay(); + + enable = 0x03; /* Turn on speaker */ + } + inb(0x61); /* Dummy read of System Control Port B */ + io_delay(); + outb(enable, 0x61); /* Enable timer 2 output to speaker */ + io_delay(); +} + +#define DOT_HZ 880 +#define DASH_HZ 587 +#define US_PER_DOT 125000 + +/* Okay, this is totally silly, but it's kind of fun. */ +static void send_morse(const char *pattern) +{ + char s; + + while ((s = *pattern++)) { + switch (s) { + case '.': + beep(DOT_HZ); + udelay(US_PER_DOT); + beep(0); + udelay(US_PER_DOT); + break; + case '-': + beep(DASH_HZ); + udelay(US_PER_DOT * 3); + beep(0); + udelay(US_PER_DOT); + break; + default: /* Assume it's a space */ + udelay(US_PER_DOT * 3); + break; + } + } +} + +void main(void) +{ + /* Kill machine if structures are wrong */ + if (wakeup_header.real_magic != 0x12345678) + while (1) + ; + + if (wakeup_header.realmode_flags & 4) + send_morse("...-"); + + if (wakeup_header.realmode_flags & 1) + asm volatile("lcallw $0xc000,$3"); + + if (wakeup_header.realmode_flags & 2) { + /* Need to call BIOS */ + probe_cards(0); + set_mode(wakeup_header.video_mode); + } +} diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h new file mode 100644 index 000000000000..2dfaf06b8af1 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakeup.h @@ -0,0 +1,41 @@ +/* + * Definitions for the wakeup data structure at the head of the + * wakeup code. 
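+ *
+ * (Usage sketch, mirroring the sleep.c change earlier in this series;
+ * no interfaces beyond those in this patch are assumed:
+ *	struct wakeup_header *h = (struct wakeup_header *)
+ *		__va(real_mode_header.wakeup_header);
+ *	h->video_mode = saved_video_mode;
+ * the kernel patches these fields before suspend and the real-mode
+ * stub consumes them at resume.)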
+ */ + +#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H +#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H + +#ifndef __ASSEMBLY__ +#include + +/* This must match data at wakeup.S */ +struct wakeup_header { + u16 video_mode; /* Video mode number */ + u32 pmode_entry; /* Protected mode resume point, 32-bit only */ + u16 pmode_cs; + u32 pmode_cr0; /* Protected mode cr0 */ + u32 pmode_cr3; /* Protected mode cr3 */ + u32 pmode_cr4; /* Protected mode cr4 */ + u32 pmode_efer_low; /* Protected mode EFER */ + u32 pmode_efer_high; + u64 pmode_gdt; + u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ + u32 pmode_misc_en_high; + u32 pmode_behavior; /* Wakeup routine behavior flags */ + u32 realmode_flags; + u32 real_magic; + u32 signature; /* To check we have correct structure */ +} __attribute__((__packed__)); + +extern struct wakeup_header wakeup_header; +#endif + +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 + +/* Wakeup behavior bits */ +#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 + +#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S new file mode 100644 index 000000000000..b61126cb599e --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -0,0 +1,189 @@ +/* + * ACPI wakeup real mode startup stub + */ +#include +#include +#include +#include +#include +#include "wakeup.h" + + .code16 + +/* This should match the structure in wakeup.h */ + .section ".data", "aw" + .globl wakeup_header +wakeup_header: +video_mode: .short 0 /* Video mode number */ +pmode_entry: .long 0 +pmode_cs: .short __KERNEL_CS +pmode_cr0: .long 0 /* Saved %cr0 */ +pmode_cr3: .long 0 /* Saved %cr3 */ +pmode_cr4: .long 0 /* Saved %cr4 */ +pmode_efer: .quad 0 /* Saved EFER */ +pmode_gdt: .quad 0 +pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ +pmode_behavior: .long 0 /* Wakeup behavior flags */ +realmode_flags: .long 0 +real_magic: .long 0 +signature: .long WAKEUP_HEADER_SIGNATURE + .size wakeup_header, .-wakeup_header + + .text + .code16 + .globl wakeup_start +wakeup_start: + cli + cld + + .byte 0xea /* ljmpw */ + .word 3f + .word real_mode_seg +3: + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ + .word 3f + .word real_mode_seg +3: + /* Set up segments */ + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + lidtl wakeup_idt + + movl $wakeup_stack_end, %esp + + /* Clear the EFLAGS */ + pushl $0 + popfl + + /* Check header signature... */ + movl signature, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax + jne bogus_real_magic + + /* Check we really have everything... */ + movl end_signature, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax + jne bogus_real_magic + + /* Call the C code */ + calll main + + /* Restore MISC_ENABLE before entering protected mode, in case + BIOS decided to clear XD_DISABLE during S3. 
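+	   (If the XD/NX state the saved page tables rely on is not
+	   restored first, re-enabling paging could fault on PTEs whose
+	   NX bit has become a reserved bit; hence the MSR is rewritten
+	   while still executing from the real-mode stub.)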
*/ + movl pmode_behavior, %eax + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + jnc 1f + + movl pmode_misc_en, %eax + movl pmode_misc_en + 4, %edx + movl $MSR_IA32_MISC_ENABLE, %ecx + wrmsr +1: + + /* Do any other stuff... */ + +#ifndef CONFIG_64BIT + /* This could also be done in C code... */ + movl pmode_cr3, %eax + movl %eax, %cr3 + + movl pmode_cr4, %ecx + jecxz 1f + movl %ecx, %cr4 +1: + movl pmode_efer, %eax + movl pmode_efer + 4, %edx + movl %eax, %ecx + orl %edx, %ecx + jz 1f + movl $MSR_EFER, %ecx + wrmsr +1: + + lgdtl pmode_gdt + + /* This really couldn't... */ + movl pmode_cr0, %eax + movl %eax, %cr0 + ljmpl *pmode_entry +#else + jmp trampoline_data +#endif + +bogus_real_magic: +1: + hlt + jmp 1b + + .section ".rodata","a" + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + + .globl wakeup_gdt + .balign 16 +wakeup_gdt: + .word 3*8-1 /* Self-descriptor */ + .long pa_wakeup_gdt + .word 0 + + .word 0xffff /* 16-bit code segment @ real_mode_base */ + .long 0x9b000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + + .word 0xffff /* 16-bit data segment @ real_mode_base */ + .long 0x93000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + .size wakeup_gdt, .-wakeup_gdt + + .data + .balign 8 + + /* This is the standard real-mode IDT */ +wakeup_idt: + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 + + .globl HEAP, heap_end +HEAP: + .long wakeup_heap +heap_end: + .long wakeup_stack + + .bss +wakeup_heap: + .space 2048 +wakeup_stack: + .space 2048 +wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE -- cgit v1.2.1 From f156ffc439951b63cfa9f4d999a8d54267f13282 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:30 +0300 Subject: x86, realmode: Set permission for real mode pages Set proper permissions for rodata, text and data, removing the realmode trampoline area as a remaining RWX memory mapping in the kernel. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-8-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/realmode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index a465775b32f2..d85ac20bb4eb 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -86,7 +86,21 @@ static int __init set_real_mode_permissions(void) PAGE_ALIGN(real_mode_header.end) - __pa(real_mode_base); - set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + size_t ro_size = + PAGE_ALIGN(real_mode_header.ro_end) - + __pa(real_mode_base); + + size_t text_size = + PAGE_ALIGN(real_mode_header.ro_end) - + real_mode_header.text_start; + + unsigned long text_start = + (unsigned long) __va(real_mode_header.text_start); + + set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT); + set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); + return 0; } -- cgit v1.2.1 From 487f50ffeb142d8f86fff6e43a8852ce3d46c173 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 8 May 2012 21:22:32 +0300 Subject: x86, realmode: Add .text64 section, make barrier symbols absolute Add a .text64 section. The purpose of this is to keep 16-, 32- and 64-bit code segregated into separate sections, mainly to keep disassembly sane. Move barrier symbols out of sections to avoid the "symbol in empty section" problem in some versions of GNU ld. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-10-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/realmode.lds.S | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 91b83ea55c37..4d4afcaf5f02 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -32,8 +32,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + pa_text_start = .; .text : { - pa_text_start = .; *(.text) *(.text.*) } @@ -41,9 +41,14 @@ SECTIONS .text32 : { *(.text32) *(.text32.*) - pa_ro_end = .; } + .text64 : { + *(.text64) + *(.text64.*) + } + pa_ro_end = .; + . = ALIGN(PAGE_SIZE); .data : { *(.data) @@ -59,8 +64,8 @@ SECTIONS . = ALIGN(4); .signature : { *(.signature) - pa_end = .; } + pa_end = .; /DISCARD/ : { *(.note*) -- cgit v1.2.1 From 024742861124ef26dae4cfc620250f8f47ac934a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:33 +0300 Subject: x86, realmode: Move bits to the proper sections in trampoline_64.S Move various bits to the sections they really belong in in trampoline_64.S. Use GLOBAL() rather than ENTRY() for data objects: ENTRY() should only be used with code and forces alignment to 16 bytes. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-11-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_64.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 063da008d520..66c58cf15503 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -80,6 +80,7 @@ no_longmode: jmp no_longmode #include "../kernel/verify_cpu.S" + .section ".text32","ax" .code32 .balign 4 ENTRY(startup_32) @@ -114,6 +115,7 @@ ENTRY(startup_32) */ ljmpl *(pa_startup_64_vector) + .section ".text64","ax" .code64 .balign 4 ENTRY(startup_64) @@ -123,7 +125,8 @@ ENTRY(startup_64) addl %esi, %eax jmp *%rax - # Careful these need to be in the same 64K segment as the above; + .section ".rodata","a" + .balign 16 tidt: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L @@ -153,9 +156,8 @@ startup_64_vector: .word __KERNEL_CS, 0 .data - .balign 4 -ENTRY(trampoline_status) +GLOBAL(trampoline_status) .long 0 trampoline_stack: @@ -164,7 +166,7 @@ trampoline_stack_end: .globl level3_ident_pgt .globl level3_kernel_pgt -ENTRY(trampoline_level4_pgt) +GLOBAL(trampoline_level4_pgt) level3_ident_pgt: .quad 0 .fill 510,8,0 level3_kernel_pgt: .quad 0 -- cgit v1.2.1 From f7436a9da902922a48cccc208099763b87d6171f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:34 +0300 Subject: x86, realmode: Align .data section in trampoline_32.S Specify the alignment of the .data section in trampoline_32.S. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-12-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 18cb7fc9fad4..1f9e3316f73d 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -68,7 +68,7 @@ trampoline_data: .data .globl startup_32_smp, boot_gdt, trampoline_status - + .balign 4 boot_gdt_descr: .word __BOOT_DS + 7 # gdt limit boot_gdt: -- cgit v1.2.1 From 056a43a6d3ab903a798d8ee4435ad67d6fccc3e6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:35 +0300 Subject: x86, realmode: Remove indirect jumps in trampoline_64.S Remove indirect jumps in trampoline_64.S which are no longer necessary: the realmode code can relocate the absolute jumps correctly from the start. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-13-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_64.S | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 66c58cf15503..77b72b45d705 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -73,7 +73,7 @@ ENTRY(trampoline_data) lmsw %ax # into protected mode # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector) + ljmpl $__KERNEL32_CS, $pa_startup_32 no_longmode: hlt @@ -113,7 +113,7 @@ ENTRY(startup_32) * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. */ - ljmpl *(pa_startup_64_vector) + ljmpl $__KERNEL_CS, $pa_startup_64 .section ".text64","ax" .code64 @@ -144,17 +144,6 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .balign 4 -startup_32_vector: - .long pa_startup_32 - .word __KERNEL32_CS, 0 - - .balign 4 - .globl startup_64_vector -startup_64_vector: - .long pa_startup_64 - .word __KERNEL_CS, 0 - .data .balign 4 GLOBAL(trampoline_status) -- cgit v1.2.1 From 968ff9ee56f1e3ed4ff4a6d10185865dc77d8f7e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:36 +0300 Subject: x86, realmode: Remove indirect jumps in trampoline_32 and wakeup_asm Remove indirect jumps in trampoline_32.S and the 32-bit part of wakeup_asm.S. There exist systems which are known to do weird things if an SMI comes in right after a mode switch, and the safest way to deal with it is to always follow with a simple absolute far jump. In the 64-bit code we then do a register indirect near jump; follow that pattern for the 32-bit code. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-14-git-send-email-jarkko.sakkinen@intel.com
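A minimal sketch of the pattern this message describes, not part of the patch itself; pa_final_target is a hypothetical symbol standing in for wherever the CPU should eventually land:

	.code16
	movl	pa_final_target, %eax		# load the eventual target while still in 16-bit mode
	ljmpl	$__BOOT_CS, $pa_startup_32	# simple absolute far jump right after the mode switch

	.code32
startup_32:
	jmp	*%eax				# then a register-indirect near jump, as in the 64-bit code

The absolute far jump gives an SMI arriving mid-switch a well-defined CS:EIP to resume at; the indirect dispatch is deferred until the CPU is safely in protected mode.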
--- arch/x86/realmode/rm/trampoline_32.S | 22 +++++++++++++--------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 8 +++++--- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1f9e3316f73d..1315ef48dbf1 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -47,24 +47,29 @@ trampoline_data: cli # We should be safe anyway + movl startup_32_smp, %eax # where we need to go + movl $0xA5A5A5A5, trampoline_status # write marker for master knows we're running - /* GDT tables in non default location kernel can be beyond 16MB and + /* + * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default * operand size is 16bit. Use lgdtl instead to force operand size * to 32 bit. */ - lidtl boot_idt_descr # load idt with 0, 0 lgdtl boot_gdt_descr # load gdt with whatever is appropriate - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode + movw $1, %dx # protected mode (PE) bit + lmsw %dx # into protected mode - # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S - ljmpl *(startup_32_smp) + ljmpl $__BOOT_CS, $pa_startup_32 + + .section ".text32","ax" + .code32 +ENTRY(startup_32) # note: also used from wakeup_asm.S + jmp *%eax .data .globl startup_32_smp, boot_gdt, trampoline_status @@ -82,5 +87,4 @@ trampoline_status: .long 0 startup_32_smp: - .long 0x00000000 - .word __BOOT_CS, 0 + .long 0 diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index b61126cb599e..4c5c5f2bfbec 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -124,9 +124,11 @@ wakeup_start: lgdtl pmode_gdt /* This really couldn't... */ - movl pmode_cr0, %eax - movl %eax, %cr0 - ljmpl *pmode_entry + movl pmode_entry, %eax + movl pmode_cr0, %ecx + movl %ecx, %cr0 + ljmpl $__KERNEL_CS, $pa_startup_32 + /* -> jmp *%eax in trampoline_32.S */ #else jmp trampoline_data #endif -- cgit v1.2.1 From e5684ec438a094bec0f7d5c52652c0901b48b613 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:37 +0300 Subject: x86, realmode: Replace open-coded ljmpw with a macro We cannot code an ljmpw to the real-mode segment directly, because gas refuses to assemble an ljmp with a symbolic segment. Instead of open-coding it everywhere, define a macro and use it for this case. This is specifically an ljmpw from a 16-bit segment. This is okay, as one should never enter real mode from a 32-bit segment: if one does, the CPU ends up in a bizarre (and useless) mode sometimes called "unreal mode" where segments behave like real mode but the default address and operand sizes are 32 bits. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-15-git-send-email-jarkko.sakkinen@intel.com
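A sketch of what the macro boils down to, reconstructed from the header added below; the label is arbitrary. gas happily emits the raw jmp ptr16:16 encoding even though it rejects the mnemonic with a symbolic segment, and the segment word is later patched by the 16-bit relocation pass when the blob is copied into low memory:

	/* hand-assembled equivalent of: ljmpw $real_mode_seg, $1f */
	.byte	0xea			/* opcode for a far absolute 16-bit jump */
	.word	1f			/* 16-bit offset within the segment */
	.word	real_mode_seg		/* segment word, fixed up at setup time */
1: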
--- arch/x86/realmode/rm/realmode.h | 16 ++++++++++++++++ arch/x86/realmode/rm/reboot_32.S | 6 ++---- arch/x86/realmode/rm/trampoline_32.S | 5 ++--- arch/x86/realmode/rm/trampoline_64.S | 5 ++--- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 9 +++------ 5 files changed, 25 insertions(+), 16 deletions(-) create mode 100644 arch/x86/realmode/rm/realmode.h (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h new file mode 100644 index 000000000000..15ab6335f843 --- /dev/null +++ b/arch/x86/realmode/rm/realmode.h @@ -0,0 +1,16 @@ +#ifndef ARCH_X86_REALMODE_RM_REALMODE_H +#define ARCH_X86_REALMODE_RM_REALMODE_H + +#ifdef __ASSEMBLY__ + +/* + * 16-bit ljmpw to the real_mode_seg + * + * This must be open-coded since gas will choke on using a + * relocatable symbol for the segment portion. + */ +#define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg + +#endif /* __ASSEMBLY__ */ + +#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */ diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index 83803c222b4a..e90f8c4bbae2 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -2,6 +2,7 @@ #include #include #include +#include "realmode.h" /* * The following code and data reboots the machine by switching to real @@ -82,10 +83,7 @@ machine_real_restart_asm16: 2: andb $0x10, %dl movl %edx, %cr0 - .byte 0xea /* ljmpw */ - .word 3f /* Offset */ - .word real_mode_seg /* Segment */ - + LJMPW_RM(3f) 3: testb $0, %al jz bios diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1315ef48dbf1..279f82ef7a9e 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -29,6 +29,7 @@ #include #include #include +#include "realmode.h" .text .code16 @@ -38,9 +39,7 @@ trampoline_data: wbinvd # Needed for NUMA-Q should be harmless for others - .byte 0xea # ljmpw - .word 1f # Offset - .word real_mode_seg # Segment + LJMPW_RM(1f) 1: mov %cs, %ax # Code and data in the same place mov %ax, %ds diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 77b72b45d705..7459c52f0c25 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -31,6 +31,7 @@ #include #include #include +#include "realmode.h" .text .balign PAGE_SIZE @@ -40,9 +41,7 @@ ENTRY(trampoline_data) cli # We should be safe anyway wbinvd - .byte 0xea # ljmpw - .word 1f # Offset - .word real_mode_seg # Segment + LJMPW_RM(1f) 1: mov %cs, %ax # Code and data in the same place mov %ax, %ds diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index 4c5c5f2bfbec..8064e1c3591b 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -6,6 +6,7 @@ #include #include #include +#include "../realmode.h" #include "wakeup.h" .code16 @@ -36,9 +37,7 @@ wakeup_start: cli cld - .byte 0xea /* ljmpw */ - .word 3f - .word real_mode_seg + LJMPW_RM(3f) 3: /* Apparently some dimwit BIOS programmers don't know how to program a PM to RM transition, and we might end up here with @@ -59,9 +58,7 @@ wakeup_start: andb $~X86_CR0_PE, %al movl %eax, %cr0 - .byte 0xea /* ljmpw */ - .word 3f - .word real_mode_seg + LJMPW_RM(3f) 3: /* Set up segments */ movw %cs, %ax -- cgit v1.2.1 From be60828920d23758da8124bed771404a0438f369 Mon Sep 17 00:00:00 2001 From:
"H. Peter Anvin" Date: Tue, 8 May 2012 21:22:38 +0300 Subject: x86, realmode: Move trampoline_*.S early in the link order Move trampoline_*.S earlier in the link order so it ends up being first in the text segment; since the SIPI vector requires 4K alignment it otherwise ends up padding the .text segment with that much completely unnecessarily. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-16-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2432acb6b04f..2423142b4da4 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -12,8 +12,8 @@ subdir- := wakeup always := realmode.bin realmode-y += header.o -realmode-$(CONFIG_X86_32) += reboot_32.o realmode-y += trampoline_$(BITS).o +realmode-$(CONFIG_X86_32) += reboot_32.o realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o targets += $(realmode-y) -- cgit v1.2.1 From 6feb592dceaed1a6cf26c9747b1180958d5156cd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:39 +0300 Subject: x86, realmode: Fix always-zero test in reboot_32.S A test instruction is an "and", and an and with zero is always zero. This would cause us to always take the BIOS path, not the APM path, in case anyone actually cares... Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-17-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/reboot_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index e90f8c4bbae2..50ba994ba921 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -85,7 +85,7 @@ machine_real_restart_asm16: movl %edx, %cr0 LJMPW_RM(3f) 3: - testb $0, %al + andw %ax, %ax jz bios apm: -- cgit v1.2.1 From 8e029fcdd8702719c9179317cae9ef84ebe7027e Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:40 +0300 Subject: x86, realmode: fix 64-bit wakeup sequence There were number of issues in wakeup sequence: - Wakeup stack was placed in hardcoded address. - NX bit in EFER was not enabled. - Initialization incorrectly set physical address of secondary_startup_64. - Some alignment issues. This patch fixes these issues and in addition: - Unifies coding conventions in .S files. - Sets alignments of code and data right. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-18-git-send-email-jarkko.sakkinen@intel.com Originally-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/realmode.c | 2 +- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 2 +- arch/x86/realmode/rm/reboot_32.S | 18 ++++---- arch/x86/realmode/rm/stack.S | 19 ++++++++ arch/x86/realmode/rm/trampoline_32.S | 29 ++++++------ arch/x86/realmode/rm/trampoline_64.S | 67 ++++++++++++---------------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 75 +++++++++++++++----------------- arch/x86/realmode/rmpiggy.S | 4 +- 9 files changed, 110 insertions(+), 107 deletions(-) create mode 100644 arch/x86/realmode/rm/stack.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index d85ac20bb4eb..e7bf82a409bf 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -64,7 +64,7 @@ void __init setup_real_mode(void) *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); #else *((u64 *) __va(real_mode_header.startup_64_smp)) = - (u64) __pa(secondary_startup_64); + (u64)secondary_startup_64; *((u64 *) __va(real_mode_header.level3_ident_pgt)) = __pa(level3_ident_pgt) + _KERNPG_TABLE; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2423142b4da4..c2c27a41ab8f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -13,6 +13,7 @@ always := realmode.bin realmode-y += header.o realmode-y += trampoline_$(BITS).o +realmode-y += stack.o realmode-$(CONFIG_X86_32) += reboot_32.o realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 730b1316c099..a91ec8f6b15f 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -9,7 +9,7 @@ .section ".header", "a" -ENTRY(real_mode_header) +GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end .long pa_end diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index 50ba994ba921..8d9bfd13a93e 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -16,10 +16,9 @@ */ .section ".text32", "ax" .code32 - .globl machine_real_restart_asm - .balign 16 -machine_real_restart_asm: + .balign 16 +ENTRY(machine_real_restart_asm) /* Set up the IDT for real mode. */ lidtl pa_machine_real_restart_idt @@ -67,7 +66,7 @@ machine_real_restart_asm: .text .code16 - .balign 16 + .balign 16 machine_real_restart_asm16: 1: xorl %ecx, %ecx @@ -102,15 +101,15 @@ bios: ljmpw $0xf000, $0xfff0 .section ".rodata", "a" - .globl machine_real_restart_idt, machine_real_restart_gdt - .balign 16 -machine_real_restart_idt: + .balign 16 +GLOBAL(machine_real_restart_idt) .word 0xffff /* Length - real mode default value */ .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) - .balign 16 -machine_real_restart_gdt: + .balign 16 +GLOBAL(machine_real_restart_gdt) /* Self-pointer */ .word 0xffff /* Length - real mode default value */ .long pa_machine_real_restart_gdt @@ -130,3 +129,4 @@ machine_real_restart_gdt: * semantics we don't have to reload the segments once CR0.PE = 0. 
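 *
 * (Illustrative decode, assuming the standard GDT_ENTRY(flags, base,
 * limit) layout: the entry below carries access byte 0x93 -- present,
 * DPL 0, writable data -- base 0x100 and limit 0xffff. A segment
 * register still holding the selector 0x10 keeps working after
 * CR0.PE is cleared because real mode re-interprets 0x10 as a
 * paragraph address, 0x10 << 4 = 0x100, the same base the descriptor
 * carries.)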
*/ .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S new file mode 100644 index 000000000000..867ae87adfae --- /dev/null +++ b/arch/x86/realmode/rm/stack.S @@ -0,0 +1,19 @@ +/* + * Common heap and stack allocations + */ + +#include + + .data +GLOBAL(HEAP) + .long rm_heap +GLOBAL(heap_end) + .long rm_stack + + .bss + .balign 16 +GLOBAL(rm_heap) + .space 2048 +GLOBAL(rm_stack) + .space 2048 +GLOBAL(rm_stack_end) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 279f82ef7a9e..1ecdbb59191b 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -33,10 +33,9 @@ .text .code16 - .globl trampoline_data - .balign PAGE_SIZE -trampoline_data: + .balign PAGE_SIZE +ENTRY(trampoline_data) wbinvd # Needed for NUMA-Q should be harmless for others LJMPW_RM(1f) @@ -70,20 +69,22 @@ trampoline_data: ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax - .data - .globl startup_32_smp, boot_gdt, trampoline_status - .balign 4 -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit -boot_gdt: - .long 0 # gdt base + .section ".rodata","a" + .balign 4 boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L -trampoline_status: - .long 0 + .data -startup_32_smp: - .long 0 +boot_gdt_descr: + .word __BOOT_DS + 7 # gdt limit +GLOBAL(boot_gdt) + .long 0 # gdt base + + .bss + + .balign 4 +GLOBAL(trampoline_status) .space 4 +GLOBAL(startup_32_smp) .space 4 diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 7459c52f0c25..f71ea0800d3d 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -52,7 +52,7 @@ ENTRY(trampoline_data) # write marker for master knows we're running # Setup stack - movw $trampoline_stack_end, %sp + movl $rm_stack_end, %esp call verify_cpu # Verify the cpu supports long mode testl %eax, %eax # Check for return code @@ -68,8 +68,11 @@ ENTRY(trampoline_data) lidtl tidt # load idt with 0, 0 lgdtl tgdt # load gdt with whatever is appropriate - mov $X86_CR0_PE, %ax # protected mode (PE) bit - lmsw %ax # into protected mode + movw $__KERNEL_DS, %dx # Data segment descriptor + + # Enable protected mode + movl $X86_CR0_PE, %eax # protected mode (PE) bit + movl %eax, %cr0 # into protected mode # flush prefetch and jump to startup_32 ljmpl $__KERNEL32_CS, $pa_startup_32 @@ -83,27 +86,27 @@ no_longmode: .code32 .balign 4 ENTRY(startup_32) - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds + movl %edx, %ss + addl $pa_real_mode_base, %esp + movl %edx, %ds + movl %edx, %es + movl %edx, %fs + movl %edx, %gs movl $X86_CR4_PAE, %eax movl %eax, %cr4 # Enable PAE mode - movl pa_startup_64_smp, %esi - movl pa_startup_64_smp_high, %edi - - # Setup trampoline 4 level pagetables - leal pa_trampoline_level4_pgt, %eax + # Setup trampoline 4 level pagetables + movl $pa_level3_ident_pgt, %eax movl %eax, %cr3 movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode + movl $((1 << _EFER_LME) | (1 << _EFER_NX)), %eax # Enable Long Mode xorl %edx, %edx wrmsr # Enable paging and in turn activate Long Mode - # Enable protected mode - movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax movl %eax, %cr0 /* @@ -119,10 +122,7 @@ ENTRY(startup_32) .balign 4 ENTRY(startup_64) # Now jump into the kernel using virtual addresses - movl %edi, %eax - shlq $32, %rax - addl %esi, %eax - jmp *%rax 
+ jmpq *startup_64_smp(%rip) .section ".rodata","a" .balign 16 @@ -132,10 +132,10 @@ tidt: # Duplicate the global descriptor table # so the kernel can live anywhere - .balign 4 + .balign 16 .globl tgdt tgdt: - .short tgdt_end - tgdt # gdt limit + .short tgdt_end - tgdt - 1 # gdt limit .long pa_tgdt .short 0 .quad 0x00cf9b000000ffff # __KERNEL32_CS @@ -143,23 +143,12 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .data - .balign 4 -GLOBAL(trampoline_status) - .long 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: - - .globl level3_ident_pgt - .globl level3_kernel_pgt -GLOBAL(trampoline_level4_pgt) - level3_ident_pgt: .quad 0 - .fill 510,8,0 - level3_kernel_pgt: .quad 0 - - .globl startup_64_smp - .globl startup_64_smp_high -startup_64_smp: .long 0 -startup_64_smp_high: .long 0 + .bss + + .balign PAGE_SIZE +GLOBAL(level3_ident_pgt) .space 511*8 +GLOBAL(level3_kernel_pgt) .space 8 + + .balign 8 +GLOBAL(startup_64_smp) .space 8 +GLOBAL(trampoline_status) .space 4 diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index 8064e1c3591b..f81c1cd99eaf 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -1,6 +1,7 @@ /* * ACPI wakeup real mode startup stub */ +#include #include #include #include @@ -9,31 +10,33 @@ #include "../realmode.h" #include "wakeup.h" - .code16 + .code16 /* This should match the structure in wakeup.h */ - .section ".data", "aw" - .globl wakeup_header -wakeup_header: -video_mode: .short 0 /* Video mode number */ -pmode_entry: .long 0 -pmode_cs: .short __KERNEL_CS -pmode_cr0: .long 0 /* Saved %cr0 */ -pmode_cr3: .long 0 /* Saved %cr3 */ -pmode_cr4: .long 0 /* Saved %cr4 */ -pmode_efer: .quad 0 /* Saved EFER */ -pmode_gdt: .quad 0 -pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ -pmode_behavior: .long 0 /* Wakeup behavior flags */ -realmode_flags: .long 0 -real_magic: .long 0 -signature: .long WAKEUP_HEADER_SIGNATURE - .size wakeup_header, .-wakeup_header + .section ".data", "aw" + + .balign 16 +GLOBAL(wakeup_header) + video_mode: .short 0 /* Video mode number */ + pmode_entry: .long 0 + pmode_cs: .short __KERNEL_CS + pmode_cr0: .long 0 /* Saved %cr0 */ + pmode_cr3: .long 0 /* Saved %cr3 */ + pmode_cr4: .long 0 /* Saved %cr4 */ + pmode_efer: .quad 0 /* Saved EFER */ + pmode_gdt: .quad 0 + pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ + pmode_behavior: .long 0 /* Wakeup behavior flags */ + realmode_flags: .long 0 + real_magic: .long 0 + signature: .long WAKEUP_HEADER_SIGNATURE +END(wakeup_header) .text .code16 - .globl wakeup_start -wakeup_start: + + .balign 16 +ENTRY(wakeup_start) cli cld @@ -62,12 +65,14 @@ wakeup_start: 3: /* Set up segments */ movw %cs, %ax + movw %ax, %ss + movl $rm_stack_end, %esp movw %ax, %ds movw %ax, %es - movw %ax, %ss - lidtl wakeup_idt + movw %ax, %fs + movw %ax, %gs - movl $wakeup_stack_end, %esp + lidtl wakeup_idt /* Clear the EFLAGS */ pushl $0 @@ -145,9 +150,8 @@ bogus_real_magic: * be the case for other laptops or integrated video devices. 
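 *
 * (Illustrative decode of the descriptors below, assuming the usual
 * 8-byte descriptor layout: .word 0xffff is limit[15:0]; the .long
 * packs base[23:0] -- pa_real_mode_base fits in 24 bits -- with the
 * access byte 0x9b (code) or 0x93 (data) in its top byte; and
 * .word 0x008f supplies limit[19:16] = 0xf with the granularity bit
 * set, hence the 4 GB limit, plus base[31:24] = 0.)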
*/ - .globl wakeup_gdt .balign 16 -wakeup_gdt: +GLOBAL(wakeup_gdt) .word 3*8-1 /* Self-descriptor */ .long pa_wakeup_gdt .word 0 @@ -159,29 +163,18 @@ wakeup_gdt: .word 0xffff /* 16-bit data segment @ real_mode_base */ .long 0x93000000 + pa_real_mode_base .word 0x008f /* big real mode */ - .size wakeup_gdt, .-wakeup_gdt +END(wakeup_gdt) - .data + .section ".rodata","a" .balign 8 /* This is the standard real-mode IDT */ -wakeup_idt: + .balign 16 +GLOBAL(wakeup_idt) .word 0xffff /* limit */ .long 0 /* address */ .word 0 - - .globl HEAP, heap_end -HEAP: - .long wakeup_heap -heap_end: - .long wakeup_stack - - .bss -wakeup_heap: - .space 2048 -wakeup_stack: - .space 2048 -wakeup_stack_end: +END(wakeup_idt) .section ".signature","a" end_signature: diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index 6047d7f604cf..fd72a99d12ae 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -9,10 +9,10 @@ .balign PAGE_SIZE -ENTRY(real_mode_blob) +GLOBAL(real_mode_blob) .incbin "arch/x86/realmode/rm/realmode.bin" END(real_mode_blob) -ENTRY(real_mode_relocs) +GLOBAL(real_mode_relocs) .incbin "arch/x86/realmode/rm/realmode.relocs" END(real_mode_relocs) -- cgit v1.2.1 From b429dbf6e866bd6dadb56fae66f61f611cde57ff Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:41 +0300 Subject: x86, realmode: don't copy real_mode_header Replaced copying of real_mode_header with a pointer to beginning of RM memory. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-19-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 5 ++-- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/realmode.c | 57 ++++++++++++++++--------------------- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smpboot.c | 4 +-- arch/x86/kernel/tboot.c | 2 +- arch/x86/realmode/rm/header.S | 1 - arch/x86/realmode/rm/realmode.lds.S | 1 - arch/x86/realmode/rmpiggy.S | 2 ++ 9 files changed, 34 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 1bfc74d213a4..d3ae49f4c3ef 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -8,7 +8,6 @@ struct real_mode_header { u32 text_start; u32 ro_end; - u32 end; /* reboot */ #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; @@ -30,8 +29,8 @@ struct real_mode_header { #endif } __attribute__((__packed__)); -extern struct real_mode_header real_mode_header; -extern unsigned char *real_mode_base; +extern struct real_mode_header *real_mode_header; +extern unsigned char real_mode_blob_end[]; extern unsigned long init_rsp; extern unsigned long initial_code; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d941b62da4b6..6ca3f54ebe7d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -38,7 +38,7 @@ asmlinkage void acpi_enter_s3(void) int acpi_suspend_lowlevel(void) { struct wakeup_header *header = - (struct wakeup_header *) __va(real_mode_header.wakeup_header); + (struct wakeup_header *) __va(real_mode_header->wakeup_header); if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index e7bf82a409bf..632c810ec8ea 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -5,8 +5,7 @@ #include #include -unsigned char *real_mode_base; -struct real_mode_header real_mode_header; 
+struct real_mode_header *real_mode_header; void __init setup_real_mode(void) { @@ -17,33 +16,32 @@ void __init setup_real_mode(void) u32 *ptr; u16 *seg; int i; + unsigned char *base; - struct real_mode_header *header = - (struct real_mode_header *) real_mode_blob; - - size_t size = PAGE_ALIGN(header->end); + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (!mem) panic("Cannot allocate trampoline\n"); - real_mode_base = __va(mem); + base = __va(mem); memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - real_mode_base, (unsigned long long)mem, size); + base, (unsigned long long)mem, size); - memcpy(real_mode_base, real_mode_blob, size); + memcpy(base, real_mode_blob, size); - real_mode_seg = __pa(real_mode_base) >> 4; + real_mode_seg = __pa(base) >> 4; rel = (u32 *) real_mode_relocs; /* 16-bit segment relocations. */ count = rel[0]; rel = &rel[1]; for (i = 0; i < count; i++) { - seg = (u16 *) (real_mode_base + rel[i]); + seg = (u16 *) (base + rel[i]); *seg = real_mode_seg; } @@ -51,25 +49,21 @@ void __init setup_real_mode(void) count = rel[i]; rel = &rel[i + 1]; for (i = 0; i < count; i++) { - ptr = (u32 *) (real_mode_base + rel[i]); - *ptr += __pa(real_mode_base); + ptr = (u32 *) (base + rel[i]); + *ptr += __pa(base); } - /* Copied header will contain relocated physical addresses. */ - memcpy(&real_mode_header, real_mode_base, - sizeof(struct real_mode_header)); - #ifdef CONFIG_X86_32 - *((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp); - *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); + *((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp); + *((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt); #else - *((u64 *) __va(real_mode_header.startup_64_smp)) = + *((u64 *) __va(real_mode_header->startup_64_smp)) = (u64)secondary_startup_64; - *((u64 *) __va(real_mode_header.level3_ident_pgt)) = + *((u64 *) __va(real_mode_header->level3_ident_pgt)) = __pa(level3_ident_pgt) + _KERNPG_TABLE; - *((u64 *) __va(real_mode_header.level3_kernel_pgt)) = + *((u64 *) __va(real_mode_header->level3_kernel_pgt)) = __pa(level3_kernel_pgt) + _KERNPG_TABLE; #endif } @@ -82,23 +76,22 @@ void __init setup_real_mode(void) */ static int __init set_real_mode_permissions(void) { - size_t all_size = - PAGE_ALIGN(real_mode_header.end) - - __pa(real_mode_base); + unsigned char *base = (unsigned char *) real_mode_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); size_t ro_size = - PAGE_ALIGN(real_mode_header.ro_end) - - __pa(real_mode_base); + PAGE_ALIGN(real_mode_header->ro_end) - + __pa(base); size_t text_size = - PAGE_ALIGN(real_mode_header.ro_end) - - real_mode_header.text_start; + PAGE_ALIGN(real_mode_header->ro_end) - + real_mode_header->text_start; unsigned long text_start = - (unsigned long) __va(real_mode_header.text_start); + (unsigned long) __va(real_mode_header->text_start); - set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); - set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT); + set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); + set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); return 0; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 
050eff29a4bb..658f856f09a3 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -336,7 +336,7 @@ core_initcall(reboot_init); void machine_real_restart(unsigned int type) { void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header.machine_real_restart_asm; + real_mode_header->machine_real_restart_asm; local_irq_disable(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c7971ea74bd0..b8c0661e2341 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -665,9 +665,9 @@ static void __cpuinit announce_cpu(int cpu, int apicid) static int __cpuinit do_boot_cpu(int apicid, int cpu) { volatile u32 *trampoline_status = - (volatile u32 *) __va(real_mode_header.trampoline_status); + (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ - unsigned long start_ip = real_mode_header.trampoline_data; + unsigned long start_ip = real_mode_header->trampoline_data; unsigned long boot_error = 0; int timeout; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c136e2325062..65adda4fde93 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -202,7 +202,7 @@ static int tboot_setup_sleep(void) } tboot->acpi_sinfo.kernel_s3_resume_vector = - real_mode_header.wakeup_start; + real_mode_header->wakeup_start; return 0; } diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index a91ec8f6b15f..c83005c4d455 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -12,7 +12,6 @@ GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end - .long pa_end #ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm #endif diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 4d4afcaf5f02..86b2e8d6b1f1 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -65,7 +65,6 @@ SECTIONS .signature : { *(.signature) } - pa_end = .; /DISCARD/ : { *(.note*) diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index fd72a99d12ae..204c6ece0e97 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -13,6 +13,8 @@ GLOBAL(real_mode_blob) .incbin "arch/x86/realmode/rm/realmode.bin" END(real_mode_blob) +GLOBAL(real_mode_blob_end); + GLOBAL(real_mode_relocs) .incbin "arch/x86/realmode/rm/realmode.relocs" END(real_mode_relocs) -- cgit v1.2.1 From c4845474a01f699966272536e8416222e3f2d2cb Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:42 +0300 Subject: x86, realmode: flatten rm hierarchy Simplified the hierarchy under the rm directory to a flat directory, because it is no longer really justified to have a separate directory for the wakeup code. It only adds more complexity. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-20-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/realmode/rm/Makefile | 20 ++-- arch/x86/realmode/rm/bioscall.S | 1 + arch/x86/realmode/rm/copy.S | 1 + arch/x86/realmode/rm/regs.c | 1 + arch/x86/realmode/rm/video-bios.c | 1 + arch/x86/realmode/rm/video-mode.c | 1 + arch/x86/realmode/rm/video-vesa.c | 1 + arch/x86/realmode/rm/video-vga.c | 1 + arch/x86/realmode/rm/wakemain.c | 82 ++++++++++++++ arch/x86/realmode/rm/wakeup.h | 41 +++++++ arch/x86/realmode/rm/wakeup/.gitignore | 3 - arch/x86/realmode/rm/wakeup/Makefile | 33 ------ arch/x86/realmode/rm/wakeup/bioscall.S | 1 - arch/x86/realmode/rm/wakeup/copy.S | 1 - arch/x86/realmode/rm/wakeup/regs.c | 1 - arch/x86/realmode/rm/wakeup/video-bios.c | 1 - arch/x86/realmode/rm/wakeup/video-mode.c | 1 - arch/x86/realmode/rm/wakeup/video-vesa.c | 1 - arch/x86/realmode/rm/wakeup/video-vga.c | 1 - arch/x86/realmode/rm/wakeup/wakemain.c | 82 -------------- arch/x86/realmode/rm/wakeup/wakeup.h | 41 ------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 181 ------------------------------- arch/x86/realmode/rm/wakeup_asm.S | 181 +++++++++++++++++++++++++++++++ 24 files changed, 325 insertions(+), 355 deletions(-) create mode 100644 arch/x86/realmode/rm/bioscall.S create mode 100644 arch/x86/realmode/rm/copy.S create mode 100644 arch/x86/realmode/rm/regs.c create mode 100644 arch/x86/realmode/rm/video-bios.c create mode 100644 arch/x86/realmode/rm/video-mode.c create mode 100644 arch/x86/realmode/rm/video-vesa.c create mode 100644 arch/x86/realmode/rm/video-vga.c create mode 100644 arch/x86/realmode/rm/wakemain.c create mode 100644 arch/x86/realmode/rm/wakeup.h delete mode 100644 arch/x86/realmode/rm/wakeup/.gitignore delete mode 100644 arch/x86/realmode/rm/wakeup/Makefile delete mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S delete mode 100644 arch/x86/realmode/rm/wakeup/copy.S delete mode 100644 arch/x86/realmode/rm/wakeup/regs.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-mode.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-vesa.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c delete mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S create mode 100644 arch/x86/realmode/rm/wakeup_asm.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 6ca3f54ebe7d..95bf99de9058 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,7 +16,7 @@ #include #include -#include "../../realmode/rm/wakeup/wakeup.h" +#include "../../realmode/rm/wakeup.h" #include "sleep.h" unsigned long acpi_realmode_flags; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index c2c27a41ab8f..fc8854b09dfa 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -7,21 +7,26 @@ # # -subdir- := wakeup - always := realmode.bin realmode-y += header.o realmode-y += trampoline_$(BITS).o realmode-y += stack.o realmode-$(CONFIG_X86_32) += reboot_32.o -realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o +realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) + +wakeup-objs := wakeup_asm.o wakemain.o video-mode.o +wakeup-objs += copy.o bioscall.o regs.o +# The link order of the video-*.o modules can matter. In particular, +# video-vga.o *must* be listed first, followed by video-vesa.o. 
+# Hardware-specific drivers should follow in the order they should be +# probed, and video-bios.o should typically be last. +wakeup-objs += video-vga.o +wakeup-objs += video-vesa.o +wakeup-objs += video-bios.o targets += $(realmode-y) -$(obj)/wakeup/wakeup.o: FORCE - $(Q)$(MAKE) $(build)=$(obj)/wakeup $@ - REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' @@ -55,7 +60,8 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. -KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \ +KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ + -I$(srctree)/arch/x86/boot \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S new file mode 100644 index 000000000000..16162d197918 --- /dev/null +++ b/arch/x86/realmode/rm/bioscall.S @@ -0,0 +1 @@ +#include "../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S new file mode 100644 index 000000000000..b785e6f38fdd --- /dev/null +++ b/arch/x86/realmode/rm/copy.S @@ -0,0 +1 @@ +#include "../../boot/copy.S" diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c new file mode 100644 index 000000000000..fbb15b9f9ca9 --- /dev/null +++ b/arch/x86/realmode/rm/regs.c @@ -0,0 +1 @@ +#include "../../boot/regs.c" diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c new file mode 100644 index 000000000000..848b25aaf11b --- /dev/null +++ b/arch/x86/realmode/rm/video-bios.c @@ -0,0 +1 @@ +#include "../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c new file mode 100644 index 000000000000..2a98b7e2368b --- /dev/null +++ b/arch/x86/realmode/rm/video-mode.c @@ -0,0 +1 @@ +#include "../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/video-vesa.c b/arch/x86/realmode/rm/video-vesa.c new file mode 100644 index 000000000000..413edddb51e5 --- /dev/null +++ b/arch/x86/realmode/rm/video-vesa.c @@ -0,0 +1 @@ +#include "../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c new file mode 100644 index 000000000000..3085f5c9d288 --- /dev/null +++ b/arch/x86/realmode/rm/video-vga.c @@ -0,0 +1 @@ +#include "../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c new file mode 100644 index 000000000000..91405d515ec6 --- /dev/null +++ b/arch/x86/realmode/rm/wakemain.c @@ -0,0 +1,82 @@ +#include "wakeup.h" +#include "boot.h" + +static void udelay(int loops) +{ + while (loops--) + io_delay(); /* Approximately 1 us */ +} + +static void beep(unsigned int hz) +{ + u8 enable; + + if (!hz) { + enable = 0x00; /* Turn off speaker */ + } else { + u16 div = 1193181/hz; + + outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ + io_delay(); + outb(div, 0x42); /* LSB of counter */ + io_delay(); + outb(div >> 8, 0x42); /* MSB of counter */ + io_delay(); + + enable = 0x03; /* Turn on speaker */ + } + inb(0x61); /* Dummy read of System Control Port B */ + io_delay(); + outb(enable, 0x61); /* Enable timer 2 output to speaker */ + io_delay(); +} + +#define DOT_HZ 880 +#define DASH_HZ 587 +#define US_PER_DOT 125000 + +/* Okay, this is totally silly, but it's kind of fun. 
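 *
 * (A worked example of the beep() arithmetic above: the PIT input
 * clock is roughly 1193181 Hz, so a DOT_HZ tone of 880 Hz gives
 * div = 1193181 / 880 = 1355 = 0x54b; 0x4b goes out port 0x42 as the
 * LSB, 0x05 as the MSB, and setting bits 0-1 of port 0x61 gates
 * timer 2 through to the speaker.)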
*/ +static void send_morse(const char *pattern) +{ + char s; + + while ((s = *pattern++)) { + switch (s) { + case '.': + beep(DOT_HZ); + udelay(US_PER_DOT); + beep(0); + udelay(US_PER_DOT); + break; + case '-': + beep(DASH_HZ); + udelay(US_PER_DOT * 3); + beep(0); + udelay(US_PER_DOT); + break; + default: /* Assume it's a space */ + udelay(US_PER_DOT * 3); + break; + } + } +} + +void main(void) +{ + /* Kill machine if structures are wrong */ + if (wakeup_header.real_magic != 0x12345678) + while (1) + ; + + if (wakeup_header.realmode_flags & 4) + send_morse("...-"); + + if (wakeup_header.realmode_flags & 1) + asm volatile("lcallw $0xc000,$3"); + + if (wakeup_header.realmode_flags & 2) { + /* Need to call BIOS */ + probe_cards(0); + set_mode(wakeup_header.video_mode); + } +} diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h new file mode 100644 index 000000000000..2dfaf06b8af1 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup.h @@ -0,0 +1,41 @@ +/* + * Definitions for the wakeup data structure at the head of the + * wakeup code. + */ + +#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H +#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H + +#ifndef __ASSEMBLY__ +#include + +/* This must match data at wakeup.S */ +struct wakeup_header { + u16 video_mode; /* Video mode number */ + u32 pmode_entry; /* Protected mode resume point, 32-bit only */ + u16 pmode_cs; + u32 pmode_cr0; /* Protected mode cr0 */ + u32 pmode_cr3; /* Protected mode cr3 */ + u32 pmode_cr4; /* Protected mode cr4 */ + u32 pmode_efer_low; /* Protected mode EFER */ + u32 pmode_efer_high; + u64 pmode_gdt; + u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ + u32 pmode_misc_en_high; + u32 pmode_behavior; /* Wakeup routine behavior flags */ + u32 realmode_flags; + u32 real_magic; + u32 signature; /* To check we have correct structure */ +} __attribute__((__packed__)); + +extern struct wakeup_header wakeup_header; +#endif + +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 + +/* Wakeup behavior bits */ +#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 + +#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore deleted file mode 100644 index 58f1f48a58f8..000000000000 --- a/arch/x86/realmode/rm/wakeup/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -wakeup.bin -wakeup.elf -wakeup.lds diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile deleted file mode 100644 index 4c8533240cdd..000000000000 --- a/arch/x86/realmode/rm/wakeup/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# -# arch/x86/kernel/acpi/realmode/Makefile -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -always := wakeup.o - -wakeup-y += wakeup_asm.o wakemain.o video-mode.o -wakeup-y += copy.o bioscall.o regs.o - -# The link order of the video-*.o modules can matter. In particular, -# video-vga.o *must* be listed first, followed by video-vesa.o. -# Hardware-specific drivers should follow in the order they should be -# probed, and video-bios.o should typically be last. 
-wakeup-y += video-vga.o -wakeup-y += video-vesa.o -wakeup-y += video-bios.o - -targets += $(wakeup-y) - -WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) - -LDFLAGS_wakeup.o := -m elf_i386 -r -$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE - $(call if_changed,ld) - -bootsrc := $(src)/../../../boot - -ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) -asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S deleted file mode 100644 index f51eb0bb56ce..000000000000 --- a/arch/x86/realmode/rm/wakeup/bioscall.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S deleted file mode 100644 index dc59ebee69d8..000000000000 --- a/arch/x86/realmode/rm/wakeup/copy.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/copy.S" diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c deleted file mode 100644 index 6206033ba202..000000000000 --- a/arch/x86/realmode/rm/wakeup/regs.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/regs.c" diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c deleted file mode 100644 index 7deabc144a27..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-bios.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c deleted file mode 100644 index 328ad209f113..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-mode.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c deleted file mode 100644 index 9dbb9672226a..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-vesa.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c b/arch/x86/realmode/rm/wakeup/video-vga.c deleted file mode 100644 index bcc81255f374..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-vga.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c deleted file mode 100644 index 91405d515ec6..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakemain.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "wakeup.h" -#include "boot.h" - -static void udelay(int loops) -{ - while (loops--) - io_delay(); /* Approximately 1 us */ -} - -static void beep(unsigned int hz) -{ - u8 enable; - - if (!hz) { - enable = 0x00; /* Turn off speaker */ - } else { - u16 div = 1193181/hz; - - outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ - io_delay(); - outb(div, 0x42); /* LSB of counter */ - io_delay(); - outb(div >> 8, 0x42); /* MSB of counter */ - io_delay(); - - enable = 0x03; /* Turn on speaker */ - } - inb(0x61); /* Dummy read of System Control Port B */ - io_delay(); - outb(enable, 0x61); /* Enable timer 2 output to speaker */ - io_delay(); -} - -#define DOT_HZ 880 -#define DASH_HZ 587 -#define US_PER_DOT 125000 - -/* Okay, this is totally silly, but it's kind of fun. 
*/ -static void send_morse(const char *pattern) -{ - char s; - - while ((s = *pattern++)) { - switch (s) { - case '.': - beep(DOT_HZ); - udelay(US_PER_DOT); - beep(0); - udelay(US_PER_DOT); - break; - case '-': - beep(DASH_HZ); - udelay(US_PER_DOT * 3); - beep(0); - udelay(US_PER_DOT); - break; - default: /* Assume it's a space */ - udelay(US_PER_DOT * 3); - break; - } - } -} - -void main(void) -{ - /* Kill machine if structures are wrong */ - if (wakeup_header.real_magic != 0x12345678) - while (1) - ; - - if (wakeup_header.realmode_flags & 4) - send_morse("...-"); - - if (wakeup_header.realmode_flags & 1) - asm volatile("lcallw $0xc000,$3"); - - if (wakeup_header.realmode_flags & 2) { - /* Need to call BIOS */ - probe_cards(0); - set_mode(wakeup_header.video_mode); - } -} diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h deleted file mode 100644 index 2dfaf06b8af1..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakeup.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Definitions for the wakeup data structure at the head of the - * wakeup code. - */ - -#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H - -#ifndef __ASSEMBLY__ -#include - -/* This must match data at wakeup.S */ -struct wakeup_header { - u16 video_mode; /* Video mode number */ - u32 pmode_entry; /* Protected mode resume point, 32-bit only */ - u16 pmode_cs; - u32 pmode_cr0; /* Protected mode cr0 */ - u32 pmode_cr3; /* Protected mode cr3 */ - u32 pmode_cr4; /* Protected mode cr4 */ - u32 pmode_efer_low; /* Protected mode EFER */ - u32 pmode_efer_high; - u64 pmode_gdt; - u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ - u32 pmode_misc_en_high; - u32 pmode_behavior; /* Wakeup routine behavior flags */ - u32 realmode_flags; - u32 real_magic; - u32 signature; /* To check we have correct structure */ -} __attribute__((__packed__)); - -extern struct wakeup_header wakeup_header; -#endif - -#define WAKEUP_HEADER_OFFSET 8 -#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 - -/* Wakeup behavior bits */ -#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 - -#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S deleted file mode 100644 index f81c1cd99eaf..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ /dev/null @@ -1,181 +0,0 @@ -/* - * ACPI wakeup real mode startup stub - */ -#include -#include -#include -#include -#include -#include -#include "../realmode.h" -#include "wakeup.h" - - .code16 - -/* This should match the structure in wakeup.h */ - .section ".data", "aw" - - .balign 16 -GLOBAL(wakeup_header) - video_mode: .short 0 /* Video mode number */ - pmode_entry: .long 0 - pmode_cs: .short __KERNEL_CS - pmode_cr0: .long 0 /* Saved %cr0 */ - pmode_cr3: .long 0 /* Saved %cr3 */ - pmode_cr4: .long 0 /* Saved %cr4 */ - pmode_efer: .quad 0 /* Saved EFER */ - pmode_gdt: .quad 0 - pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ - pmode_behavior: .long 0 /* Wakeup behavior flags */ - realmode_flags: .long 0 - real_magic: .long 0 - signature: .long WAKEUP_HEADER_SIGNATURE -END(wakeup_header) - - .text - .code16 - - .balign 16 -ENTRY(wakeup_start) - cli - cld - - LJMPW_RM(3f) -3: - /* Apparently some dimwit BIOS programmers don't know how to - program a PM to RM transition, and we might end up here with - junk in the data segment descriptor registers. The only way - to repair that is to go into PM and fix it ourselves... 
*/ - movw $16, %cx - lgdtl %cs:wakeup_gdt - movl %cr0, %eax - orb $X86_CR0_PE, %al - movl %eax, %cr0 - ljmpw $8, $2f -2: - movw %cx, %ds - movw %cx, %es - movw %cx, %ss - movw %cx, %fs - movw %cx, %gs - - andb $~X86_CR0_PE, %al - movl %eax, %cr0 - LJMPW_RM(3f) -3: - /* Set up segments */ - movw %cs, %ax - movw %ax, %ss - movl $rm_stack_end, %esp - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - - lidtl wakeup_idt - - /* Clear the EFLAGS */ - pushl $0 - popfl - - /* Check header signature... */ - movl signature, %eax - cmpl $WAKEUP_HEADER_SIGNATURE, %eax - jne bogus_real_magic - - /* Check we really have everything... */ - movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax - jne bogus_real_magic - - /* Call the C code */ - calll main - - /* Restore MISC_ENABLE before entering protected mode, in case - BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax - jnc 1f - - movl pmode_misc_en, %eax - movl pmode_misc_en + 4, %edx - movl $MSR_IA32_MISC_ENABLE, %ecx - wrmsr -1: - - /* Do any other stuff... */ - -#ifndef CONFIG_64BIT - /* This could also be done in C code... */ - movl pmode_cr3, %eax - movl %eax, %cr3 - - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 -1: - movl pmode_efer, %eax - movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f - movl $MSR_EFER, %ecx - wrmsr -1: - - lgdtl pmode_gdt - - /* This really couldn't... */ - movl pmode_entry, %eax - movl pmode_cr0, %ecx - movl %ecx, %cr0 - ljmpl $__KERNEL_CS, $pa_startup_32 - /* -> jmp *%eax in trampoline_32.S */ -#else - jmp trampoline_data -#endif - -bogus_real_magic: -1: - hlt - jmp 1b - - .section ".rodata","a" - - /* - * Set up the wakeup GDT. We set these up as Big Real Mode, - * that is, with limits set to 4 GB. At least the Lenovo - * Thinkpad X61 is known to need this for the video BIOS - * initialization quirk to work; this is likely to also - * be the case for other laptops or integrated video devices. 
- */ - - .balign 16 -GLOBAL(wakeup_gdt) - .word 3*8-1 /* Self-descriptor */ - .long pa_wakeup_gdt - .word 0 - - .word 0xffff /* 16-bit code segment @ real_mode_base */ - .long 0x9b000000 + pa_real_mode_base - .word 0x008f /* big real mode */ - - .word 0xffff /* 16-bit data segment @ real_mode_base */ - .long 0x93000000 + pa_real_mode_base - .word 0x008f /* big real mode */ -END(wakeup_gdt) - - .section ".rodata","a" - .balign 8 - - /* This is the standard real-mode IDT */ - .balign 16 -GLOBAL(wakeup_idt) - .word 0xffff /* limit */ - .long 0 /* address */ - .word 0 -END(wakeup_idt) - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S new file mode 100644 index 000000000000..8a57c5a05fbc --- /dev/null +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -0,0 +1,181 @@ +/* + * ACPI wakeup real mode startup stub + */ +#include +#include +#include +#include +#include +#include +#include "realmode.h" +#include "wakeup.h" + + .code16 + +/* This should match the structure in wakeup.h */ + .section ".data", "aw" + + .balign 16 +GLOBAL(wakeup_header) + video_mode: .short 0 /* Video mode number */ + pmode_entry: .long 0 + pmode_cs: .short __KERNEL_CS + pmode_cr0: .long 0 /* Saved %cr0 */ + pmode_cr3: .long 0 /* Saved %cr3 */ + pmode_cr4: .long 0 /* Saved %cr4 */ + pmode_efer: .quad 0 /* Saved EFER */ + pmode_gdt: .quad 0 + pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ + pmode_behavior: .long 0 /* Wakeup behavior flags */ + realmode_flags: .long 0 + real_magic: .long 0 + signature: .long WAKEUP_HEADER_SIGNATURE +END(wakeup_header) + + .text + .code16 + + .balign 16 +ENTRY(wakeup_start) + cli + cld + + LJMPW_RM(3f) +3: + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + LJMPW_RM(3f) +3: + /* Set up segments */ + movw %cs, %ax + movw %ax, %ss + movl $rm_stack_end, %esp + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + lidtl wakeup_idt + + /* Clear the EFLAGS */ + pushl $0 + popfl + + /* Check header signature... */ + movl signature, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax + jne bogus_real_magic + + /* Check we really have everything... */ + movl end_signature, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax + jne bogus_real_magic + + /* Call the C code */ + calll main + + /* Restore MISC_ENABLE before entering protected mode, in case + BIOS decided to clear XD_DISABLE during S3. */ + movl pmode_behavior, %eax + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + jnc 1f + + movl pmode_misc_en, %eax + movl pmode_misc_en + 4, %edx + movl $MSR_IA32_MISC_ENABLE, %ecx + wrmsr +1: + + /* Do any other stuff... */ + +#ifndef CONFIG_64BIT + /* This could also be done in C code... */ + movl pmode_cr3, %eax + movl %eax, %cr3 + + movl pmode_cr4, %ecx + jecxz 1f + movl %ecx, %cr4 +1: + movl pmode_efer, %eax + movl pmode_efer + 4, %edx + movl %eax, %ecx + orl %edx, %ecx + jz 1f + movl $MSR_EFER, %ecx + wrmsr +1: + + lgdtl pmode_gdt + + /* This really couldn't... 
*/ + movl pmode_entry, %eax + movl pmode_cr0, %ecx + movl %ecx, %cr0 + ljmpl $__KERNEL_CS, $pa_startup_32 + /* -> jmp *%eax in trampoline_32.S */ +#else + jmp trampoline_data +#endif + +bogus_real_magic: +1: + hlt + jmp 1b + + .section ".rodata","a" + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + + .balign 16 +GLOBAL(wakeup_gdt) + .word 3*8-1 /* Self-descriptor */ + .long pa_wakeup_gdt + .word 0 + + .word 0xffff /* 16-bit code segment @ real_mode_base */ + .long 0x9b000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + + .word 0xffff /* 16-bit data segment @ real_mode_base */ + .long 0x93000000 + pa_real_mode_base + .word 0x008f /* big real mode */ +END(wakeup_gdt) + + .section ".rodata","a" + .balign 8 + + /* This is the standard real-mode IDT */ + .balign 16 +GLOBAL(wakeup_idt) + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 +END(wakeup_idt) + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE -- cgit v1.2.1 From f37240f16bec91f15ce564515f70a6ca9715ce96 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:43 +0300 Subject: x86, realmode: header for trampoline code Added header for trampoline code that can be used to supply input data to it. This makes interface between real mode code and kernel cleaner and simpler. Replaced two confusing pointers to level4 pgt in trampoline_64.S with a single pointer to the beginning of the page table. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-21-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/realmode.h | 32 +++++++++++++++++----------- arch/x86/kernel/realmode.c | 27 +++++++++++++----------- arch/x86/kernel/smpboot.c | 2 +- arch/x86/realmode/rm/header.S | 35 ++++++++++++++----------------- arch/x86/realmode/rm/trampoline_32.S | 36 ++++++-------------------------- arch/x86/realmode/rm/trampoline_64.S | 18 +++++----------- arch/x86/realmode/rm/trampoline_common.S | 23 ++++++++++++++++++++ arch/x86/realmode/rm/wakeup_asm.S | 2 +- 8 files changed, 87 insertions(+), 88 deletions(-) create mode 100644 arch/x86/realmode/rm/trampoline_common.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index d3ae49f4c3ef..1421eed1c8e8 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -8,24 +8,32 @@ struct real_mode_header { u32 text_start; u32 ro_end; - /* reboot */ -#ifdef CONFIG_X86_32 - u32 machine_real_restart_asm; -#endif /* SMP trampoline */ - u32 trampoline_data; + u32 trampoline_start; u32 trampoline_status; -#ifdef CONFIG_X86_32 - u32 startup_32_smp; - u32 boot_gdt; -#else - u32 startup_64_smp; - u32 level3_ident_pgt; - u32 level3_kernel_pgt; + u32 trampoline_header; +#ifdef CONFIG_X86_64 + u32 trampoline_pgd; #endif + /* ACPI S3 wakeup */ #ifdef CONFIG_ACPI_SLEEP u32 wakeup_start; u32 wakeup_header; +#endif + /* APM/BIOS reboot */ +#ifdef CONFIG_X86_32 + u32 machine_real_restart_asm; +#endif +} __attribute__((__packed__)); + +/* This must match data at trampoline_32/64.S */ +struct trampoline_header { +#ifdef CONFIG_X86_32 + u32 start; + u16 gdt_limit; + u32 gdt_base; +#else + u64 start; #endif } __attribute__((__packed__)); diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 632c810ec8ea..712fba8fd774 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -17,8 +17,11 @@ void __init setup_real_mode(void) u16 *seg; int i; unsigned char *base; - + struct trampoline_header *trampoline_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); +#ifdef CONFIG_X86_64 + u64 *trampoline_pgd; +#endif /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); @@ -28,7 +31,6 @@ void __init setup_real_mode(void) base = __va(mem); memblock_reserve(mem, size); real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", base, (unsigned long long)mem, size); @@ -53,18 +55,19 @@ void __init setup_real_mode(void) *ptr += __pa(base); } + /* Must be perfomed *after* relocation. 
*/ + trampoline_header = (struct trampoline_header *) + __va(real_mode_header->trampoline_header); + #ifdef CONFIG_X86_32 - *((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp); - *((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt); + trampoline_header->start = __pa(startup_32_smp); + trampoline_header->gdt_limit = __BOOT_DS + 7; + trampoline_header->gdt_base = __pa(boot_gdt); #else - *((u64 *) __va(real_mode_header->startup_64_smp)) = - (u64)secondary_startup_64; - - *((u64 *) __va(real_mode_header->level3_ident_pgt)) = - __pa(level3_ident_pgt) + _KERNPG_TABLE; - - *((u64 *) __va(real_mode_header->level3_kernel_pgt)) = - __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_header->start = (u64) secondary_startup_64; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); + trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b8c0661e2341..757c4b1d0a02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -667,7 +667,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ - unsigned long start_ip = real_mode_header->trampoline_data; + unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; int timeout; diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index c83005c4d455..b4c32632bf16 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -7,28 +7,25 @@ #include #include - .section ".header", "a" + .section ".header", "a" GLOBAL(real_mode_header) - .long pa_text_start - .long pa_ro_end -#ifdef CONFIG_X86_32 - .long pa_machine_real_restart_asm -#endif - /* SMP trampoline */ - .long pa_trampoline_data - .long pa_trampoline_status -#ifdef CONFIG_X86_32 - .long pa_startup_32_smp - .long pa_boot_gdt -#else - .long pa_startup_64_smp - .long pa_level3_ident_pgt - .long pa_level3_kernel_pgt + .long pa_text_start + .long pa_ro_end + /* SMP trampoline */ + .long pa_trampoline_start + .long pa_trampoline_status + .long pa_trampoline_header +#ifdef CONFIG_X86_64 + .long pa_trampoline_pgd; #endif - /* ACPI sleep */ + /* ACPI S3 wakeup */ #ifdef CONFIG_ACPI_SLEEP - .long pa_wakeup_start - .long pa_wakeup_header + .long pa_wakeup_start + .long pa_wakeup_header +#endif + /* APM/BIOS reboot */ +#ifdef CONFIG_X86_32 + .long pa_machine_real_restart_asm #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1ecdbb59191b..6fc064b4d2b9 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -13,16 +13,10 @@ * * We jump into arch/x86/kernel/head_32.S. * - * On entry to trampoline_data, the processor is in real mode + * On entry to trampoline_start, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. Thus, we load CS to the physical segment * of the real mode code before doing anything further. 
- * - * The structure real_mode_header includes entries that need - * to be set up before executing this code: - * - * startup_32_smp - * boot_gdt */ #include @@ -35,7 +29,7 @@ .code16 .balign PAGE_SIZE -ENTRY(trampoline_data) +ENTRY(trampoline_start) wbinvd # Needed for NUMA-Q should be harmless for others LJMPW_RM(1f) @@ -45,7 +39,7 @@ ENTRY(trampoline_data) cli # We should be safe anyway - movl startup_32_smp, %eax # where we need to go + movl tr_start, %eax # where we need to go movl $0xA5A5A5A5, trampoline_status # write marker for master knows we're running @@ -56,8 +50,8 @@ ENTRY(trampoline_data) * operand size is 16bit. Use lgdtl instead to force operand size * to 32 bit. */ - lidtl boot_idt_descr # load idt with 0, 0 - lgdtl boot_gdt_descr # load gdt with whatever is appropriate + lidtl tr_idt # load idt with 0, 0 + lgdtl tr_gdt # load gdt with whatever is appropriate movw $1, %dx # protected mode (PE) bit lmsw %dx # into protected mode @@ -69,22 +63,4 @@ ENTRY(trampoline_data) ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax - .section ".rodata","a" - - .balign 4 -boot_idt_descr: - .word 0 # idt limit = 0 - .long 0 # idt base = 0L - - .data - -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit -GLOBAL(boot_gdt) - .long 0 # gdt base - - .bss - - .balign 4 -GLOBAL(trampoline_status) .space 4 -GLOBAL(startup_32_smp) .space 4 +#include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index f71ea0800d3d..3f7293239365 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -10,7 +10,7 @@ * trampoline page to make our stack and everything else * is a mystery. * - * On entry to trampoline_data, the processor is in real mode + * On entry to trampoline_start, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. 
@@ -37,7 +37,7 @@ .balign PAGE_SIZE .code16 -ENTRY(trampoline_data) +ENTRY(trampoline_start) cli # We should be safe anyway wbinvd
@@ -97,7 +97,7 @@ ENTRY(startup_32) movl %eax, %cr4 # Enable PAE mode # Setup trampoline 4 level pagetables - movl $pa_level3_ident_pgt, %eax + movl $pa_trampoline_pgd, %eax movl %eax, %cr3 movl $MSR_EFER, %ecx
@@ -122,7 +122,7 @@ ENTRY(startup_32) .balign 4 ENTRY(startup_64) # Now jump into the kernel using virtual addresses - jmpq *startup_64_smp(%rip) + jmpq *tr_start(%rip) .section ".rodata","a" .balign 16
@@ -143,12 +143,4 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .bss - - .balign PAGE_SIZE -GLOBAL(level3_ident_pgt) .space 511*8 -GLOBAL(level3_kernel_pgt) .space 8 - - .balign 8 -GLOBAL(startup_64_smp) .space 8 -GLOBAL(trampoline_status) .space 4 +#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
new file mode 100644
index 000000000000..c3f951c468c5
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -0,0 +1,23 @@ + .section ".rodata","a" + + .balign 4 +tr_idt: .fill 1, 6, 0 + + .bss + + .balign 4 +GLOBAL(trampoline_status) .space 4 + +GLOBAL(trampoline_header) +#ifdef CONFIG_X86_32 + tr_start: .space 4 + tr_gdt: .space 6 +#else + tr_start: .space 8 +#endif +END(trampoline_header)
diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S
index 8a57c5a05fbc..46108f05e04e 100644
--- a/arch/x86/realmode/rm/wakeup_asm.S
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -132,7 +132,7 @@ ENTRY(wakeup_start) ljmpl $__KERNEL_CS, $pa_startup_32 /* -> jmp *%eax in trampoline_32.S */ #else - jmp trampoline_data + jmp trampoline_start #endif bogus_real_magic:
-- cgit v1.2.1
From f2604c141a00c00b92b7fd2f9d2455517fdd6c15 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen
Date: Tue, 8 May 2012 21:22:44 +0300
Subject: x86, realmode: move relocs from scripts/ to arch/x86/tools

Moved the relocs tool from scripts/ to arch/x86/tools because it is an architecture-specific script. Added a new target, archscripts, that can be used to build the scripts needed for building an architecture.

Signed-off-by: Jarkko Sakkinen
Link: http://lkml.kernel.org/r/1336501366-28617-22-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H.
Peter Anvin Cc: Sam Ravnborg Cc: Michal Marek --- arch/x86/Makefile | 3 + arch/x86/boot/compressed/Makefile | 4 +- arch/x86/realmode/rm/Makefile | 2 +- arch/x86/tools/.gitignore | 1 + arch/x86/tools/Makefile | 4 + arch/x86/tools/relocs.c | 804 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 815 insertions(+), 3 deletions(-) create mode 100644 arch/x86/tools/.gitignore create mode 100644 arch/x86/tools/relocs.c (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237606a3..94e91e401da9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -134,6 +134,9 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +archscripts: + $(Q)$(MAKE) $(build)=arch/x86/tools relocs + ### # Syscall table generation diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0435e8a2d20e..e398bb5d63bb 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -42,8 +42,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = scripts/x86-relocs -quiet_cmd_relocs = RELOCS $@ +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< $(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index fc8854b09dfa..de40bc44b92f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -52,7 +52,7 @@ $(obj)/realmode.bin: $(obj)/realmode.elf $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ - cmd_relocs = scripts/x86-relocs --realmode $< > $@ + cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE $(call if_changed,relocs) diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore new file mode 100644 index 000000000000..be0ed065249b --- /dev/null +++ b/arch/x86/tools/.gitignore @@ -0,0 +1 @@ +relocs diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index d511aa97533a..733057b435b0 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -36,3 +36,7 @@ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-y += relocs +relocs: $(obj)/relocs diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c new file mode 100644 index 000000000000..74e16bb15dc4 --- /dev/null +++ b/arch/x86/tools/relocs.c @@ -0,0 +1,804 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define USE_BSD +#include +#include +#include + +static void die(char *fmt, ...); + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +static Elf32_Ehdr ehdr; +static unsigned long reloc_count, reloc_idx; +static unsigned long *relocs; +static unsigned long reloc16_count, reloc16_idx; +static unsigned long *relocs16; + +struct section { + Elf32_Shdr shdr; + struct section *link; + Elf32_Sym *symtab; 
+ Elf32_Rel *reltab; + char *strtab; +}; +static struct section *secs; + +enum symtype { + S_ABS, + S_REL, + S_SEG, + S_LIN, + S_NSYMTYPES +}; + +static const char * const sym_regex_kernel[S_NSYMTYPES] = { +/* + * Following symbols have been audited. There values are constant and do + * not change if bzImage is loaded at a different physical address than + * the address for which it has been compiled. Don't warn user about + * absolute relocations present w.r.t these symbols. + */ + [S_ABS] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)", + +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^_end$", +}; + + +static const char * const sym_regex_realmode[S_NSYMTYPES] = { +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^pa_", + +/* + * These are 16-bit segment symbols when compiling 16-bit code. + */ + [S_SEG] = + "^real_mode_seg$", + +/* + * These are offsets belonging to segments, as opposed to linear addresses, + * when compiling 16-bit code. + */ + [S_LIN] = + "^pa_", +}; + +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; +static int is_reloc(enum symtype type, const char *sym_name) +{ + return sym_regex[type] && + !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); +} + +static void regex_init(int use_real_mode) +{ + char errbuf[128]; + int err; + int i; + + if (use_real_mode) + sym_regex = sym_regex_realmode; + else + sym_regex = sym_regex_kernel; + + for (i = 0; i < S_NSYMTYPES; i++) { + if (!sym_regex[i]) + continue; + + err = regcomp(&sym_regex_c[i], sym_regex[i], + REG_EXTENDED|REG_NOSUB); + + if (err) { + regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + die("%s", errbuf); + } + } +} + +static void die(char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static const char *sym_type(unsigned type) +{ + static const char *type_name[] = { +#define SYM_TYPE(X) [X] = #X + SYM_TYPE(STT_NOTYPE), + SYM_TYPE(STT_OBJECT), + SYM_TYPE(STT_FUNC), + SYM_TYPE(STT_SECTION), + SYM_TYPE(STT_FILE), + SYM_TYPE(STT_COMMON), + SYM_TYPE(STT_TLS), +#undef SYM_TYPE + }; + const char *name = "unknown sym type name"; + if (type < ARRAY_SIZE(type_name)) { + name = type_name[type]; + } + return name; +} + +static const char *sym_bind(unsigned bind) +{ + static const char *bind_name[] = { +#define SYM_BIND(X) [X] = #X + SYM_BIND(STB_LOCAL), + SYM_BIND(STB_GLOBAL), + SYM_BIND(STB_WEAK), +#undef SYM_BIND + }; + const char *name = "unknown sym bind name"; + if (bind < ARRAY_SIZE(bind_name)) { + name = bind_name[bind]; + } + return name; +} + +static const char *sym_visibility(unsigned visibility) +{ + static const char *visibility_name[] = { +#define SYM_VISIBILITY(X) [X] = #X + SYM_VISIBILITY(STV_DEFAULT), + SYM_VISIBILITY(STV_INTERNAL), + SYM_VISIBILITY(STV_HIDDEN), + SYM_VISIBILITY(STV_PROTECTED), +#undef SYM_VISIBILITY + }; + const char *name = "unknown sym visibility name"; + if (visibility < ARRAY_SIZE(visibility_name)) { + name = visibility_name[visibility]; + } + return name; +} + +static const char *rel_type(unsigned type) +{ + static const char *type_name[] = { +#define REL_TYPE(X) [X] = #X + REL_TYPE(R_386_NONE), + REL_TYPE(R_386_32), + REL_TYPE(R_386_PC32), + REL_TYPE(R_386_GOT32), + REL_TYPE(R_386_PLT32), + REL_TYPE(R_386_COPY), + REL_TYPE(R_386_GLOB_DAT), + REL_TYPE(R_386_JMP_SLOT), + REL_TYPE(R_386_RELATIVE), + REL_TYPE(R_386_GOTOFF), + REL_TYPE(R_386_GOTPC), + REL_TYPE(R_386_8), + REL_TYPE(R_386_PC8), + REL_TYPE(R_386_16), + REL_TYPE(R_386_PC16), +#undef REL_TYPE + }; + const char *name = "unknown type rel type name"; + if (type < ARRAY_SIZE(type_name) && type_name[type]) { + name = type_name[type]; + } + return name; +} + +static const char *sec_name(unsigned shndx) +{ + const char *sec_strtab; + const char *name; + sec_strtab = secs[ehdr.e_shstrndx].strtab; + name = ""; + if (shndx < ehdr.e_shnum) { + name = sec_strtab + secs[shndx].shdr.sh_name; + } + else if (shndx == SHN_ABS) { + name = "ABSOLUTE"; + } + else if (shndx == SHN_COMMON) { + name = "COMMON"; + } + return name; +} + +static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) +{ + const char *name; + name = ""; + if (sym->st_name) { + name = sym_strtab + sym->st_name; + } + else { + name = sec_name(sym->st_shndx); + } + return name; +} + + + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#endif +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + return le16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + return le32_to_cpu(val); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { + die("Cannot read ELF header: %s\n", + strerror(errno)); + } + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { + die("No ELF magic\n"); + } + if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { + die("Not a 32 bit executable\n"); + } + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + die("Not a LSB ELF executable\n"); + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + die("Unknown ELF version\n"); + } + /* Convert the fields to native endian */ + ehdr.e_type = elf16_to_cpu(ehdr.e_type); + 
ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); + ehdr.e_version = elf32_to_cpu(ehdr.e_version); + ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + die("Unsupported ELF header type\n"); + } + if (ehdr.e_machine != EM_386) { + die("Not for x86\n"); + } + if (ehdr.e_version != EV_CURRENT) { + die("Unknown ELF version\n"); + } + if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { + die("Bad Elf header size\n"); + } + if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { + die("Bad program header entry\n"); + } + if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { + die("Bad section header entry\n"); + } + if (ehdr.e_shstrndx >= ehdr.e_shnum) { + die("String table index out of bounds\n"); + } +} + +static void read_shdrs(FILE *fp) +{ + int i; + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); + if (!secs) { + die("Unable to allocate %d section headers\n", + ehdr.e_shnum); + } + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + ehdr.e_shoff, strerror(errno)); + } + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (fread(&shdr, sizeof shdr, 1, fp) != 1) + die("Cannot read ELF section headers %d/%d: %s\n", + i, ehdr.e_shnum, strerror(errno)); + sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); + if (sec->shdr.sh_link < ehdr.e_shnum) + sec->link = &secs[sec->shdr.sh_link]; + } + +} + +static void read_strtabs(FILE *fp) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_STRTAB) { + continue; + } + sec->strtab = malloc(sec->shdr.sh_size); + if (!sec->strtab) { + die("malloc of %d bytes for strtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + } +} + +static void read_symtabs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { + die("malloc of %d bytes for symtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < 
sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym = &sec->symtab[j]; + sym->st_name = elf32_to_cpu(sym->st_name); + sym->st_value = elf32_to_cpu(sym->st_value); + sym->st_size = elf32_to_cpu(sym->st_size); + sym->st_shndx = elf16_to_cpu(sym->st_shndx); + } + } +} + + +static void read_relocs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) { + die("malloc of %d bytes for relocs failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf32_to_cpu(rel->r_offset); + rel->r_info = elf32_to_cpu(rel->r_info); + } + } +} + + +static void print_absolute_symbols(void) +{ + int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + char *sym_strtab; + int j; + + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym; + const char *name; + sym = &sec->symtab[j]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%5d %08x %5d %10s %10s %12s %s\n", + j, sym->st_value, sym->st_size, + sym_type(ELF32_ST_TYPE(sym->st_info)), + sym_bind(ELF32_ST_BIND(sym->st_info)), + sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), + name); + } + } + printf("\n"); +} + +static void print_absolute_relocs(void) +{ + int i, printed = 0; + + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + struct section *sec_applies, *sec_symtab; + char *sym_strtab; + Elf32_Sym *sh_symtab; + int j; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + const char *name; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + + /* Absolute symbols are not relocated if bzImage is + * loaded at a non-compiled address. Display a warning + * to user at compile time about the absolute + * relocations present. + * + * User need to audit the code to make sure + * some symbols which should have been section + * relative have not become absolute because of some + * linker optimization or wrong programming usage. + * + * Before warning check if this absolute symbol + * relocation is harmless. 
+ */ + if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) + continue; + + if (!printed) { + printf("WARNING: Absolute relocations" + " present\n"); + printf("Offset Info Type Sym.Value " + "Sym.Name\n"); + printed = 1; + } + + printf("%08x %08x %10s %08x %s\n", + rel->r_offset, + rel->r_info, + rel_type(ELF32_R_TYPE(rel->r_info)), + sym->st_value, + name); + } + } + + if (printed) + printf("\n"); +} + +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), + int use_real_mode) +{ + int i; + /* Walk through the relocations */ + for (i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + struct section *sec_applies, *sec_symtab; + int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + unsigned r_type; + const char *symname; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + r_type = ELF32_R_TYPE(rel->r_info); + + switch (r_type) { + case R_386_NONE: + case R_386_PC32: + case R_386_PC16: + case R_386_PC8: + /* + * NONE can be ignored and and PC relative + * relocations don't need to be adjusted. + */ + break; + + case R_386_16: + symname = sym_name(sym_strtab, sym); + if (!use_real_mode) + goto bad; + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_SEG, symname)) + goto bad; + } else { + if (is_reloc(S_LIN, symname)) + goto bad; + else + break; + } + visit(rel, sym); + break; + + case R_386_32: + symname = sym_name(sym_strtab, sym); + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_REL, symname)) + goto bad; + } else { + if (use_real_mode && + !is_reloc(S_LIN, symname)) + break; + } + visit(rel, sym); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; + bad: + symname = sym_name(sym_strtab, sym); + die("Invalid %s relocation: %s\n", + rel_type(r_type), symname); + } + } + } +} + +static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + reloc16_count++; + else + reloc_count++; +} + +static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + /* Remember the address that needs to be adjusted. */ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + relocs16[reloc16_idx++] = rel->r_offset; + else + relocs[reloc_idx++] = rel->r_offset; +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const unsigned long *a, *b; + a = va; b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; +} + +static int write32(unsigned int v, FILE *f) +{ + unsigned char buf[4]; + + put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode) +{ + int i; + /* Count how many relocations I have and allocate space for them. 
*/ + reloc_count = 0; + walk_relocs(count_reloc, use_real_mode); + relocs = malloc(reloc_count * sizeof(relocs[0])); + if (!relocs) { + die("malloc of %d entries for relocs failed\n", + reloc_count); + } + + relocs16 = malloc(reloc16_count * sizeof(relocs[0])); + if (!relocs16) { + die("malloc of %d entries for relocs16 failed\n", + reloc16_count); + } + /* Collect up the relocations */ + reloc_idx = 0; + walk_relocs(collect_reloc, use_real_mode); + + if (reloc16_count && !use_real_mode) + die("Segment relocations found but --realmode not specified\n"); + + /* Order the relocations for more efficient processing */ + qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); + + /* Print the relocations */ + if (as_text) { + /* Print the relocations in a form suitable that + * gas will like. + */ + printf(".section \".data.reloc\",\"a\"\n"); + printf(".balign 4\n"); + if (use_real_mode) { + printf("\t.long %lu\n", reloc16_count); + for (i = 0; i < reloc16_count; i++) + printf("\t.long 0x%08lx\n", relocs16[i]); + printf("\t.long %lu\n", reloc_count); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } else { + /* Print a stop */ + printf("\t.long 0x%08lx\n", (unsigned long)0); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } + + printf("\n"); + } + else { + if (use_real_mode) { + write32(reloc16_count, stdout); + for (i = 0; i < reloc16_count; i++) + write32(relocs16[i], stdout); + write32(reloc_count, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) + write32(relocs[i], stdout); + } else { + /* Print a stop */ + write32(0, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) { + write32(relocs[i], stdout); + } + } + } +} + +static void usage(void) +{ + die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); +} + +int main(int argc, char **argv) +{ + int show_absolute_syms, show_absolute_relocs; + int as_text, use_real_mode; + const char *fname; + FILE *fp; + int i; + + show_absolute_syms = 0; + show_absolute_relocs = 0; + as_text = 0; + use_real_mode = 0; + fname = NULL; + for (i = 1; i < argc; i++) { + char *arg = argv[i]; + if (*arg == '-') { + if (strcmp(arg, "--abs-syms") == 0) { + show_absolute_syms = 1; + continue; + } + if (strcmp(arg, "--abs-relocs") == 0) { + show_absolute_relocs = 1; + continue; + } + if (strcmp(arg, "--text") == 0) { + as_text = 1; + continue; + } + if (strcmp(arg, "--realmode") == 0) { + use_real_mode = 1; + continue; + } + } + else if (!fname) { + fname = arg; + continue; + } + usage(); + } + if (!fname) { + usage(); + } + regex_init(use_real_mode); + fp = fopen(fname, "r"); + if (!fp) { + die("Cannot open %s: %s\n", + fname, strerror(errno)); + } + read_ehdr(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); + read_relocs(fp); + if (show_absolute_syms) { + print_absolute_symbols(); + return 0; + } + if (show_absolute_relocs) { + print_absolute_relocs(); + return 0; + } + emit_relocs(as_text, use_real_mode); + return 0; +} -- cgit v1.2.1 From bf8b88e97716feb750c3d34492f00d9c085e1183 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:45 +0300 Subject: x86, realmode: fixes compilation issue in tboot.c Fixed include path of wakeup.h in tboot.c. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-23-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/tboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 65adda4fde93..f84fe00fad48 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -44,7 +44,7 @@ #include #include -#include "acpi/realmode/wakeup.h" +#include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; -- cgit v1.2.1 From cda846f101fb1396b6924f1d9b68ac3d42de5403 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:46 +0300 Subject: x86, realmode: read cr4 and EFER from kernel for 64-bit trampoline This patch changes 64-bit trampoline so that CR4 and EFER are provided by the kernel instead of using fixed values. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 7 ++++++- arch/x86/include/asm/realmode.h | 8 ++++++-- arch/x86/kernel/realmode.c | 8 ++++++++ arch/x86/kernel/setup.c | 2 ++ arch/x86/realmode/rm/header.S | 1 + arch/x86/realmode/rm/trampoline_64.S | 32 +++++++------------------------- arch/x86/realmode/rm/trampoline_common.S | 19 +++++++++++++++++++ 7 files changed, 49 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa7dcceb6c0..404583ccf0cf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -544,13 +544,16 @@ static inline void load_sp0(struct tss_struct *tss, * enable), so that any CPU's that boot up * after us can get the correct flags. */ -extern unsigned long mmu_cr4_features; +extern unsigned long mmu_cr4_features; +extern u32 *trampoline_cr4_features; static inline void set_in_cr4(unsigned long mask) { unsigned long cr4; mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; cr4 = read_cr4(); cr4 |= mask; write_cr4(cr4); @@ -561,6 +564,8 @@ static inline void clear_in_cr4(unsigned long mask) unsigned long cr4; mmu_cr4_features &= ~mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; cr4 = read_cr4(); cr4 &= ~mask; write_cr4(cr4); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 1421eed1c8e8..937dc6071d76 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -24,18 +24,22 @@ struct real_mode_header { #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; #endif -} __attribute__((__packed__)); +}; /* This must match data at trampoline_32/64.S */ struct trampoline_header { #ifdef CONFIG_X86_32 u32 start; + u16 gdt_pad; u16 gdt_limit; u32 gdt_base; #else u64 start; + u32 cr4; + u32 efer_low; + u32 efer_high; #endif -} __attribute__((__packed__)); +}; extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 712fba8fd774..66ac276cf361 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -6,6 +6,7 @@ #include struct real_mode_header *real_mode_header; +u32 *trampoline_cr4_features; void __init setup_real_mode(void) { @@ -64,7 +65,14 @@ void __init setup_real_mode(void) trampoline_header->gdt_limit = __BOOT_DS + 7; trampoline_header->gdt_base = __pa(boot_gdt); #else + if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low, + 
&trampoline_header->efer_high)) + BUG(); + trampoline_header->start = (u64) secondary_startup_64; + trampoline_cr4_features = &trampoline_header->cr4; + *trampoline_cr4_features = read_cr4(); + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7a14fece9cfc..efcf305210a4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -975,6 +975,8 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ mmu_cr4_features = read_cr4(); + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index b4c32632bf16..4612d5382791 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -9,6 +9,7 @@ .section ".header", "a" + .balign 16 GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 3f7293239365..66e26f088288 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -34,9 +34,9 @@ #include "realmode.h" .text - .balign PAGE_SIZE .code16 + .balign PAGE_SIZE ENTRY(trampoline_start) cli # We should be safe anyway wbinvd @@ -65,8 +65,8 @@ ENTRY(trampoline_start) * to 32 bit. */ - lidtl tidt # load idt with 0, 0 - lgdtl tgdt # load gdt with whatever is appropriate + lidtl tr_idt # load idt with 0, 0 + lgdtl tr_gdt # load gdt with whatever is appropriate movw $__KERNEL_DS, %dx # Data segment descriptor @@ -93,16 +93,17 @@ ENTRY(startup_32) movl %edx, %fs movl %edx, %gs - movl $X86_CR4_PAE, %eax + movl pa_tr_cr4, %eax movl %eax, %cr4 # Enable PAE mode # Setup trampoline 4 level pagetables movl $pa_trampoline_pgd, %eax movl %eax, %cr3 + # Set up EFER + movl pa_tr_efer, %eax + movl pa_tr_efer + 4, %edx movl $MSR_EFER, %ecx - movl $((1 << _EFER_LME) | (1 << _EFER_NX)), %eax # Enable Long Mode - xorl %edx, %edx wrmsr # Enable paging and in turn activate Long Mode @@ -124,23 +125,4 @@ ENTRY(startup_64) # Now jump into the kernel using virtual addresses jmpq *tr_start(%rip) - .section ".rodata","a" - .balign 16 -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 16 - .globl tgdt -tgdt: - .short tgdt_end - tgdt - 1 # gdt limit - .long pa_tgdt - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index c3f951c468c5..cac444b942f8 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,5 +1,20 @@ .section ".rodata","a" +#ifdef CONFIG_X86_64 + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 16 + .globl tr_gdt +tr_gdt: + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tr_gdt_end: +#endif + .balign 4 tr_idt: .fill 1, 6, 0 @@ -8,12 +23,16 @@ tr_idt: .fill 1, 6, 0 .balign 4 GLOBAL(trampoline_status) 
.space 4 + .balign 8 GLOBAL(trampoline_header) #ifdef CONFIG_X86_32 tr_start: .space 4 + tr_gdt_pad: .space 2 tr_gdt: .space 6 #else tr_start: .space 8 + GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_efer) .space 8 #endif END(trampoline_header)
-- cgit v1.2.1
From 35bdd29095ad614c5fb4a934bfd4f57a94dfd395 Mon Sep 17 00:00:00 2001
From: Alessandro Rubini
Date: Thu, 12 Apr 2012 10:48:44 +0200
Subject: mfd: Add driver for STA2X11 MFD block

This also introduces <asm/sta2x11.h>, to export a function that is in the base sta2x11 support patches. The header will increase with other prototypes and constants over time.

Signed-off-by: Alessandro Rubini
Acked-by: Giancarlo Asnaghi
Cc: Alan Cox
Signed-off-by: Samuel Ortiz
---
arch/x86/include/asm/sta2x11.h | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 arch/x86/include/asm/sta2x11.h
(limited to 'arch/x86')
diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 000000000000..e9d32df89ccc
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,12 @@ +/* + * Header file for STMicroelectronics ConneXt (STA2X11) IOHub + */ +#ifndef __ASM_STA2X11_H +#define __ASM_STA2X11_H + +#include <linux/pci.h> + +/* This needs to be called from the MFD to configure its sub-devices */ +struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); + +#endif /* __ASM_STA2X11_H */
-- cgit v1.2.1
From c5403aed044e23f8d1ecdf05d0ff120314186527 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen
Date: Wed, 9 May 2012 23:25:06 +0300
Subject: x86, realmode: build fix: remove duplicate build

The real-mode binary was built twice. This patch fixes the issue by making realmode.relocs a prerequisite of realmode.bin. [ hpa: removed the direct dependency on realmode.relocs in arch/x86/realmode/Makefile ]

Signed-off-by: Jarkko Sakkinen
Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com
Cc: Sam Ravnborg
Cc: Michal Marek
Signed-off-by: H. Peter Anvin
---
arch/x86/realmode/Makefile | 5 +----
arch/x86/realmode/rm/Makefile | 2 +-
2 files changed, 2 insertions(+), 5 deletions(-)
(limited to 'arch/x86')
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index f22a4f8d99d6..a05b3aca64ad 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -11,10 +11,7 @@ subdir- := rm obj-y += rmpiggy.o -$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin +$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin $(obj)/rm/realmode.bin: FORCE $(Q)$(MAKE) $(build)=$(obj)/rm $@ - -$(obj)/rm/realmode.relocs: FORCE - $(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index de40bc44b92f..1c1d3d3bbee4 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -48,7 +48,7 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE OBJCOPYFLAGS_realmode.bin := -O binary -$(obj)/realmode.bin: $(obj)/realmode.elf +$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@
-- cgit v1.2.1
From 0f6f11eb00830fa691c16084048f53d83c5c3a5d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Wed, 9 May 2012 14:53:01 -0700
Subject: x86, realmode: Make sure all generated files are listed in targets

Kbuild expects all generated files to be listed in the targets variable. If it isn't, weird things happen.

Cc: Sam Ravnborg
Cc: Michal Marek
Signed-off-by: H.
Peter Anvin
Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com
---
arch/x86/realmode/rm/Makefile | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
(limited to 'arch/x86')
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 1c1d3d3bbee4..5b84a2d30888 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,13 +7,7 @@ # # -always := realmode.bin - -realmode-y += header.o -realmode-y += trampoline_$(BITS).o -realmode-y += stack.o -realmode-$(CONFIG_X86_32) += reboot_32.o -realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) +always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o wakeup-objs += copy.o bioscall.o regs.o
@@ -25,6 +19,12 @@ wakeup-objs += video-vga.o wakeup-objs += video-vesa.o wakeup-objs += video-bios.o +realmode-y += header.o +realmode-y += trampoline_$(BITS).o +realmode-y += stack.o +realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) + targets += $(realmode-y) REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
@@ -35,24 +35,30 @@ quiet_cmd_pasyms = PASYMS $@ cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \ sed $(sed-pasyms) | sort | uniq > $@ +targets += pasyms.h $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE $(call if_changed,pasyms) +targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h LDFLAGS_realmode.elf := --emit-relocs -T CPPFLAGS_realmode.lds += -P -C -I$(obj) +targets += realmode.elf $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_realmode.bin := -O binary +targets += realmode.bin $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ + +targets += realmode.relocs $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE $(call if_changed,relocs)
-- cgit v1.2.1
From 34d0b02e08470c56a411ba6da1f377bc6da02826 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen
Date: Thu, 10 May 2012 10:11:38 +0300
Subject: x86, realmode: Fix no cache bits test in reboot_32.S

Before the new real-mode code infrastructure, %edx was used for testing the CD and NW bits with andl in order to decide whether to flush the processor caches or not. The value of cr0 was also stored in %eax, which was later used to set cr0 after masking out the lower byte (except the TS bit) in order to enter real mode. In the new real-mode code infrastructure we wanted to keep the input parameter in %eax, so we are using %edx for both cr0 cases. This has caused a regression, since andl overwrites the value of %edx. This patch fixes the issue by replacing andl with testl, which is essentially andl without writing the result to the register. Special thanks to Paolo Bonzini for noting this and proposing a fix.

Reported-and-tested-by: Paolo Bonzini
Signed-off-by: Jarkko Sakkinen
Link: http://lkml.kernel.org/r/1336633898-23743-1-git-send-email-jarkko.sakkinen@intel.com
Signed-off-by: H.
Peter Anvin
---
arch/x86/realmode/rm/reboot_32.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch/x86')
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S
index 8d9bfd13a93e..114044876b3d 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot_32.S
@@ -76,7 +76,7 @@ machine_real_restart_asm16: movl %edx, %cr0 movl %ecx, %cr3 movl %cr0, %edx - andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + testl $0x60000000, %edx /* If no cache bits -> no wbinvd */ jz 2f wbinvd 2:
-- cgit v1.2.1
From 5f3fbc342f408199e5cbb4b3dc220569147a99a7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Mon, 14 May 2012 14:58:58 +0800
Subject: KVM: VMX: unlink vmcs on fail path

fix:
[ 1529.577273] Call Trace:
[ 1529.577289] [] kvm_arch_hardware_disable+0x13/0x30 [kvm]
[ 1529.577302] [] hardware_disable_nolock+0x35/0x39 [kvm]
[ 1529.577311] [] ? cpumask_clear_cpu.constprop.31+0x13/0x13 [kvm]
[ 1529.577315] [] on_each_cpu+0x44/0x84
[ 1529.577326] [] hardware_disable_all_nolock+0x34/0x36 [kvm]
[ 1529.577335] [] hardware_disable_all+0x2b/0x39 [kvm]
[ 1529.577349] [] kvm_put_kvm+0xed/0x10f [kvm]
[ 1529.577358] [] kvm_vm_release+0x22/0x28 [kvm]

Signed-off-by: Xiao Guangrong
Signed-off-by: Avi Kivity
---
arch/x86/kvm/vmx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'arch/x86')
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61ebdb6390ee..3062ea95266e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6350,7 +6350,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->loaded_vmcs->vmcs); + free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu:
-- cgit v1.2.1
From d54e4237bcbb400fda11c902fd538aa0b4805720 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 7 May 2012 12:12:25 +0200
Subject: KVM: x86 emulator: convert bsf/bsr instructions to emulate_2op_SrcV_nobyte()

The instruction emulation for bsrw is broken in KVM because the code always uses bsr with a 32 or 64 bit operand size for emulation. Fix that by using the emulate_2op_SrcV_nobyte() macro, which uses the guest operand size for emulation.

Signed-off-by: Joerg Roedel
Signed-off-by: Avi Kivity
---
arch/x86/kvm/emulate.c | 26 ++------------------------
1 file changed, 2 insertions(+), 24 deletions(-)
(limited to 'arch/x86')
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7fd25763b0e0..f95d242ee9f7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3133,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt) static int em_bsf(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsf"); return X86EMUL_CONTINUE; } static int em_bsr(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback.
*/ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsr"); return X86EMUL_CONTINUE; } -- cgit v1.2.1 From 512d5649e8dc3ed36f2ebf0818da64a4d4c2544a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 13 May 2012 19:53:23 +0300 Subject: KVM: VMX: Fix %ds/%es clobber The vmx exit code unconditionally restores %ds and %es to __USER_DS. This can override the user's values, since %ds and %es are not saved and restored in x86_64 syscalls. In practice, this isn't dangerous since nobody uses segment registers in long mode, least of all programs that use KVM. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3062ea95266e..f2ee016e1004 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6102,7 +6102,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u16 _ds, _es; + savesegment(ds, _ds); + savesegment(es, _es); if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (vmcs12->idt_vectoring_info_field & @@ -6263,7 +6266,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } } - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + loadsegment(ds, _ds); + loadsegment(es, _es); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); -- cgit v1.2.1 From b2da15ac26a0c00fc0d399a2bc5cf3c4e15f0b4f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 13 May 2012 19:53:24 +0300 Subject: KVM: VMX: Optimize %ds, %es reload On x86_64, we can defer %ds and %es reload to the heavyweight context switch, since nothing in the lightweight paths uses the host %ds or %es (they are ignored by the processor). Furthermore we can avoid the load if the segments are null, by letting the hardware load the null segments for us. This is the expected case. On i386, we could avoid the reload entirely, since the entry.S paths take care of reload, except for the SYSEXIT path which leaves %ds and %es set to __USER_DS. So we set them to the same values as well. Saves about 70 cycles out of 1600 (around 4%; noisy measurements). 
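In the common case the optimization therefore reduces to reading the selector on the lightweight path and writing it back only if it was non-null. A minimal sketch of that policy, using the kernel's existing savesegment()/loadsegment() helpers from <asm/segment.h>; the surrounding names are illustrative only, the real code is in the vmx.c hunks below:

	u16 ds_sel;

	/* Lightweight path: only read the selector; never write %ds here. */
	savesegment(ds, ds_sel);		/* mov %ds, ds_sel */

	/* ... guest runs; on exit the CPU loads the null HOST_DS_SELECTOR ... */

	/* Heavyweight path: reload only if userspace had a non-null %ds. */
	if (unlikely(ds_sel))
		loadsegment(ds, ds_sel);	/* the expensive segment load */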
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f2ee016e1004..32eb58866292 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -393,6 +393,9 @@ struct vcpu_vmx { struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif int gs_ldt_reload_needed; int fs_reload_needed; } host_state; @@ -1417,6 +1420,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = 1; } +#ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); +#endif + #ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); @@ -1457,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) } if (vmx->host_state.fs_reload_needed) loadsegment(fs, vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { + loadsegment(ds, vmx->host_state.ds_sel); + loadsegment(es, vmx->host_state.es_sel); + } +#else + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif reload_tss(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); @@ -3640,8 +3661,18 @@ static void vmx_set_constant_host_state(void) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * __vmx_load_host_state(), in case userspace uses the null selectors + * too (the expected case). + */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ @@ -6102,10 +6133,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u16 _ds, _es; - savesegment(ds, _ds); - savesegment(es, _es); if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (vmcs12->idt_vectoring_info_field & @@ -6266,8 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } } - loadsegment(ds, _ds); - loadsegment(es, _es); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); -- cgit v1.2.1 From c142786c6291189b5c85f53d91743e1eefbd8fe0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2012 15:44:06 +0300 Subject: KVM: MMU: Don't use RCU for lockless shadow walking Using RCU for lockless shadow walking can increase the amount of memory in use by the system, since RCU grace periods are unpredictable. We also have an unconditional write to a shared variable (reader_counter), which isn't good for scaling. Replace that with a scheme similar to x86's get_user_pages_fast(): disable interrupts during lockless shadow walk to force the freer (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the processor with interrupts enabled. 
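A hypothetical caller shows the intended shape of such a walk (a sketch only: for_each_shadow_entry(), mmu_spte_get_lockless() and is_shadow_present_pte() are existing mmu.c helpers, while the walker body here is made up for illustration):

	struct kvm_shadow_walk_iterator iterator;
	u64 spte;

	walk_shadow_page_lockless_begin(vcpu);		/* disables interrupts */
	for_each_shadow_entry(vcpu, addr, iterator) {
		spte = mmu_spte_get_lockless(iterator.sptep);
		if (!is_shadow_present_pte(spte))
			break;
		/* ... inspect spte without holding mmu_lock ... */
	}
	walk_shadow_page_lockless_end(vcpu);		/* re-enables interrupts */

While interrupts are off, this CPU cannot acknowledge the flush IPI, so a zapper in kvm_mmu_commit_zap_page() is guaranteed to wait in kvm_flush_remote_tlbs() until the walk has finished before it frees the shadow pages.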
We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent kvm_flush_remote_tlbs() from avoiding the IPI. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 --- arch/x86/kvm/mmu.c | 73 ++++++++++++++++------------------------- 2 files changed, 29 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e39bc7e36f..64c8989263f6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -240,8 +240,6 @@ struct kvm_mmu_page { #endif int write_flooding_count; - - struct rcu_head rcu; }; struct kvm_pio_request { @@ -540,8 +538,6 @@ struct kvm_arch { u64 hv_guest_os_id; u64 hv_hypercall; - atomic_t reader_counter; - #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07424cf60434..72102e0ab7cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - rcu_read_lock(); - atomic_inc(&vcpu->kvm->arch.reader_counter); - - /* Increase the counter before walking shadow page table */ - smp_mb__after_atomic_inc(); + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); + vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_mb(); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* Decrease the counter after walking shadow page table finished */ - smp_mb__before_atomic_dec(); - atomic_dec(&vcpu->kvm->arch.reader_counter); - rcu_read_unlock(); + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_mb(); + vcpu->mode = OUTSIDE_GUEST_MODE; + local_irq_enable(); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, return ret; } -static void kvm_mmu_isolate_pages(struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, invalid_list, link) - kvm_mmu_isolate_page(sp); -} - -static void free_pages_rcu(struct rcu_head *head) -{ - struct kvm_mmu_page *next, *sp; - - sp = container_of(head, struct kvm_mmu_page, rcu); - while (sp) { - if (!list_empty(&sp->link)) - next = list_first_entry(&sp->link, - struct kvm_mmu_page, link); - else - next = NULL; - kvm_mmu_free_page(sp); - sp = next; - } -} - static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { @@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if (list_empty(invalid_list)) return; - kvm_flush_remote_tlbs(kvm); - - if (atomic_read(&kvm->arch.reader_counter)) { - kvm_mmu_isolate_pages(invalid_list); - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - list_del_init(invalid_list); + /* + * wmb: make sure everyone sees our modifications to the page tables + * rmb: make sure we see changes to vcpu->mode + */ + smp_mb(); - trace_kvm_mmu_delay_free_pages(sp); - call_rcu(&sp->rcu, free_pages_rcu); - return; - } + /* + * Wait for all vcpus to exit guest mode and/or lockless shadow + * page table walks. 
+ */ + kvm_flush_remote_tlbs(kvm); do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); @@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); - } /* -- cgit v1.2.1 From 796038799a72adb279d785c9154df6eeb98b6e8d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:22:41 -0700 Subject: x86, realmode: Mask out EFER.LMA when saving trampoline EFER Some AMD processors apparently #GP(0) if EFER.LMA is set in WRMSR, rather than ignoring it. Thus, we need to mask it out. Reported-by: Ingo Molnar Tested-by: Borislav Petkov Cc: Jarkko Sakkinen Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/kernel/realmode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 66ac276cf361..099277984b80 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -22,6 +22,7 @@ void __init setup_real_mode(void) size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); #ifdef CONFIG_X86_64 u64 *trampoline_pgd; + u32 efer_low, efer_high; #endif /* Has to be in very low memory so we can execute real-mode AP code. */ @@ -65,9 +66,13 @@ void __init setup_real_mode(void) trampoline_header->gdt_limit = __BOOT_DS + 7; trampoline_header->gdt_base = __pa(boot_gdt); #else - if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low, - &trampoline_header->efer_high)) - BUG(); + /* + * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR + * so we need to mask it out. + */ + rdmsr(MSR_EFER, efer_low, efer_high); + trampoline_header->efer_low = efer_low & ~EFER_LMA; + trampoline_header->efer_high = efer_high; trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; -- cgit v1.2.1 From 51edbe6a2f47c78c6c6e529999ee0a044fe59a89 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:44:10 -0700 Subject: x86, realmode: Move not-common bits out of trampoline_common.S Move the bits that aren't actually common out of trampoline_common.S and into the arch-specific files. Furthermore, make sure the page directory is first in the .bss section for trampoline_64.S in order to not waste an entire page of memory. Signed-off-by: H. 
Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/realmode/rm/trampoline_32.S | 8 ++++++++ arch/x86/realmode/rm/trampoline_64.S | 25 +++++++++++++++++++++++ arch/x86/realmode/rm/trampoline_common.S | 35 -------------------------------- 3 files changed, 33 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 6fc064b4d2b9..c1b2791183e7 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -63,4 +63,12 @@ ENTRY(trampoline_start) ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax + .bss + .balign 8 +GLOBAL(trampoline_header) + tr_start: .space 4 + tr_gdt_pad: .space 2 + tr_gdt: .space 6 +END(trampoline_header) + #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 66e26f088288..1b9e1bc1ac5e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -125,4 +125,29 @@ ENTRY(startup_64) # Now jump into the kernel using virtual addresses jmpq *tr_start(%rip) + .section ".rodata","a" + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 16 + .globl tr_gdt +tr_gdt: + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tr_gdt_end: + + .bss + .balign PAGE_SIZE +GLOBAL(trampoline_pgd) .space PAGE_SIZE + + .balign 8 +GLOBAL(trampoline_header) + tr_start: .space 8 + GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_efer) .space 8 +END(trampoline_header) + #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index cac444b942f8..b1ecdb9692ad 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,42 +1,7 @@ .section ".rodata","a" - -#ifdef CONFIG_X86_64 - # Duplicate the global descriptor table - # so the kernel can live anywhere .balign 16 - .globl tr_gdt -tr_gdt: - .short tr_gdt_end - tr_gdt - 1 # gdt limit - .long pa_tr_gdt - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tr_gdt_end: -#endif - - .balign 4 tr_idt: .fill 1, 6, 0 .bss - .balign 4 GLOBAL(trampoline_status) .space 4 - - .balign 8 -GLOBAL(trampoline_header) -#ifdef CONFIG_X86_32 - tr_start: .space 4 - tr_gdt_pad: .space 2 - tr_gdt: .space 6 -#else - tr_start: .space 8 - GLOBAL(tr_cr4) .space 4 - GLOBAL(tr_efer) .space 8 -#endif -END(trampoline_header) - -#ifdef CONFIG_X86_64 - .balign PAGE_SIZE -GLOBAL(trampoline_pgd) .space PAGE_SIZE -#endif -- cgit v1.2.1 From 137127018812ec7fcccb9843156cfc0b5cfa31d5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:49:10 -0700 Subject: x86, realmode: Move kernel/realmode.c to realmode/init.c Keep all the realmode code together, including initialization (only the rm/ subdirectory is actually built as real-mode code, anyway.) Signed-off-by: H. 
Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/realmode.c | 116 --------------------------------------------- arch/x86/realmode/Makefile | 1 + arch/x86/realmode/init.c | 116 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 117 deletions(-) delete mode 100644 arch/x86/kernel/realmode.c create mode 100644 arch/x86/realmode/init.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4a20f4441ffe..08484332f329 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c deleted file mode 100644 index 099277984b80..000000000000 --- a/arch/x86/kernel/realmode.c +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - -#include -#include -#include - -struct real_mode_header *real_mode_header; -u32 *trampoline_cr4_features; - -void __init setup_real_mode(void) -{ - phys_addr_t mem; - u16 real_mode_seg; - u32 *rel; - u32 count; - u32 *ptr; - u16 *seg; - int i; - unsigned char *base; - struct trampoline_header *trampoline_header; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); -#ifdef CONFIG_X86_64 - u64 *trampoline_pgd; - u32 efer_low, efer_high; -#endif - - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - base = __va(mem); - memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); - - memcpy(base, real_mode_blob, size); - - real_mode_seg = __pa(base) >> 4; - rel = (u32 *) real_mode_relocs; - - /* 16-bit segment relocations. */ - count = rel[0]; - rel = &rel[1]; - for (i = 0; i < count; i++) { - seg = (u16 *) (base + rel[i]); - *seg = real_mode_seg; - } - - /* 32-bit linear relocations. */ - count = rel[i]; - rel = &rel[i + 1]; - for (i = 0; i < count; i++) { - ptr = (u32 *) (base + rel[i]); - *ptr += __pa(base); - } - - /* Must be perfomed *after* relocation. */ - trampoline_header = (struct trampoline_header *) - __va(real_mode_header->trampoline_header); - -#ifdef CONFIG_X86_32 - trampoline_header->start = __pa(startup_32_smp); - trampoline_header->gdt_limit = __BOOT_DS + 7; - trampoline_header->gdt_base = __pa(boot_gdt); -#else - /* - * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR - * so we need to mask it out. - */ - rdmsr(MSR_EFER, efer_low, efer_high); - trampoline_header->efer_low = efer_low & ~EFER_LMA; - trampoline_header->efer_high = efer_high; - - trampoline_header->start = (u64) secondary_startup_64; - trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = read_cr4(); - - trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; -#endif -} - -/* - * set_real_mode_permissions() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page - * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. 
- */ -static int __init set_real_mode_permissions(void) -{ - unsigned char *base = (unsigned char *) real_mode_header; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); - - size_t ro_size = - PAGE_ALIGN(real_mode_header->ro_end) - - __pa(base); - - size_t text_size = - PAGE_ALIGN(real_mode_header->ro_end) - - real_mode_header->text_start; - - unsigned long text_start = - (unsigned long) __va(real_mode_header->text_start); - - set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); - set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); - set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); - - return 0; -} - -arch_initcall(set_real_mode_permissions); diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index a05b3aca64ad..94f7fbe97b08 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -9,6 +9,7 @@ subdir- := rm +obj-y += init.o obj-y += rmpiggy.o $(obj)/rmpiggy.o: $(obj)/rm/realmode.bin diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c new file mode 100644 index 000000000000..099277984b80 --- /dev/null +++ b/arch/x86/realmode/init.c @@ -0,0 +1,116 @@ +#include +#include + +#include +#include +#include + +struct real_mode_header *real_mode_header; +u32 *trampoline_cr4_features; + +void __init setup_real_mode(void) +{ + phys_addr_t mem; + u16 real_mode_seg; + u32 *rel; + u32 count; + u32 *ptr; + u16 *seg; + int i; + unsigned char *base; + struct trampoline_header *trampoline_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); +#ifdef CONFIG_X86_64 + u64 *trampoline_pgd; + u32 efer_low, efer_high; +#endif + + /* Has to be in very low memory so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + base = __va(mem); + memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); + + memcpy(base, real_mode_blob, size); + + real_mode_seg = __pa(base) >> 4; + rel = (u32 *) real_mode_relocs; + + /* 16-bit segment relocations. */ + count = rel[0]; + rel = &rel[1]; + for (i = 0; i < count; i++) { + seg = (u16 *) (base + rel[i]); + *seg = real_mode_seg; + } + + /* 32-bit linear relocations. */ + count = rel[i]; + rel = &rel[i + 1]; + for (i = 0; i < count; i++) { + ptr = (u32 *) (base + rel[i]); + *ptr += __pa(base); + } + + /* Must be perfomed *after* relocation. */ + trampoline_header = (struct trampoline_header *) + __va(real_mode_header->trampoline_header); + +#ifdef CONFIG_X86_32 + trampoline_header->start = __pa(startup_32_smp); + trampoline_header->gdt_limit = __BOOT_DS + 7; + trampoline_header->gdt_base = __pa(boot_gdt); +#else + /* + * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR + * so we need to mask it out. + */ + rdmsr(MSR_EFER, efer_low, efer_high); + trampoline_header->efer_low = efer_low & ~EFER_LMA; + trampoline_header->efer_high = efer_high; + + trampoline_header->start = (u64) secondary_startup_64; + trampoline_cr4_features = &trampoline_header->cr4; + *trampoline_cr4_features = read_cr4(); + + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); + trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; +#endif +} + +/* + * set_real_mode_permissions() gets called very early, to guarantee the + * availability of low memory. 
This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. + */ +static int __init set_real_mode_permissions(void) +{ + unsigned char *base = (unsigned char *) real_mode_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + + size_t ro_size = + PAGE_ALIGN(real_mode_header->ro_end) - + __pa(base); + + size_t text_size = + PAGE_ALIGN(real_mode_header->ro_end) - + real_mode_header->text_start; + + unsigned long text_start = + (unsigned long) __va(real_mode_header->text_start); + + set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); + set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); + set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); + + return 0; +} + +arch_initcall(set_real_mode_permissions); -- cgit v1.2.1 From 638d957b51c88852de72f15f7cd588d125e97dab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 14:02:05 -0700 Subject: x86, realmode: Change EFER to a single u64 field Change EFER to be a single u64 field instead of two u32 fields; change the order to maintain alignment. Note that on x86-64 cr4 is really also a 64-bit quantity, although we can only set the low 32 bits from the trampoline code since it is still executing in 32-bit mode at that point. Signed-off-by: H. Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/include/asm/realmode.h | 3 +-- arch/x86/realmode/init.c | 7 +++---- arch/x86/realmode/rm/trampoline_64.S | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 937dc6071d76..fce3f4ae5bd6 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -35,9 +35,8 @@ struct trampoline_header { u32 gdt_base; #else u64 start; + u64 efer; u32 cr4; - u32 efer_low; - u32 efer_high; #endif }; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 099277984b80..cbca565af5bd 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -22,7 +22,7 @@ void __init setup_real_mode(void) size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); #ifdef CONFIG_X86_64 u64 *trampoline_pgd; - u32 efer_low, efer_high; + u64 efer; #endif /* Has to be in very low memory so we can execute real-mode AP code. */ @@ -70,9 +70,8 @@ void __init setup_real_mode(void) * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR * so we need to mask it out. 
*/ - rdmsr(MSR_EFER, efer_low, efer_high); - trampoline_header->efer_low = efer_low & ~EFER_LMA; - trampoline_header->efer_high = efer_high; + rdmsrl(MSR_EFER, efer); + trampoline_header->efer = efer & ~EFER_LMA; trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 1b9e1bc1ac5e..bb360dc39d21 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -146,8 +146,8 @@ GLOBAL(trampoline_pgd) .space PAGE_SIZE .balign 8 GLOBAL(trampoline_header) tr_start: .space 8 - GLOBAL(tr_cr4) .space 4 GLOBAL(tr_efer) .space 8 + GLOBAL(tr_cr4) .space 4 END(trampoline_header) #include "trampoline_common.S" -- cgit v1.2.1 From d8368af8b46b904def42a0f341d2f4f29001fa77 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2012 18:07:56 +0300 Subject: KVM: Fix mmu_reload() clash with nested vmx event injection Currently the inject_pending_event() call during guest entry happens after kvm_mmu_reload(). This is for historical reasons - we used to inject_pending_event() in atomic context, while kvm_mmu_reload() needs task context. A problem is that nested vmx can cause the mmu context to be reset, if event injection is intercepted and causes a #VMEXIT instead (the #VMEXIT resets CR0/CR3/CR4). If this happens, we end up with invalid root_hpa, and since kvm_mmu_reload() has already run, no one will fix it and we end up entering the guest this way. Fix by reordering event injection to be before kvm_mmu_reload(). Use ->cancel_injection() to undo if kvm_mmu_reload() fails. https://bugzilla.kernel.org/show_bug.cgi?id=42980 Reported-by: Luke-Jr Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4de705cdcafd..b78f89d34242 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5279,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); } - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); @@ -5298,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) { + kvm_x86_ops->cancel_injection(vcpu); + goto out; + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); -- cgit v1.2.1 From bea3f8781e30d0abc0bd0da80aa528d44c71959e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 00:24:09 -0700 Subject: x86, relocs: Workaround for binutils 2.22.52.0.1 section bug GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from section-relative to absolute if they are in a section of zero length. This turns the symbols __init_begin and __init_end into absolute symbols. Let the relocs program know that those should be treated as relative symbols. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H.J. Lu --- arch/x86/tools/relocs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 74e16bb15dc4..4df285450e8c 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,7 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { * as absolute (typically defined outside any section in the linker script.) 
*/ [S_REL] = - "^_end$", + "^(__init_begin|__init_end|_end)$" }; -- cgit v1.2.1 From c54a354c1835e7412a53458891b9ea05361b4e8a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 08:31:44 -0700 Subject: x86, relocs: More relocations which may end up as absolute GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from section-relative to absolute if they are in a section of zero length. This turns the symbols __init_begin and __init_end into absolute symbols. Let the relocs program know that those should be treated as relative symbols. This bug is exposed by checkin 433de739bbc2 x86, realmode: 16-bit real-mode code support for relocs tool only in the sense that that checkin changes the relocs tool to report an error instead of silently generating a kernel which is broken if relocated. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H.J. Lu Cc: Jarkko Sakkinen --- arch/x86/tools/relocs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 4df285450e8c..b49c2119295e 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,7 +56,11 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { * as absolute (typically defined outside any section in the linker script.) */ [S_REL] = - "^(__init_begin|__init_end|_end)$" + "^(__init_(begin|end)|" + "__x86_cpu_dev_(start|end)|" + "(__parainstructions|__alt_instructions)(|_end)|" + "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "_end)$" }; -- cgit v1.2.1 From 8a3b947c40cb36100f316ac0d433f4ae554ee4cc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 09:52:01 -0700 Subject: x86, relocs: When printing an error, say relative or absolute When the relocs tool throws an error, let the error message say if it is an absolute or relative symbol. This should make it a lot more clear what action the programmer needs to take. Signed-off-by: H. Peter Anvin --- arch/x86/tools/relocs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b49c2119295e..dce982d4bc31 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -570,10 +570,14 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), Elf32_Sym *sym; unsigned r_type; const char *symname; + int shn_abs; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); + shn_abs = sym->st_shndx == SHN_ABS; + switch (r_type) { case R_386_NONE: case R_386_PC32: @@ -589,7 +593,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), symname = sym_name(sym_strtab, sym); if (!use_real_mode) goto bad; - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_SEG, symname)) @@ -605,7 +609,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), case R_386_32: symname = sym_name(sym_strtab, sym); - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_REL, symname)) @@ -623,7 +627,8 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), break; bad: symname = sym_name(sym_strtab, sym); - die("Invalid %s relocation: %s\n", + die("Invalid %s %s relocation: %s\n", + shn_abs ? 
"absolute" : "relative", rel_type(r_type), symname); } } -- cgit v1.2.1 From 61f5446169046c217a5479517edac3a890c3bee7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 21 May 2012 00:02:45 -0700 Subject: x86, realmode: Move end signature into header.S The end signature was defined in wakeup_asm.S as it originally came from the ACPI wakeup code. However, we rely on the existence of the .signature section to expand .bss, otherwise we would have to include code to explicitly zero the .bss depending on the configuration. Since the expanded .bss is just in .init.data anyway, it's easier to always have it expanded. This fixes failures when compiled without CONFIG_ACPI_SLEEP. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/realmode/rm/header.S | 9 +++++++++ arch/x86/realmode/rm/realmode.h | 5 +++++ arch/x86/realmode/rm/wakeup.h | 1 - arch/x86/realmode/rm/wakeup_asm.S | 6 +----- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 4612d5382791..fadf48378ada 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -7,6 +7,8 @@ #include #include +#include "realmode.h" + .section ".header", "a" .balign 16 @@ -30,3 +32,10 @@ GLOBAL(real_mode_header) .long pa_machine_real_restart_asm #endif END(real_mode_header) + + /* End signature, used to verify integrity */ + .section ".signature","a" + .balign 4 +GLOBAL(end_signature) + .long REALMODE_END_SIGNATURE +END(end_signature) diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h index 15ab6335f843..d74cff6350ed 100644 --- a/arch/x86/realmode/rm/realmode.h +++ b/arch/x86/realmode/rm/realmode.h @@ -13,4 +13,9 @@ #endif /* __ASSEMBLY__ */ +/* + * Signature at the end of the realmode region + */ +#define REALMODE_END_SIGNATURE 0x65a22c82 + #endif /* ARCH_X86_REALMODE_RM_REALMODE_H */ diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 2dfaf06b8af1..9317e0042f24 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -33,7 +33,6 @@ extern struct wakeup_header wakeup_header; #define WAKEUP_HEADER_OFFSET 8 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 /* Wakeup behavior bits */ #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 46108f05e04e..8905166b0bbb 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -85,7 +85,7 @@ ENTRY(wakeup_start) /* Check we really have everything... */ movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax + cmpl $REALMODE_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -175,7 +175,3 @@ GLOBAL(wakeup_idt) .long 0 /* address */ .word 0 END(wakeup_idt) - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE -- cgit v1.2.1 From 0a2b9a6ea93650b8a00f9fd5ee8fdd25671e2df6 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 29 Dec 2011 13:09:51 +0100 Subject: X86: integrate CMA with DMA-mapping subsystem This patch adds support for CMA to dma-mapping subsystem for x86 architecture that uses common pci-dma/pci-nommu implementation. This allows to test CMA on KVM/QEMU and a lot of common x86 boxes. 
Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park CC: Michal Nazarewicz Acked-by: Arnd Bergmann --- arch/x86/Kconfig | 1 + arch/x86/include/asm/dma-contiguous.h | 13 +++++++++++++ arch/x86/include/asm/dma-mapping.h | 5 +++++ arch/x86/kernel/pci-dma.c | 18 ++++++++++++++++-- arch/x86/kernel/pci-nommu.c | 8 +------- arch/x86/kernel/setup.c | 2 ++ 6 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 arch/x86/include/asm/dma-contiguous.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d8..7cbdfdac3c7c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h new file mode 100644 index 000000000000..c09241659971 --- /dev/null +++ b/arch/x86/include/asm/dma-contiguous.h @@ -0,0 +1,13 @@ +#ifndef ASMX86_DMA_CONTIGUOUS_H +#define ASMX86_DMA_CONTIGUOUS_H + +#ifdef __KERNEL__ + +#include +#include + +static inline void +dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } + +#endif +#endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 4b4331d71935..7b9227b44b9b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -62,6 +63,10 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); +extern void dma_generic_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs); + static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3003250ac51d..62c9457ccd2f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page; + struct page *page = NULL; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; dma_mask = dma_alloc_coherent_mask(dev, flag); flag |= __GFP_ZERO; again: - page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!(flag & GFP_ATOMIC)) + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (!page) + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) return NULL; @@ -127,6 +131,16 @@ again: return page_address(page); } +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, struct dma_attrs *attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = virt_to_page(vaddr); + + if (!dma_release_from_contiguous(dev, page, count)) + free_pages((unsigned long)vaddr, get_order(size)); +} + /* * See for the iommu kernel * parameter documentation. 
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index f96050685b46..871be4a84c7d 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} - static void nommu_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev, struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, - .free = nommu_free_coherent, + .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..d6c956e674cc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -934,6 +935,7 @@ void __init setup_arch(char **cmdline_p) } #endif memblock.current_limit = get_max_mapped(); + dma_contiguous_reserve(0); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. -- cgit v1.2.1 From 2f1bd67d544d3c086fb5101513f4b6c8f4291b43 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 21 May 2012 09:19:38 -0400 Subject: xen/smp: unbind irqworkX when unplugging vCPUs. Git commit 1ff2b0c303698e486f1e0886b4d9876200ef8ca5 ("xen: implement IRQ_WORK_VECTOR handler") added a per-cpu "irqworkX" interrupt for the IPI APIC functionality. However, it missed unbinding that interrupt when a vCPU is unplugged, resulting in an orphaned per-cpu interrupt line for the unplugged vCPU: 30: 216 0 xen-dyn-event hvc_console 31: 810 4 xen-dyn-event eth0 32: 29 0 xen-dyn-event blkif - 36: 0 0 xen-percpu-ipi irqwork2 - 37: 287 0 xen-dyn-event xenbus + 36: 287 0 xen-dyn-event xenbus NMI: 0 0 Non-maskable interrupts LOC: 0 0 Local timer interrupts SPU: 0 0 Spurious interrupts Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3ec3f8eb19fc..ce9e98b1e69c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -419,6 +419,7 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); -- cgit v1.2.1 From 68c2c39a76b094e9b2773e5846424ea674bf2c46 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 21 May 2012 16:54:10 +0100 Subject: xen: do not map the same GSI twice in PVHVM guests. PV on HVM guests map GSIs into event channels. At restore time the event channels are resumed by restore_pirqs. Device drivers might try to register the same GSI again through ACPI at restore time, but the GSI has already been mapped and bound by restore_pirqs. This patch detects these situations and avoids mapping the same GSI multiple times. Without this patch we get: (XEN) irq.c:2235: dom4: pirq 23 or emuirq 28 already mapped and we waste a pirq. 
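The fix is a look-before-you-map guard at the top of xen_register_pirq(); a sketch of the pattern (xen_irq_from_gsi() and the check come from the patch below, the full parameter list and the remainder of the function are reconstructed/elided):

	static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
				     bool set_pirq)	/* signature partly assumed */
	{
		int irq;

		/*
		 * Did restore_pirqs already map and bind this GSI?  Then
		 * hand back the existing irq instead of burning another
		 * pirq on a duplicate mapping.
		 */
		irq = xen_irq_from_gsi(gsi);
		if (irq > 0)
			return irq;

		/* ... existing mapping/binding code continues here ... */
	}

Since xen_irq_from_gsi() returns the irq previously bound to the GSI (or a non-positive value when there is none), the second ACPI-driven registration at restore time becomes a harmless no-op.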
CC: stable@kernel.org Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 7415aa927913..56ab74989cf1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -64,6 +64,10 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, int shareable = 0; char *name; + irq = xen_irq_from_gsi(gsi); + if (irq > 0) + return irq; + if (set_pirq) pirq = gsi; -- cgit v1.2.1 From a129a7c84582629741e5fa6f40026efcd7a65bd4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Nov 2010 13:16:22 +0100 Subject: MCE: Fix vm86 handling for 32bit mce handler When running on 32bit the mce handler could misinterpret vm86 mode as ring 0. This can affect whether it does recovery or not; it was possible to panic when recovery was actually possible. Fix this by always forcing vm86 to look like ring 3. Signed-off-by: Andi Kleen Cc: Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 66e1c51be084..5f793e6c854b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { m->ip = regs->ip; m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; } /* Use accurate RIP reporting if available. */ if (rip_msr) -- cgit v1.2.1 From 875e26648cf9b6db9d8dc07b7959d7c61fb3f49c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 23 May 2012 14:14:22 -0700 Subject: x86/mce: Fix check for processor context when machine check was taken. Linus pointed out that there was no value in checking whether m->ip was zero - because zero is a legitimate value. If we have a reliable (or faked in the VM86 case) "m->cs" we can use it to tell whether we were in user mode or kernel when the machine check hit. Reported-by: Linus Torvalds Cc: Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 0c82091b1652..1ccd453903d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -165,15 +165,19 @@ static struct severity { }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? 
IN_USER : IN_KERNEL; } int mce_severity(struct mce *m, int tolerant, char **msg) -- cgit v1.2.1 From 37c3459b67dd5a396a968e819cf4a86d24ac9ace Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 May 2012 11:12:14 -0700 Subject: x86/mce: Add instruction recovery signatures to mce-severity table Instruction recovery cases are very similar to the data recovery one we already have. Just trade out for a new MCACOD value. Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 1ccd453903d8..413c2ced887c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -126,6 +126,16 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), + MCESEV( + KEEP, "HT thread notices Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", -- cgit v1.2.1 From 4ae73f2d53255c388d50bf83c1681112a6f9cba1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 10:14:39 -0700 Subject: x86: use generic strncpy_from_user routine The generic strncpy_from_user() is not really optimal, since it is designed to work on both little-endian and big-endian. And on little-endian you can simplify much of the logic to find the first zero byte, since little-endian arithmetic doesn't have to worry about the carry bit propagating into earlier bytes (only later bytes, which we don't care about). But I have patches to make the generic routines use the architecture-specific infrastructure, so that we can regain the little-endian optimizations. But before we do that, switch over to the generic routines to make the patches each do just one well-defined thing. 
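The little-endian simplification referred to here is the classic word-at-a-time zero-byte test. A minimal stand-alone sketch in user-space C (not the kernel routine itself; REPEAT_BYTE mirrors the kernel macro of the same name):

	#include <stdint.h>
	#include <stdio.h>

	#define REPEAT_BYTE(x)	((~0ull / 0xff) * (x))	/* 0x01 -> 0x0101..01 */

	/*
	 * Nonzero iff some byte of v is 0x00: a zero byte borrows in the
	 * subtraction, is clear in v, and therefore keeps its 0x80 marker
	 * bit.  On little-endian the borrow only propagates into *later*
	 * bytes, so the first set marker reliably flags the first zero.
	 */
	static uint64_t has_zero(uint64_t v)
	{
		return (v - REPEAT_BYTE(0x01)) & ~v & REPEAT_BYTE(0x80);
	}

	int main(void)
	{
		printf("%d\n", has_zero(0x6161610061616161ull) != 0);	/* 1 */
		printf("%d\n", has_zero(0x6161616161616161ull) != 0);	/* 0 */
		return 0;
	}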
Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 1 + arch/x86/lib/usercopy.c | 97 ------------------------------------------ 3 files changed, 2 insertions(+), 97 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 81c3e8be789a..3220d44e24d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL if X86_64 select KTIME_SCALAR if X86_32 + select GENERIC_STRNCPY_FROM_USER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 851fe0dc13bc..1354facd8f63 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -32,6 +32,7 @@ #define segment_eq(a, b) ((a).seg == (b).seg) +#define user_addr_max() (current_thread_info()->addr_limit.seg) #define __addr_ok(addr) \ ((unsigned long __force)(addr) < \ (current_thread_info()->addr_limit.seg)) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 2e4e4b02c37a..f61ee67ec00f 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -43,100 +43,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); - -/* - * Do a strncpy, return length of string without final '\0'. - * 'count' is the user-supplied count (return 'count' if we - * hit it), 'max' is the address space maximum (and we return - * -EFAULT if we hit it). - */ -static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max) -{ - long res = 0; - - /* - * Truncate 'max' to the user-specified limit, so that - * we only have one limit we need to check in the loop - */ - if (max > count) - max = count; - - while (max >= sizeof(unsigned long)) { - unsigned long c, mask; - - /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) - break; - mask = has_zero(c); - if (mask) { - mask = (mask - 1) & ~mask; - mask >>= 7; - *(unsigned long *)(dst+res) = c & mask; - return res + count_masked_bytes(mask); - } - *(unsigned long *)(dst+res) = c; - res += sizeof(unsigned long); - max -= sizeof(unsigned long); - } - - while (max) { - char c; - - if (unlikely(__get_user(c,src+res))) - return -EFAULT; - dst[res] = c; - if (!c) - return res; - res++; - max--; - } - - /* - * Uhhuh. We hit 'max'. But was that the user-specified maximum - * too? If so, that's ok - we got as much as the user asked for. - */ - if (res >= count) - return res; - - /* - * Nope: we hit the address space limit, and we still had more - * characters the caller would have wanted. That's an EFAULT. - */ - return -EFAULT; -} - -/** - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. 
- */ -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - unsigned long max_addr, src_addr; - - if (unlikely(count <= 0)) - return 0; - - max_addr = current_thread_info()->addr_limit.seg; - src_addr = (unsigned long)src; - if (likely(src_addr < max_addr)) { - unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); - } - return -EFAULT; -} -EXPORT_SYMBOL(strncpy_from_user); -- cgit v1.2.1 From 36126f8f2ed8168eb13aa0662b9b9585cba100a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 10:43:17 -0700 Subject: word-at-a-time: make the interfaces truly generic This changes the interfaces in <asm/word-at-a-time.h> to be a bit more complicated, but a lot more generic. In particular, it allows us to really do the operations efficiently on both little-endian and big-endian machines, pretty much regardless of machine details. For example, if you can rely on a fast population count instruction on your architecture, this will allow you to make your optimized <asm/word-at-a-time.h> file with that. NOTE! The "generic" version in include/asm-generic/word-at-a-time.h is not truly generic, it actually only works on big-endian. Why? Because on little-endian the generic algorithms are wasteful, since you can inevitably do better. The x86 implementation is an example of that. (The only truly non-generic part of the asm-generic implementation is the "find_zero()" function, and you could make a little-endian version of it. And if the Kbuild infrastructure allowed us to pick a particular header file, that would be lovely) The functions are as follows: - WORD_AT_A_TIME_CONSTANTS: specific constants that the algorithm uses. - has_zero(): take a word, and determine if it has a zero byte in it. It gets the word, the pointer to the constant pool, and a pointer to an intermediate "data" field it can set. This is the "quick-and-dirty" zero tester: it's what is run inside the hot loops. - "prep_zero_mask()": take the word, the data that has_zero() produced, and the constant pool, and generate an *exact* mask of which byte had the first zero. This is run directly *outside* the loop, and allows the "has_zero()" function to answer the "is there a zero byte" question without necessarily getting exactly *which* byte is the first one to contain a zero. If you do multiple byte lookups concurrently (eg "hash_name()", which looks for both NUL and '/' bytes), after you've done the prep_zero_mask() phase, the result of those can be or'ed together to get the "either or" case. - The result from "prep_zero_mask()" can then be fed into "find_zero()" (to find the byte offset of the first byte that was zero) or into "zero_bytemask()" (to find the bytemask of the bytes preceding the zero byte). The existence of zero_bytemask() is optional, and is not necessary for the normal string routines. But dentry name hashing needs it, so if you enable DENTRY_WORD_AT_A_TIME you need to expose it. This changes the generic strncpy_from_user() function and the dentry hashing functions to use these modified word-at-a-time interfaces. This gets us back to the optimized state of the x86 strncpy that we lost in the previous commit when moving over to the generic version. 
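To make the new calling convention concrete, here is a stand-alone sketch that strings the four interfaces together the way a string routine does. The x86 little-endian variants are reimplemented locally for illustration; it assumes a 64-bit little-endian machine and substitutes a compiler builtin for the kernel's count_masked_bytes() trick:

	#include <stdio.h>
	#include <string.h>

	#define REPEAT_BYTE(x)	((~0ull / 0xff) * (x))

	struct word_at_a_time { unsigned long one_bits, high_bits; };
	#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

	static unsigned long has_zero(unsigned long a, unsigned long *bits,
				      const struct word_at_a_time *c)
	{
		unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
		*bits = mask;
		return mask;
	}

	/* Already exact on little-endian; big-endian must do real work. */
	static unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
					    const struct word_at_a_time *c)
	{
		return bits;
	}

	static unsigned long create_zero_mask(unsigned long bits)
	{
		bits = (bits - 1) & ~bits;
		return bits >> 7;	/* bytemask of bytes before the zero */
	}

	static unsigned long find_zero(unsigned long mask)
	{
		return __builtin_ctzl(mask + 1) / 8;	/* byte offset */
	}

	int main(void)
	{
		const struct word_at_a_time c = WORD_AT_A_TIME_CONSTANTS;
		unsigned long word, bits;

		memcpy(&word, "abc\0defg", sizeof(word));
		if (has_zero(word, &bits, &c)) {
			bits = prep_zero_mask(word, bits, &c);
			bits = create_zero_mask(bits);
			printf("NUL at byte %lu\n", find_zero(bits));	/* 3 */
		}
		return 0;
	}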
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/word-at-a-time.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index ae03facfadd6..5b238981542a 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -10,6 +10,11 @@ * bit count instruction, that might be better than the multiply * and shift, for example. */ +struct word_at_a_time { + const unsigned long one_bits, high_bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } #ifdef CONFIG_64BIT @@ -37,10 +42,31 @@ static inline long count_masked_bytes(long mask) #endif -/* Return the high bit set in the first byte that is a zero */ -static inline unsigned long has_zero(unsigned long a) +/* Return nonzero if it has a zero */ +static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) +{ + unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; + *bits = mask; + return mask; +} + +static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) +{ + return bits; +} + +static inline unsigned long create_zero_mask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +/* The mask we created is directly usable as a bytemask */ +#define zero_bytemask(mask) (mask) + +static inline unsigned long find_zero(unsigned long mask) { - return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80); + return count_masked_bytes(mask); } /* -- cgit v1.2.1 From 5723aa993d83803157c22327e90cd59e3dcbe879 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 11:09:53 -0700 Subject: x86: use the new generic strnlen_user() function This throws away the old x86-specific functions in favor of the generic optimized version. 
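For reference, the contract the generic routine keeps for its callers can be sketched in plain C. This captures the semantics only, not the optimized word-at-a-time implementation, and the fault case has no user-space analogue:

	/*
	 * Returns the string length INCLUDING the terminating NUL,
	 * a value greater than n if no NUL occurs in the first n bytes,
	 * and (in the kernel) 0 if the user pointer faults.
	 */
	static long strnlen_semantics(const char *s, long n)
	{
		long res;

		for (res = 0; res < n; res++)
			if (!s[res])
				return res + 1;
		return n + 1;
	}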
Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 3 +++ arch/x86/include/asm/uaccess_32.h | 17 -------------- arch/x86/include/asm/uaccess_64.h | 3 --- arch/x86/lib/usercopy_32.c | 41 --------------------------------- arch/x86/lib/usercopy_64.c | 48 --------------------------------------- 6 files changed, 4 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3220d44e24d0..d700811785ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,6 +94,7 @@ config X86 select GENERIC_TIME_VSYSCALL if X86_64 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1354facd8f63..04cd6882308e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -566,6 +566,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); +extern __must_check long strlen_user(const char __user *str); +extern __must_check long strnlen_user(const char __user *str, long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 8084bc73b18c..576e39bca6ad 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,23 +213,6 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -/** - * strlen_user: - Get the size of a string in user space. - * @str: The string to measure. - * - * Context: User context only. This function may sleep. - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. - * On exception, returns 0. - * - * If there is a limit on the length of a valid string, you may wish to - * consider using strnlen_user() instead. - */ -#define strlen_user(str) strnlen_user(str, LONG_MAX) - -long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index fcd4b6f3ef02..8e796fbbf9c6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,9 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check long strnlen_user(const char __user *str, long n); -__must_check long __strnlen_user(const char __user *str, long n); -__must_check long strlen_user(const char __user *str); __must_check unsigned long clear_user(void __user *mem, unsigned long len); __must_check unsigned long __clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 883b216c60b2..1781b2f950e2 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -95,47 +95,6 @@ __clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(__clear_user); -/** - * strnlen_user: - Get the size of a string in user space. - * @s: The string to measure. - * @n: The maximum valid length - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. 
- * On exception, returns 0. - * If the string is too long, returns a value greater than @n. - */ -long strnlen_user(const char __user *s, long n) -{ - unsigned long mask = -__addr_ok(s); - unsigned long res, tmp; - - might_fault(); - - __asm__ __volatile__( - " testl %0, %0\n" - " jz 3f\n" - " andl %0,%%ecx\n" - "0: repne; scasb\n" - " setne %%al\n" - " subl %%ecx,%0\n" - " addl %0,%%eax\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: xorl %%eax,%%eax\n" - " jmp 1b\n" - "3: movb $1,%%al\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp) - :"0" (n), "1" (s), "2" (0), "3" (mask) - :"cc"); - return res & mask; -} -EXPORT_SYMBOL(strnlen_user); - #ifdef CONFIG_X86_INTEL_USERCOPY static unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0d0326f388c0..e5b130bc2d0e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -52,54 +52,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ - -long __strnlen_user(const char __user *s, long n) -{ - long res = 0; - char c; - - while (1) { - if (res>n) - return n+1; - if (__get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(__strnlen_user); - -long strnlen_user(const char __user *s, long n) -{ - if (!access_ok(VERIFY_READ, s, 1)) - return 0; - return __strnlen_user(s, n); -} -EXPORT_SYMBOL(strnlen_user); - -long strlen_user(const char __user *s) -{ - long res = 0; - char c; - - for (;;) { - if (get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(strlen_user); - unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) { if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { -- cgit v1.2.1 From 91eb0f67c38c7104766faa49c5aaee2b4876511e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:28 -0700 Subject: x86: print e820 physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. For example: -BIOS-provided physical RAM map: +e820: BIOS-provided physical RAM map: - BIOS-e820: 0000000000000100 - 000000000009e000 (usable) +BIOS-e820: [mem 0x0000000000000100-0x000000000009dfff] usable -Allocating PCI resources starting at 90000000 (gap: 90000000:6ed1c000) +e820: [mem 0x90000000-0xfed1bfff] available for PCI devices -reserve RAM buffer: 000000000009e000 - 000000000009ffff +e820: reserve RAM buffer [mem 0x0009e000-0x0009ffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 62d61e9976eb..41857970517f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, int x = e820x->nr_map; if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); return; } @@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type) switch (type) { case E820_RAM: case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)"); + printk(KERN_CONT "usable"); break; case E820_RESERVED: - printk(KERN_CONT "(reserved)"); + printk(KERN_CONT "reserved"); break; case E820_ACPI: - printk(KERN_CONT "(ACPI data)"); + printk(KERN_CONT "ACPI data"); break; case E820_NVS: - printk(KERN_CONT "(ACPI NVS)"); + printk(KERN_CONT "ACPI NVS"); break; case E820_UNUSABLE: - printk(KERN_CONT "(unusable)"); + printk(KERN_CONT "unusable"); break; default: printk(KERN_CONT "type %u", type); @@ -158,10 +160,10 @@ void __init e820_print_map(char *who) int i; for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); + (e820.map[i].addr + e820.map[i].size - 1)); e820_print_type(e820.map[i].type); printk(KERN_CONT "\n"); } @@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); e820_print_type(old_type); printk(KERN_CONT " ==> "); e820_print_type(new_type); @@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); if (checktype) e820_print_type(old_type); printk(KERN_CONT "\n"); @@ -567,7 +567,7 @@ void __init update_e820(void) if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) return; e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); + printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) @@ -637,8 +637,8 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR - "PCI: Warning: Cannot find a gap in the 32bit address range\n" - "PCI: Unassigned devices with 32bit resource registers may break!\n"); + "e820: cannot find a gap in the 32bit address range\n" + "e820: PCI devices with unassigned 32bit BARs may break!\n"); } #endif @@ -648,8 +648,8 @@ __init void e820_setup_gap(void) pci_mem_start = gapstart; printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + "e820: [mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); } /** @@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - printk(KERN_INFO "extended physical RAM map:\n"); + printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } @@ -734,7 +734,7 @@ u64 __init 
early_reserve_e820(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); update_e820_saved(); } @@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } @@ -888,7 +888,7 @@ void __init finish_e820_parsing(void) early_panic("Invalid user supplied memory map"); e820.nr_map = nr; - printk(KERN_INFO "user-defined physical RAM map:\n"); + printk(KERN_INFO "e820: user-defined physical RAM map:\n"); e820_print_map("user"); } } @@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", - start, end); + printk(KERN_DEBUG + "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } @@ -1047,7 +1048,7 @@ void __init setup_memory_map(void) who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } -- cgit v1.2.1 From 365811d6f9bd98543bedc02b72d94f0f0faf3670 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:29 -0700 Subject: x86: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. For example: -found SMP MP-table at [ffff8800000fce90] fce90 +found SMP MP-table at [mem 0x000fce90-0x000fce9f] mapped at [ffff8800000fce90] -initial memory mapped : 0 - 20000000 +initial memory mapped: [mem 0x00000000-0x1fffffff] -Base memory trampoline at [ffff88000009c000] 9c000 size 8192 +Base memory trampoline [mem 0x0009c000-0x0009dfff] mapped at [ffff88000009c000] -SRAT: Node 0 PXM 0 0-80000000 +SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/mpparse.c | 10 ++++++---- arch/x86/kernel/setup.c | 16 ++++++++-------- arch/x86/mm/init.c | 16 +++++++++------- arch/x86/mm/numa.c | 32 ++++++++++++++++---------------- arch/x86/mm/numa_emulation.c | 4 ++-- arch/x86/mm/pat.c | 42 +++++++++++++++++++----------------------- arch/x86/mm/srat.c | 5 +++-- 7 files changed, 63 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b02d4dd6b8a3..fbca2e6223bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -568,8 +568,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; unsigned long mem; - apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", - bp, length); + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -584,8 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #endif mpf_found = mpf; - printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", - mpf, (u64)virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); mem = virt_to_phys(mpf); memblock_reserve(mem, sizeof(*mpf)); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f2afee6a19c1..982e44f960db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -334,8 +334,8 @@ static void __init relocate_initrd(void) memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; - printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", - ramdisk_here, ramdisk_here + ramdisk_size); + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + ramdisk_here, ramdisk_here + ramdisk_size - 1); q = (char *)initrd_start; @@ -366,8 +366,8 @@ static void __init relocate_initrd(void) /* high pages is not converted by early_res_to_bootmem */ ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; - printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" - " %08llx - %08llx\n", + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } @@ -392,8 +392,8 @@ static void __init reserve_initrd(void) ramdisk_size, end_of_lowmem>>1); } - printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, - ramdisk_end); + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); if (ramdisk_end <= end_of_lowmem) { @@ -906,8 +906,8 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif - printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", - max_pfn_mapped<> PAGE_SHIFT); - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", + end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_top << PAGE_SHIFT) - 1); } void __init native_pagetable_reserve(u64 start, u64 end) @@ -132,7 +133,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, int 
nr_range, i; int use_pse, use_gbpages; - printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* @@ -251,8 +253,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, + printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + mr[i].start, mr[i].end - 1, (mr[i].page_size_mask & (1<<PG_LEVEL_1G)) ? "1G" : ((mr[i].page_size_mask & (1<<PG_LEVEL_2M)) ? "big page" : "4k")); - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, end); + printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 19d3fa08b119..2d125be1bae9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, /* whine about and ignore invalid blks */ if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); + pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); return 0; } @@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", - nid, start, end); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); /* * Allocate node data. Try remap allocator first, node-local @@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) } /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (!remapped && tnid != nid) @@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->end > bj->start && bi->start < bj->end) { if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); + pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); return -EINVAL; } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); + pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); } /* @@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } if (k < mi->nr_blks) continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); bi->start = start; bi->end = end; numa_remove_memblk_from(j--, mi); @@ -616,8 +616,8 @@ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ?
"NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", - 0LLU, PFN_PHYS(max_pfn)); + printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 871dd8868170..dbbbb47260cc 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, numa_remove_memblk_from(phys_blk, pi); } - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - eb->start, eb->end, (eb->end - eb->start) >> 20); + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); return 0; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f6ff57b7efa5..f11729fd019c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -209,9 +209,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed " - "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", - start, end, type, req_type); + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -314,9 +313,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -325,8 +324,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? 
cattr_name(*new_type) : "-"); return err; @@ -360,14 +359,14 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } kfree(entry); - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } @@ -491,9 +490,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -554,12 +552,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - base, (unsigned long long)(base + size)); + base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; @@ -591,12 +588,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags = lookup_memtype(paddr); if (want_flags != flags) { - printk(KERN_WARNING - "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | @@ -614,11 +610,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", + " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); return -EINVAL; } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index efb5b4b93711..732af3a96183 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,8 +176,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - start, end); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1); } void __init acpi_numa_arch_fixup(void) {} -- cgit v1.2.1 From 26c191788f18129af0eb32a358cdaea0c7479626 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 29 May 2012 15:06:49 -0700 Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race condition When holding the mmap_sem for reading, pmd_offset_map_lock should only run on a pmd_t that has been read atomically from the pmdp pointer, otherwise we may read only half of it leading to this crash. 
PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic" #0 [f06a9dd8] crash_kexec at c049b5ec #1 [f06a9e2c] oops_end at c083d1c2 #2 [f06a9e40] no_context at c0433ded #3 [f06a9e64] bad_area_nosemaphore at c043401a #4 [f06a9e6c] __do_page_fault at c0434493 #5 [f06a9eec] do_page_fault at c083eb45 #6 [f06a9f04] error_code (via page_fault) at c083c5d5 EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP: 00000000 DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0 CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246 #7 [f06a9f38] _spin_lock at c083bc14 #8 [f06a9f44] sys_mincore at c0507b7d #9 [f06a9fb0] system_call at c083becd start len EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00 SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033 CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286 This appears to be a longstanding bug affecting x86 32bit PAE without THP. Only architectures with a 64bit pmd_t and a 32bit unsigned long are affected. With THP enabled, the barrier() in pmd_none_or_trans_huge_or_clear_bad() would partly hide the bug when the pmd transitions from none to stable, by forcing a re-read of the *pmd in pmd_offset_map_lock; but with THP enabled a new set of problems also arises, because the pmd can then transition freely between the none, pmd_trans_huge, and stable states. So making the barrier in pmd_none_or_trans_huge_or_clear_bad() unconditional isn't a good idea and would be a flaky solution. This is fully fixed by introducing a pmd_read_atomic() that reads the two halves of the pmd in a safe order with THP disabled, or reads the pmd atomically with cmpxchg8b with THP enabled. Luckily this new race condition only triggers in the places that must already be covered by pmd_none_or_trans_huge_or_clear_bad(), so the fix is localized there, but the bug itself is not related to THP. NOTE: this can only trigger on x86 32bit systems with PAE enabled and more than 4G of RAM; otherwise the high part of the pmd is zero at all times, so the truncation is harmless and the SMP race stays hidden. This bug was discovered and fully debugged by Ulrich, quote: ---- [..] pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and eax. 496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) 497 { 498 /* depend on compiler for an atomic pmd read */ 499 pmd_t pmdval = *pmd; // edi = pmd pointer 0xc0507a74: mov 0x8(%esp),%edi ... // edx = PTE page table high address 0xc0507a84: mov 0x4(%edi),%edx ... // eax = PTE page table low address 0xc0507a8e: mov (%edi),%eax [..] Please note that the PMD is not read atomically. These are two "mov" instructions where the high order bits of the PMD entry are fetched first. Hence, the above machine code is prone to the following race. - The PMD entry {high|low} is 0x0000000000000000. The "mov" at 0xc0507a84 loads 0x00000000 into edx. - A page fault (on another CPU) sneaks in between the two "mov" instructions and instantiates the PMD. - The PMD entry {high|low} is now 0x00000003fda38067. The "mov" at 0xc0507a8e loads 0xfda38067 into eax. ----
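To see the failure mode outside the kernel, here is a small stand-alone user-space sketch; it is an illustration only (the file name, the GCC __atomic builtins, and the flip-flopping publisher thread are assumptions of the sketch, not part of the patch). It mimics the two-mov dereference Ulrich describes above, next to the low-half-first read order the patch below adopts for the !THP case:

/* torn_pmd_read.c - user-space illustration of the pmd tearing race.
 * Build: gcc -O2 -pthread torn_pmd_read.c  (on i386 you may also need
 * -latomic for the 64-bit __atomic_store_n).
 *
 * A publisher thread flips a 64-bit value between 0 (a none pmd) and
 * the pmd value from Ulrich's trace, using a single atomic 64-bit
 * store (the kernel side uses set_64bit via pmd_populate).  The buggy
 * reader mimics gcc's "*pmdp" dereference: two independent 32-bit
 * loads, high half first.  It quickly observes mixed halves.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define PMD_NONE      0x0000000000000000ULL
#define PMD_POPULATED 0x00000003fda38067ULL	/* value from the trace */

static uint64_t pmd;	/* stands in for a 64-bit pmd_t on 32-bit PAE */

/* Buggy read: high word first, then low word, with no atomicity. */
static uint64_t pmd_read_torn(void)
{
	uint32_t *p = (uint32_t *)&pmd;
	uint64_t hi = __atomic_load_n(p + 1, __ATOMIC_RELAXED);
	uint64_t lo = __atomic_load_n(p, __ATOMIC_RELAXED);

	return (hi << 32) | lo;
}

/* The !THP pmd_read_atomic() idea: read the low half first and look
 * at the high half only when the low half is non-null.  In the kernel
 * this is safe because, without THP, a pmd seen under mmap_sem held
 * for reading only ever goes from none to populated, never back.
 */
static uint64_t pmd_read_atomic_sketch(void)
{
	uint32_t *p = (uint32_t *)&pmd;
	uint64_t ret = __atomic_load_n(p, __ATOMIC_RELAXED);

	if (ret) {
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* smp_rmb() */
		ret |= (uint64_t)__atomic_load_n(p + 1, __ATOMIC_RELAXED) << 32;
	}
	return ret;
}

static void *publisher(void *arg)
{
	for (;;) {
		__atomic_store_n(&pmd, PMD_NONE, __ATOMIC_RELAXED);
		__atomic_store_n(&pmd, PMD_POPULATED, __ATOMIC_RELAXED);
	}
	return arg;
}

int main(void)
{
	pthread_t t;

	(void)pmd_read_atomic_sketch();	/* returns 0 while pmd is none */
	pthread_create(&t, NULL, publisher, NULL);
	for (;;) {
		uint64_t v = pmd_read_torn();

		if (v != PMD_NONE && v != PMD_POPULATED) {
			printf("torn pmd observed: %#llx\n",
			       (unsigned long long)v);
			return 0;
		}
	}
}

Note that pmd_read_atomic_sketch() is only safe under the kernel's !THP invariant of a single none-to-populated transition; the publisher above deliberately violates that invariant to make the torn read easy to catch, which is also why the THP configuration in the patch below must use a genuinely atomic cmpxchg8b-based read (atomic64_read) instead.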
Reported-by: Ulrich Obergfell Signed-off-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: Larry Woodman Cc: Petr Matousek Cc: Rik van Riel Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index effff47a3c82..43876f16caf1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) ptep->pte_low = pte.pte_low; } +#define pmd_read_atomic pmd_read_atomic +/* + * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by gcc. Problem is, in certain places + * where pte_offset_map_lock is called, concurrent page faults are + * allowed, if the mmap_sem is held for reading. An example is mincore + * vs page faults vs MADV_DONTNEED. On the page fault side + * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen + * because gcc will not read the 64bit of the pmd atomically. To fix + * this, all places running pmd_offset_map_lock() while holding the + * mmap_sem in read mode shall read the pmdp pointer using this + * function to know if the pmd is null or not, and in turn to know if + * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * operations. + * + * Without THP, if the mmap_sem is held for reading, the + * pmd can only transition from null to not null while pmd_read_atomic runs. + * So there's no need to literally read it atomically. + * + * With THP, if the mmap_sem is held for reading, the pmd can become + * THP or null or point to a pte (and in turn become "stable") at any + * time under pmd_read_atomic, so it's mandatory to read it atomically + * with cmpxchg8b. + */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +{ + pmdval_t ret; + u32 *tmp = (u32 *)pmdp; + + ret = (pmdval_t) (*tmp); + if (ret) { + /* + * If the low part is null, we must not read the high part + * or we can end up with a partial pmd. + */ + smp_rmb(); + ret |= ((pmdval_t)*(tmp + 1)) << 32; + } + + return (pmd_t) { ret }; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +{ + return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); -- cgit v1.2.1 From 319b6ffc6df892e4ccffff823cc5521a4a5d2dca Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 30 May 2012 12:33:41 +0300 Subject: x86, realmode: Unbreak the ia64 build of drivers/acpi/sleep.c Revert the usage of acpi_wakeup_address and move its definition into x86 architecture code in order to make compilation work on ia64. [jsakkine: tested compilation in ia64/x86-64 and added proper commit message] Reported-by: Paul Gortmaker Originally-by: H. Peter Anvin Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1338370421-27735-1-git-send-email-jarkko.sakkinen@intel.com Cc: Tony Luck Cc: Len Brown Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/acpi.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 724aa441de7d..0c44630d1789 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,6 +29,7 @@ #include <asm/processor.h> #include <asm/mmu.h> #include <asm/mpspec.h> +#include <asm/realmode.h> #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -116,10 +117,8 @@ static inline void acpi_disable_pci(void) /* Low-level suspend routine. */ extern int acpi_suspend_lowlevel(void); -extern const unsigned char acpi_wakeup_code[]; - -/* early initialization routine */ -extern void acpi_reserve_wakeup_memory(void); +/* Physical address to resume after wakeup */ +#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start)) /* * Check if the CPU can handle C2 and deeper -- cgit v1.2.1
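One editorial note on the printk conversions above: they all funnel an exclusive (start, size) pair into the inclusive [mem %#010llx-%#010llx] form, printing start + size - 1 rather than start + size. A minimal stand-alone C sketch of that convention follows (the helper name and example values are hypothetical, not kernel code):

/* mem_range_fmt.c - the inclusive "[mem %#010llx-%#010llx]" convention
 * used throughout this series: the last printed address is
 * start + size - 1, never the exclusive end start + size.
 */
#include <limits.h>
#include <stdio.h>

/* Hypothetical helper mirroring the printk pattern in the diffs above. */
static void print_mem_range(const char *tag, unsigned long long start,
			    unsigned long long size)
{
	if (!size)
		return;			/* an empty range has no last byte */
	if (start + size < start)	/* clamp as __e820_update_range does */
		size = ULLONG_MAX - start;
	printf("%s: [mem %#010llx-%#010llx]\n", tag, start, start + size - 1);
}

int main(void)
{
	print_mem_range("e820", 0x100000, 0x100000);	/* 1 MiB at 1 MiB:
							   0x00100000-0x001fffff */
	print_mem_range("RAMDISK", 0x37a00000, 0x800000);
	return 0;
}

The overflow clamp mirrors the guard visible in __e820_update_range and e820_remove_range earlier in the series; the zero-size early return is a choice of this sketch.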