From 3780578761921f094179c6289072a74b2228c602 Mon Sep 17 00:00:00 2001
From: Rob Landley <rob@landley.net>
Date: Sat, 20 May 2017 15:03:29 -0500
Subject: x86/boot: Use CROSS_COMPILE prefix for readelf

The boot code Makefile contains a straight 'readelf' invocation. This
causes build warnings in cross compile environments, when there is no
unprefixed readelf accessible via $PATH.

Add the missing $(CROSS_COMPILE) prefix.

[ tglx: Rewrote changelog ]

Fixes: 98f78525371b ("x86/boot: Refuse to build with data relocations")
Signed-off-by: Rob Landley <rob@landley.net>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Paul Bolle <pebolle@tiscali.nl>
Cc: "H.J. Lu" <hjl.tools@gmail.com>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/ced18878-693a-9576-a024-113ef39a22c0@landley.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/boot/compressed/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 44163e8c3868..2c860ad4fe06 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
 quiet_cmd_check_data_rel = DATAREL $@
 define cmd_check_data_rel
 	for obj in $(filter %.o,$^); do \
-		readelf -S $$obj | grep -qF .rel.local && { \
+		${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \
 			echo "error: $$obj has data relocations!" >&2; \
 			exit 1; \
 		} || true; \
-- 
cgit v1.2.1


From 2d1f406139ec20320bf38bcd2461aa8e358084b5 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 19 May 2017 11:39:09 +0200
Subject: x86/MCE: Export memory_error()

Export the function which checks whether an MCE is a memory error to
other users so that we can reuse the logic. Drop the boot_cpu_data use,
while at it, as mce.cpuvendor already has the CPU vendor in there.

Integrate a piece from a patch from Vishal Verma
<vishal.l.verma@intel.com> to export it for modules (nfit).

The main reason we're exporting it is that the nfit handler
nfit_handle_mce() needs to detect a memory error properly before doing
its recovery actions.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170519093915.15413-2-bp@alien8.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/mce.h       |  1 +
 arch/x86/kernel/cpu/mcheck/mce.c | 13 ++++++-------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4fd5195deed0..3f9a3d2a5209 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -266,6 +266,7 @@ static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *s
 #endif
 
 int mce_available(struct cpuinfo_x86 *c);
+bool mce_is_memory_error(struct mce *m);
 
 DECLARE_PER_CPU(unsigned, mce_exception_count);
 DECLARE_PER_CPU(unsigned, mce_poll_count);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5abd4bf73d6e..5cfbaeb6529a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m)
 	return 1;
 }
 
-static bool memory_error(struct mce *m)
+bool mce_is_memory_error(struct mce *m)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	if (c->x86_vendor == X86_VENDOR_AMD) {
+	if (m->cpuvendor == X86_VENDOR_AMD) {
 		/* ErrCodeExt[20:16] */
 		u8 xec = (m->status >> 16) & 0x1f;
 
 		return (xec == 0x0 || xec == 0x8);
-	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
+	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
 		/*
 		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 		 *
@@ -529,6 +527,7 @@ static bool memory_error(struct mce *m)
 
 	return false;
 }
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
 
 static bool cec_add_mce(struct mce *m)
 {
@@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m)
 		return false;
 
 	/* We eat only correctable DRAM errors with usable addresses. */
-	if (memory_error(m) &&
+	if (mce_is_memory_error(m) &&
 	    !(m->status & MCI_STATUS_UC) &&
 	    mce_usable_address(m))
 		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
@@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 
-		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+		if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
 			if (m.status & MCI_STATUS_ADDRV)
 				m.severity = severity;
 
-- 
cgit v1.2.1


From ebd574994c63164d538a197172157318f58ac647 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 23 May 2017 10:37:29 -0500
Subject: Revert "x86/entry: Fix the end of the stack for newly forked tasks"

Petr Mladek reported the following warning when loading the livepatch
sample module:

  WARNING: CPU: 1 PID: 3699 at arch/x86/kernel/stacktrace.c:132 save_stack_trace_tsk_reliable+0x133/0x1a0
  ...
  Call Trace:
   __schedule+0x273/0x820
   schedule+0x36/0x80
   kthreadd+0x305/0x310
   ? kthread_create_on_cpu+0x80/0x80
   ? icmp_echo.part.32+0x50/0x50
   ret_from_fork+0x2c/0x40

That warning means the end of the stack is no longer recognized as such
for newly forked tasks.  The problem was introduced with the following
commit:

  ff3f7e2475bb ("x86/entry: Fix the end of the stack for newly forked tasks")

... which was completely misguided.  It only partially fixed the
reported issue, and it introduced another bug in the process.  None of
the other entry code saves the frame pointer before calling into C code,
so it doesn't make sense for ret_from_fork to do so either.

Contrary to what I originally thought, the original issue wasn't related
to newly forked tasks.  It was actually related to ftrace.  When entry
code calls into a function which then calls into an ftrace handler, the
stack frame looks different than normal.

The original issue will be fixed in the unwinder, in a subsequent patch.

Reported-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: live-patching@vger.kernel.org
Fixes: ff3f7e2475bb ("x86/entry: Fix the end of the stack for newly forked tasks")
Link: http://lkml.kernel.org/r/f350760f7e82f0750c8d1dd093456eb212751caa.1495553739.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_32.S | 30 +++++++++++++++++++-----------
 arch/x86/entry/entry_64.S | 11 ++++-------
 2 files changed, 23 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 50bc26949e9e..48ef7bb32c42 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -251,6 +251,23 @@ ENTRY(__switch_to_asm)
 	jmp	__switch_to
 END(__switch_to_asm)
 
+/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
+ * Calling schedule_tail() directly would break that convention because its an
+ * asmlinkage function so its argument has to be pushed on the stack.  This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+ENTRY(schedule_tail_wrapper)
+	FRAME_BEGIN
+
+	pushl	%eax
+	call	schedule_tail
+	popl	%eax
+
+	FRAME_END
+	ret
+ENDPROC(schedule_tail_wrapper)
 /*
  * A newly forked process directly context switches into this address.
  *
@@ -259,24 +276,15 @@ END(__switch_to_asm)
  * edi: kernel thread arg
  */
 ENTRY(ret_from_fork)
-	FRAME_BEGIN		/* help unwinder find end of stack */
-
-	/*
-	 * schedule_tail() is asmlinkage so we have to put its 'prev' argument
-	 * on the stack.
-	 */
-	pushl	%eax
-	call	schedule_tail
-	popl	%eax
+	call	schedule_tail_wrapper
 
 	testl	%ebx, %ebx
 	jnz	1f		/* kernel threads are uncommon */
 
 2:
 	/* When we fork, we trace the syscall return in the child, too. */
-	leal	FRAME_OFFSET(%esp), %eax
+	movl    %esp, %eax
 	call    syscall_return_slowpath
-	FRAME_END
 	jmp     restore_all
 
 	/* kernel thread */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..4a4c0834f965 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,7 +36,6 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
-#include <asm/frame.h>
 #include <linux/err.h>
 
 .code64
@@ -406,19 +405,17 @@ END(__switch_to_asm)
  * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
-	FRAME_BEGIN			/* help unwinder find end of stack */
 	movq	%rax, %rdi
-	call	schedule_tail		/* rdi: 'prev' task parameter */
+	call	schedule_tail			/* rdi: 'prev' task parameter */
 
-	testq	%rbx, %rbx		/* from kernel_thread? */
-	jnz	1f			/* kernel threads are uncommon */
+	testq	%rbx, %rbx			/* from kernel_thread? */
+	jnz	1f				/* kernel threads are uncommon */
 
 2:
-	leaq	FRAME_OFFSET(%rsp),%rdi	/* pt_regs pointer */
+	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
 	SWAPGS
-	FRAME_END
 	jmp	restore_regs_and_iret
 
 1:
-- 
cgit v1.2.1


From 519fb5c3350d1b5225b27b1cac55144f79351718 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 23 May 2017 10:37:30 -0500
Subject: x86/unwind: Add end-of-stack check for ftrace handlers

Dave Jones and Steven Rostedt reported unwinder warnings like the
following:

  WARNING: kernel stack frame pointer at ffff8800bda0ff30 in sshd:1090 has bad value 000055b32abf1fa8

In both cases, the unwinder was attempting to unwind from an ftrace
handler into entry code.  The callchain was something like:

  syscall entry code
    C function
      ftrace handler
        save_stack_trace()

The problem is that the unwinder's end-of-stack logic gets confused by
the way ftrace lays out the stack frame (with fentry enabled).

I was able to recreate this warning with:

  echo call_usermodehelper_exec_async:stacktrace > /sys/kernel/debug/tracing/set_ftrace_filter
  (exit login session)

I considered fixing this by changing the ftrace code to rewrite the
stack to make the unwinder happy.  But that seemed too intrusive after I
implemented it.  Instead, just add another check to the unwinder's
end-of-stack logic to detect this special case.

Side note: We could probably get rid of these end-of-stack checks by
encoding the frame pointer for syscall entry just like we do for
interrupt entry.  That would be simpler, but it would also be a lot more
intrusive since it would slightly affect the performance of every
syscall.

Reported-by: Dave Jones <davej@codemonkey.org.uk>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: live-patching@vger.kernel.org
Fixes: c32c47c68a0a ("x86/unwind: Warn on bad frame pointer")
Link: http://lkml.kernel.org/r/671ba22fbc0156b8f7e0cfa5ab2a795e08bc37e1.1495553739.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/unwind_frame.c | 49 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 82c6d7f1fd73..b9389d72b2f7 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state)
 	return (unsigned long *)task_pt_regs(state->task) - 2;
 }
 
+static bool is_last_frame(struct unwind_state *state)
+{
+	return state->bp == last_frame(state);
+}
+
 #ifdef CONFIG_X86_32
 #define GCC_REALIGN_WORDS 3
 #else
@@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state)
 	return last_frame(state) - GCC_REALIGN_WORDS;
 }
 
-static bool is_last_task_frame(struct unwind_state *state)
+static bool is_last_aligned_frame(struct unwind_state *state)
 {
 	unsigned long *last_bp = last_frame(state);
 	unsigned long *aligned_bp = last_aligned_frame(state);
 
 	/*
-	 * We have to check for the last task frame at two different locations
-	 * because gcc can occasionally decide to realign the stack pointer and
-	 * change the offset of the stack frame in the prologue of a function
-	 * called by head/entry code.  Examples:
+	 * GCC can occasionally decide to realign the stack pointer and change
+	 * the offset of the stack frame in the prologue of a function called
+	 * by head/entry code.  Examples:
 	 *
 	 * <start_secondary>:
 	 *      push   %edi
@@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state)
 	 *      push   %rbp
 	 *      mov    %rsp,%rbp
 	 *
-	 * Note that after aligning the stack, it pushes a duplicate copy of
-	 * the return address before pushing the frame pointer.
+	 * After aligning the stack, it pushes a duplicate copy of the return
+	 * address before pushing the frame pointer.
+	 */
+	return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+	unsigned long *last_bp = last_frame(state);
+	unsigned long *last_ftrace_bp = last_bp - 3;
+
+	/*
+	 * When unwinding from an ftrace handler of a function called by entry
+	 * code, the stack layout of the last frame is:
+	 *
+	 *   bp
+	 *   parent ret addr
+	 *   bp
+	 *   function ret addr
+	 *   parent ret addr
+	 *   pt_regs
+	 *   -----------------
 	 */
-	return (state->bp == last_bp ||
-		(state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+	return (state->bp == last_ftrace_bp &&
+		*state->bp == *(state->bp + 2) &&
+		*(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+	return is_last_frame(state) || is_last_aligned_frame(state) ||
+	       is_last_ftrace_frame(state);
 }
 
 /*
-- 
cgit v1.2.1


From 7e6091209f7f73e2a81943020793b5ad26d645c6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 May 2017 18:27:54 +0200
Subject: x86/build: Permit building with old make versions

At least Make 3.82 dislikes the tab in front of the $(warning) function:

  arch/x86/Makefile:162: *** recipe commences before first target.  Stop.

Let's be gentle.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1944fcd8-e3df-d1f7-c0e4-60aeb1917a24@siemens.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5851411e60fb..bf240b920473 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -159,7 +159,7 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	# If '-Os' is enabled, disable it and print a warning.
         ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
           undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
-	  $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE.  Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+          $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE.  Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
         endif
 
     endif
-- 
cgit v1.2.1


From c9525a3fab63fbe091007494f8b7a06438eea6a7 Mon Sep 17 00:00:00 2001
From: Benjamin Peterson <bp@benjamin.pe>
Date: Sat, 20 May 2017 17:20:16 -0700
Subject: x86/watchdog: Fix Kconfig help text file path reference to lockup
 watchdog documentation

Signed-off-by: Benjamin Peterson <bp@benjamin.pe>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: 9919cba7ff71147803c988521cc1ceb80e7f0f6d ("watchdog: Update documentation")
Link: http://lkml.kernel.org/r/20170521002016.13258-1-bp@benjamin.pe
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd18994a9555..4ccfacc7232a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -360,7 +360,7 @@ config SMP
 	  Management" code will be disabled if you say Y here.
 
 	  See also <file:Documentation/x86/i386/IO-APIC.txt>,
-	  <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at
+	  <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
 	  <http://www.tldp.org/docs.html#howto>.
 
 	  If you don't know what to do here, say N.
-- 
cgit v1.2.1


From cbed27cdf0e3f7ea3b2259e86b9e34df02be3fe4 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 18 Apr 2017 15:07:11 -0400
Subject: x86/PAT: Fix Xorg regression on CPUs that don't support PAT

In the file arch/x86/mm/pat.c, there's a '__pat_enabled' variable. The
variable is set to 1 by default and the function pat_init() sets
__pat_enabled to 0 if the CPU doesn't support PAT.

However, on AMD K6-3 CPUs, the processor initialization code never calls
pat_init() and so __pat_enabled stays 1 and the function pat_enabled()
returns true, even though the K6-3 CPU doesn't support PAT.

The result of this bug is that a kernel warning is produced when attempting to
start the Xserver and the Xserver doesn't start (fork() returns ENOMEM).
Another symptom of this bug is that the framebuffer driver doesn't set the
K6-3 MTRR registers:

  x86/PAT: Xorg:3891 map pfn expected mapping type uncached-minus for [mem 0xe4000000-0xe5ffffff], got write-combining
  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 3891 at arch/x86/mm/pat.c:1020 untrack_pfn+0x5c/0x9f
  ...
  x86/PAT: Xorg:3891 map pfn expected mapping type uncached-minus for [mem 0xe4000000-0xe5ffffff], got write-combining

To fix the bug change pat_enabled() so that it returns true only if PAT
initialization was actually done.

Also, I changed boot_cpu_has(X86_FEATURE_PAT) to
this_cpu_has(X86_FEATURE_PAT) in pat_ap_init(), so that we check the PAT
feature on the processor that is being initialized.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: stable@vger.kernel.org # v4.2+
Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1704181501450.26399@file01.intranet.prod.int.rdu2.redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 9b78685b66e6..83a59a67757a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -65,9 +65,11 @@ static int __init nopat(char *str)
 }
 early_param("nopat", nopat);
 
+static bool __read_mostly __pat_initialized = false;
+
 bool pat_enabled(void)
 {
-	return !!__pat_enabled;
+	return __pat_initialized;
 }
 EXPORT_SYMBOL_GPL(pat_enabled);
 
@@ -225,13 +227,14 @@ static void pat_bsp_init(u64 pat)
 	}
 
 	wrmsrl(MSR_IA32_CR_PAT, pat);
+	__pat_initialized = true;
 
 	__init_cache_modes(pat);
 }
 
 static void pat_ap_init(u64 pat)
 {
-	if (!boot_cpu_has(X86_FEATURE_PAT)) {
+	if (!this_cpu_has(X86_FEATURE_PAT)) {
 		/*
 		 * If this happens we are on a secondary CPU, but switched to
 		 * PAT on the boot CPU. We have no way to undo PAT.
@@ -306,7 +309,7 @@ void pat_init(void)
 	u64 pat;
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
-	if (!pat_enabled()) {
+	if (!__pat_enabled) {
 		init_cache_modes();
 		return;
 	}
-- 
cgit v1.2.1


From fc152d22d6e9fac95a9a990e6c29510bdf1b9425 Mon Sep 17 00:00:00 2001
From: Mateusz Jurczyk <mjurczyk@google.com>
Date: Wed, 24 May 2017 15:55:00 +0200
Subject: x86/alternatives: Prevent uninitialized stack byte read in
 apply_alternatives()

In the current form of the code, if a->replacementlen is 0, the reference
to *insnbuf for comparison touches potentially garbage memory. While it
doesn't affect the execution flow due to the subsequent a->replacementlen
comparison, it is (rightly) detected as use of uninitialized memory by a
runtime instrumentation currently under my development, and could be
detected as such by other tools in the future, too (e.g. KMSAN).

Fix the "false-positive" by reordering the conditions to first check the
replacement instruction length before referencing specific opcode bytes.

Signed-off-by: Mateusz Jurczyk <mjurczyk@google.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/20170524135500.27223-1-mjurczyk@google.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/alternative.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c5b8f760473c..32e14d137416 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 		memcpy(insnbuf, replacement, a->replacementlen);
 		insnbuf_sz = a->replacementlen;
 
-		/* 0xe8 is a relative jump; fix the offset. */
-		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+		/*
+		 * 0xe8 is a relative jump; fix the offset.
+		 *
+		 * Instruction length is checked before the opcode to avoid
+		 * accessing uninitialized bytes for zero-length replacements.
+		 */
+		if (a->replacementlen == 5 && *insnbuf == 0xe8) {
 			*(s32 *)(insnbuf + 1) += replacement - instr;
 			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
 				*(s32 *)(insnbuf + 1),
-- 
cgit v1.2.1


From 702644ec1cab10ffefcebb4060d4da46d4ef2c7f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 24 May 2017 20:04:41 +0200
Subject: x86/timers: Move simple_udelay_calibration past
 init_hypervisor_platform

This ensures that adjustments to x86_platform done by the hypervisor
setup is already respected by this simple calibration.

The current user of this, introduced by 1b5aeebf3a92 ("x86/earlyprintk:
Add support for earlyprintk via USB3 debug port"), comes much later
into play.

Fixes: dd759d93f4dd ("x86/timers: Add simple udelay calibration")
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: http://lkml.kernel.org/r/5e89fe60-aab3-2c1c-aba8-32f8ad376189@siemens.com
---
 arch/x86/kernel/setup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0b4d3c686b1e..f81823695014 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	x86_configure_nx();
 
-	simple_udelay_calibration();
-
 	parse_early_param();
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	init_hypervisor_platform();
 
+	simple_udelay_calibration();
+
 	x86_init.resources.probe_roms();
 
 	/* after parse_early_param, so could debug it */
-- 
cgit v1.2.1


From 5acc1ca4fb15f00bfa3d4046e35ca381bc25d580 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Sat, 20 May 2017 20:32:32 -0700
Subject: KVM: X86: Fix preempt the preemption timer cancel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Preemption can occur during cancel preemption timer, and there will be
inconsistent status in lapic, vmx and vmcs field.

          CPU0                    CPU1

  preemption timer vmexit
  handle_preemption_timer(vCPU0)
    kvm_lapic_expired_hv_timer
      vmx_cancel_hv_timer
        vmx->hv_deadline_tsc = -1
        vmcs_clear_bits
        /* hv_timer_in_use still true */
  sched_out
                           sched_in
                           kvm_arch_vcpu_load
                             vmx_set_hv_timer
                               write vmx->hv_deadline_tsc
                               vmcs_set_bits
                           /* back in kvm_lapic_expired_hv_timer */
                           hv_timer_in_use = false
                           ...
                           vmx_vcpu_run
                             vmx_arm_hv_run
                               write preemption timer deadline
                             spurious preemption timer vmexit
                               handle_preemption_timer(vCPU0)
                                 kvm_lapic_expired_hv_timer
                                   WARN_ON(!apic->lapic_timer.hv_timer_in_use);

This can be reproduced sporadically during boot of L2 on a
preemptible L1, causing a splat on L1.

 WARNING: CPU: 3 PID: 1952 at arch/x86/kvm/lapic.c:1529 kvm_lapic_expired_hv_timer+0xb5/0xd0 [kvm]
 CPU: 3 PID: 1952 Comm: qemu-system-x86 Not tainted 4.12.0-rc1+ #24 RIP: 0010:kvm_lapic_expired_hv_timer+0xb5/0xd0 [kvm]
  Call Trace:
  handle_preemption_timer+0xe/0x20 [kvm_intel]
  vmx_handle_exit+0xc9/0x15f0 [kvm_intel]
  ? lock_acquire+0xdb/0x250
  ? lock_acquire+0xdb/0x250
  ? kvm_arch_vcpu_ioctl_run+0xdf3/0x1ce0 [kvm]
  kvm_arch_vcpu_ioctl_run+0xe55/0x1ce0 [kvm]
  kvm_vcpu_ioctl+0x384/0x7b0 [kvm]
  ? kvm_vcpu_ioctl+0x384/0x7b0 [kvm]
  ? __fget+0xf3/0x210
  do_vfs_ioctl+0xa4/0x700
  ? __fget+0x114/0x210
  SyS_ioctl+0x79/0x90
  do_syscall_64+0x8f/0x750
  ? trace_hardirqs_on_thunk+0x1a/0x1c
  entry_SYSCALL64_slow_path+0x25/0x25

This patch fixes it by disabling preemption while cancelling
preemption timer.  This way cancel_hv_timer is atomic with
respect to kvm_arch_vcpu_load.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c329d2894905..6e6f345adfe6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,8 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
 
 static void cancel_hv_timer(struct kvm_lapic *apic)
 {
+	preempt_disable();
 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
 	apic->lapic_timer.hv_timer_in_use = false;
+	preempt_enable();
 }
 
 static bool start_hv_timer(struct kvm_lapic *apic)
-- 
cgit v1.2.1


From e1d39b17e044e8ae819827810d87d809ba5f58c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= <jschoenh@amazon.de>
Date: Sat, 20 May 2017 13:22:56 +0200
Subject: KVM: nVMX: Fix handling of lmsw instruction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The decision whether or not to exit from L2 to L1 on an lmsw instruction is
based on bogus values: instead of using the information encoded within the
exit qualification, it uses the data also used for the mov-to-cr
instruction, which boils down to using whatever is in %eax at that point.

Use the correct values instead.

Without this fix, an L1 may not get notified when a 32-bit Linux L2
switches its secondary CPUs to protected mode; the L1 is only notified on
the next modification of CR0. This short time window poses a problem, when
there is some other reason to exit to L1 in between. Then, L2 will be
resumed in real mode and chaos ensues.

Signed-off-by: Jan H. Schönherr <jschoenh@amazon.de>
Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 72f78396bc09..880f371705bc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7913,11 +7913,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	int cr = exit_qualification & 15;
-	int reg = (exit_qualification >> 8) & 15;
-	unsigned long val = kvm_register_readl(vcpu, reg);
+	int reg;
+	unsigned long val;
 
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
+		reg = (exit_qualification >> 8) & 15;
+		val = kvm_register_readl(vcpu, reg);
 		switch (cr) {
 		case 0:
 			if (vmcs12->cr0_guest_host_mask &
@@ -7972,6 +7974,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
 		 * cr0. Other attempted changes are ignored, with no exit.
 		 */
+		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
 		if (vmcs12->cr0_guest_host_mask & 0xe &
 		    (val ^ vmcs12->cr0_read_shadow))
 			return true;
-- 
cgit v1.2.1


From 52b5419016997f2960e9c8b6584c4acb3875d126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= <jschoenh@amazon.de>
Date: Sat, 20 May 2017 13:24:32 +0200
Subject: KVM: x86: Fix virtual wire mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel SDM says, that at most one LAPIC should be configured with ExtINT
delivery. KVM configures all LAPICs this way. This causes pic_unlock()
to kick the first available vCPU from the internal KVM data structures.
If this vCPU is not the BSP, but some not-yet-booted AP, the BSP may
never realize that there is an interrupt.

Fix that by enabling ExtINT delivery only for the BSP.

This allows booting a Linux guest without a TSC in the above situation.
Otherwise the BSP gets stuck in calibrate_delay_converge().

Signed-off-by: Jan H. Schönherr <jschoenh@amazon.de>
Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6e6f345adfe6..d24c8742d9b0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1936,7 +1936,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
 		kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
 	apic_update_lvtt(apic);
-	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+	if (kvm_vcpu_is_reset_bsp(vcpu) &&
+	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
 		kvm_lapic_set_reg(apic, APIC_LVT0,
 			     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
-- 
cgit v1.2.1


From c93f5cf571e7795f97d49ef51b766cf25e328545 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 25 May 2017 19:38:17 +0900
Subject: kprobes/x86: Fix to set RWX bits correctly before releasing
 trampoline

Fix kprobes to set(recover) RWX bits correctly on trampoline
buffer before releasing it. Releasing readonly page to
module_memfree() crash the kernel.

Without this fix, if kprobes user register a bunch of kprobes
in function body (since kprobes on function entry usually
use ftrace) and unregister it, kernel hits a BUG and crash.

Link: http://lkml.kernel.org/r/149570868652.3518.14120169373590420503.stgit@devbox

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Fixes: d0381c81c2f7 ("kprobes/x86: Set kprobes pages read-only")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/kernel/kprobes/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 5b2bbfbb3712..6b877807598b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
 #include <linux/ftrace.h>
 #include <linux/frame.h>
 #include <linux/kasan.h>
+#include <linux/moduleloader.h>
 
 #include <asm/text-patching.h>
 #include <asm/cacheflush.h>
@@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn)
 	}
 }
 
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+	set_memory_nx((unsigned long)page & PAGE_MASK, 1);
+	set_memory_rw((unsigned long)page & PAGE_MASK, 1);
+	module_memfree(page);
+}
+
 static int arch_copy_kprobe(struct kprobe *p)
 {
 	struct insn insn;
-- 
cgit v1.2.1


From a53276e2826010338478ed94310874001a8097fa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 26 May 2017 10:14:11 -0400
Subject: x86/mm/ftrace: Do not bug in early boot on irqs_disabled in
 cpu_flush_range()

With function tracing starting in early bootup and having its trampoline
pages being read only, a bug triggered with the following:

kernel BUG at arch/x86/mm/pageattr.c:189!
invalid opcode: 0000 [#1] SMP
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 4.12.0-rc2-test+ #3
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
task: ffffffffb4222500 task.stack: ffffffffb4200000
RIP: 0010:change_page_attr_set_clr+0x269/0x302
RSP: 0000:ffffffffb4203c88 EFLAGS: 00010046
RAX: 0000000000000046 RBX: 0000000000000000 RCX: 00000001b6000000
RDX: ffffffffb4203d40 RSI: 0000000000000000 RDI: ffffffffb4240d60
RBP: ffffffffb4203d18 R08: 00000001b6000000 R09: 0000000000000001
R10: ffffffffb4203aa8 R11: 0000000000000003 R12: ffffffffc029b000
R13: ffffffffb4203d40 R14: 0000000000000001 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff9a639ea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff9a636b384000 CR3: 00000001ea21d000 CR4: 00000000000406b0
Call Trace:
 change_page_attr_clear+0x1f/0x21
 set_memory_ro+0x1e/0x20
 arch_ftrace_update_trampoline+0x207/0x21c
 ? ftrace_caller+0x64/0x64
 ? 0xffffffffc029b000
 ftrace_startup+0xf4/0x198
 register_ftrace_function+0x26/0x3c
 function_trace_init+0x5e/0x73
 tracer_init+0x1e/0x23
 tracing_set_tracer+0x127/0x15a
 register_tracer+0x19b/0x1bc
 init_function_trace+0x90/0x92
 early_trace_init+0x236/0x2b3
 start_kernel+0x200/0x3f5
 x86_64_start_reservations+0x29/0x2b
 x86_64_start_kernel+0x17c/0x18f
 secondary_startup_64+0x9f/0x9f
 ? secondary_startup_64+0x9f/0x9f

Interrupts should not be enabled at this early in the boot process. It is
also fine to leave interrupts enabled during this time as there's only one
CPU running, and on_each_cpu() means to only run on the current CPU.

If early_boot_irqs_disabled is set, it is safe to run cpu_flush_range() with
interrupts disabled. Don't trigger a BUG_ON() in that case.

Link: http://lkml.kernel.org/r/20170526093717.0be3b849@gandalf.local.home
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/mm/pageattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1dcd2be4cce4..c8520b2c62d2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -186,7 +186,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	unsigned int i, level;
 	unsigned long addr;
 
-	BUG_ON(irqs_disabled());
+	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 	WARN_ON(PAGE_ALIGN(start) != start);
 
 	on_each_cpu(__cpa_flush_range, NULL, 1);
-- 
cgit v1.2.1


From 6ee98ffeea0bc9e072e419497d78697d8afcdd6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 May 2017 10:57:51 +0200
Subject: x86/ftrace: Make sure that ftrace trampolines are not RWX

ftrace use module_alloc() to allocate trampoline pages. The mapping of
module_alloc() is RWX, which makes sense as the memory is written to right
after allocation. But nothing makes these pages RO after writing to them.

Add proper set_memory_rw/ro() calls to protect the trampolines after
modification.

Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705251056410.1862@nanos

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/x86/kernel/ftrace.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0651e974dcb3..9bef1bbeba63 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size)
 {
 	return module_alloc(size);
 }
-static inline void tramp_free(void *tramp)
+static inline void tramp_free(void *tramp, int size)
 {
+	int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	set_memory_nx((unsigned long)tramp, npages);
+	set_memory_rw((unsigned long)tramp, npages);
 	module_memfree(tramp);
 }
 #else
@@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size)
 {
 	return NULL;
 }
-static inline void tramp_free(void *tramp) { }
+static inline void tramp_free(void *tramp, int size) { }
 #endif
 
 /* Defined as markers to the end of the ftrace default trampolines */
@@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 	/* Copy ftrace_caller onto the trampoline memory */
 	ret = probe_kernel_read(trampoline, (void *)start_offset, size);
 	if (WARN_ON(ret < 0)) {
-		tramp_free(trampoline);
+		tramp_free(trampoline, *tramp_size);
 		return 0;
 	}
 
@@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
 
 	/* Are we pointing to the reference? */
 	if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
-		tramp_free(trampoline);
+		tramp_free(trampoline, *tramp_size);
 		return 0;
 	}
 
@@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 	unsigned long offset;
 	unsigned long ip;
 	unsigned int size;
-	int ret;
+	int ret, npages;
 
 	if (ops->trampoline) {
 		/*
@@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 		 */
 		if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
 			return;
+		npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+		set_memory_rw(ops->trampoline, npages);
 	} else {
 		ops->trampoline = create_trampoline(ops, &size);
 		if (!ops->trampoline)
 			return;
 		ops->trampoline_size = size;
+		npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	}
 
 	offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 	/* Do a safe modify in case the trampoline is executing */
 	new = ftrace_call_replace(ip, (unsigned long)func);
 	ret = update_ftrace_func(ip, new);
+	set_memory_ro(ops->trampoline, npages);
 
 	/* The update should never fail */
 	WARN_ON(ret);
@@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 	if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
 		return;
 
-	tramp_free((void *)ops->trampoline);
+	tramp_free((void *)ops->trampoline, ops->trampoline_size);
 	ops->trampoline = 0;
 }
 
-- 
cgit v1.2.1


From 1ea34adb87c969b89dfd83f1905a79161e9ada26 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 26 May 2017 12:36:47 +0100
Subject: efi: Don't issue error message when booted under Xen

When booted as Xen dom0 there won't be an EFI memmap allocated. Avoid
issuing an error message in this case:

  [    0.144079] efi: Failed to allocate new EFI memmap

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: <stable@vger.kernel.org> # v4.9+
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170526113652.21339-2-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 26615991d69c..e0cf95a83f3f 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -360,6 +360,9 @@ void __init efi_free_boot_services(void)
 		free_bootmem_late(start, size);
 	}
 
+	if (!num_entries)
+		return;
+
 	new_size = efi.memmap.desc_size * num_entries;
 	new_phys = efi_memmap_alloc(num_entries);
 	if (!new_phys) {
-- 
cgit v1.2.1


From 4e52797d2efefac3271abdc54439a3435abd77b9 Mon Sep 17 00:00:00 2001
From: Sai Praneeth <sai.praneeth.prakhya@intel.com>
Date: Fri, 26 May 2017 12:36:49 +0100
Subject: x86/efi: Disable runtime services on kexec kernel if booted with
 efi=old_map

Booting kexec kernel with "efi=old_map" in kernel command line hits
kernel panic as shown below.

 BUG: unable to handle kernel paging request at ffff88007fe78070
 IP: virt_efi_set_variable.part.7+0x63/0x1b0
 PGD 7ea28067
 PUD 7ea2b067
 PMD 7ea2d067
 PTE 0
 [...]
 Call Trace:
  virt_efi_set_variable()
  efi_delete_dummy_variable()
  efi_enter_virtual_mode()
  start_kernel()
  x86_64_start_reservations()
  x86_64_start_kernel()
  start_cpu()

[ efi=old_map was never intended to work with kexec. The problem with
  using efi=old_map is that the virtual addresses are assigned from the
  memory region used by other kernel mappings; vmalloc() space.
  Potentially there could be collisions when booting kexec if something
  else is mapped at the virtual address we allocated for runtime service
  regions in the initial boot - Matt Fleming ]

Since kexec was never intended to work with efi=old_map, disable
runtime services in kexec if booted with efi=old_map, so that we don't
panic.

Tested-by: Lee Chun-Yi <jlee@suse.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170526113652.21339-4-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/efi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 7e76a4d8304b..43b96f5f78ba 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void)
 
 	/*
 	 * We don't do virtual mode, since we don't do runtime services, on
-	 * non-native EFI
+	 * non-native EFI. With efi=old_map, we don't do runtime services in
+	 * kexec kernel because in the initial boot something else might
+	 * have been mapped at these virtual addresses.
 	 */
-	if (!efi_is_native()) {
+	if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
 		efi_memmap_unmap();
 		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
 		return;
-- 
cgit v1.2.1


From 94133e46a0f5ca3f138479806104ab4a8cb0455e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Fri, 26 May 2017 12:36:50 +0100
Subject: x86/efi: Correct EFI identity mapping under 'efi=old_map' when KASLR
 is enabled

For EFI with the 'efi=old_map' kernel option specified, the kernel will panic
when KASLR is enabled:

  BUG: unable to handle kernel paging request at 000000007febd57e
  IP: 0x7febd57e
  PGD 1025a067
  PUD 0

  Oops: 0010 [#1] SMP
  Call Trace:
   efi_enter_virtual_mode()
   start_kernel()
   x86_64_start_reservations()
   x86_64_start_kernel()
   start_cpu()

The root cause is that the identity mapping is not built correctly
in the 'efi=old_map' case.

On 'nokaslr' kernels, PAGE_OFFSET is 0xffff880000000000 which is PGDIR_SIZE
aligned. We can borrow the PUD table from the direct mappings safely. Given a
physical address X, we have pud_index(X) == pud_index(__va(X)).

However, on KASLR kernels, PAGE_OFFSET is PUD_SIZE aligned. For a given physical
address X, pud_index(X) != pud_index(__va(X)). We can't just copy the PGD entry
from direct mapping to build identity mapping, instead we need to copy the
PUD entries one by one from the direct mapping.

Fix it.

Signed-off-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Young <dyoung@redhat.com>
Cc: Frank Ramsay <frank.ramsay@hpe.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170526113652.21339-5-matt@codeblueprint.co.uk
[ Fixed and reworded the changelog and code comments to be more readable. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/efi_64.c | 79 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 71 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index c488625c9712..eb8dff15a7f6 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -71,11 +71,13 @@ static void __init early_code_mapping_set_exec(int executable)
 
 pgd_t * __init efi_call_phys_prolog(void)
 {
-	unsigned long vaddress;
-	pgd_t *save_pgd;
+	unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+	pgd_t *save_pgd, *pgd_k, *pgd_efi;
+	p4d_t *p4d, *p4d_k, *p4d_efi;
+	pud_t *pud;
 
 	int pgd;
-	int n_pgds;
+	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
 		save_pgd = (pgd_t *)read_cr3();
@@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void)
 	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
 	save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
 
+	/*
+	 * Build 1:1 identity mapping for efi=old_map usage. Note that
+	 * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+	 * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+	 * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+	 * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+	 * This means here we can only reuse the PMD tables of the direct mapping.
+	 */
 	for (pgd = 0; pgd < n_pgds; pgd++) {
-		save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
-		vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
-		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+		addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+		vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+		pgd_efi = pgd_offset_k(addr_pgd);
+		save_pgd[pgd] = *pgd_efi;
+
+		p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+		if (!p4d) {
+			pr_err("Failed to allocate p4d table!\n");
+			goto out;
+		}
+
+		for (i = 0; i < PTRS_PER_P4D; i++) {
+			addr_p4d = addr_pgd + i * P4D_SIZE;
+			p4d_efi = p4d + p4d_index(addr_p4d);
+
+			pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+			if (!pud) {
+				pr_err("Failed to allocate pud table!\n");
+				goto out;
+			}
+
+			for (j = 0; j < PTRS_PER_PUD; j++) {
+				addr_pud = addr_p4d + j * PUD_SIZE;
+
+				if (addr_pud > (max_pfn << PAGE_SHIFT))
+					break;
+
+				vaddr = (unsigned long)__va(addr_pud);
+
+				pgd_k = pgd_offset_k(vaddr);
+				p4d_k = p4d_offset(pgd_k, vaddr);
+				pud[j] = *pud_offset(p4d_k, vaddr);
+			}
+		}
 	}
 out:
 	__flush_tlb_all();
@@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
 	/*
 	 * After the lock is released, the original page table is restored.
 	 */
-	int pgd_idx;
+	int pgd_idx, i;
 	int nr_pgds;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
 		write_cr3((unsigned long)save_pgd);
@@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
 
 	nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
 
-	for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+	for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+		pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
 		set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
 
+		if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+			continue;
+
+		for (i = 0; i < PTRS_PER_P4D; i++) {
+			p4d = p4d_offset(pgd,
+					 pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+			if (!(p4d_val(*p4d) & _PAGE_PRESENT))
+				continue;
+
+			pud = (pud_t *)p4d_page_vaddr(*p4d);
+			pud_free(&init_mm, pud);
+		}
+
+		p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+		p4d_free(&init_mm, p4d);
+	}
+
 	kfree(save_pgd);
 
 	__flush_tlb_all();
-- 
cgit v1.2.1


From dac6ca243c4c49a9ca7507d3d66140ebfac8b04b Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 28 May 2017 22:04:14 +0200
Subject: x86/microcode/AMD: Change load_microcode_amd()'s param to bool to fix
 preemptibility bug

With CONFIG_DEBUG_PREEMPT enabled, I get:

  BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
  caller is debug_smp_processor_id
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2+ #2
  Call Trace:
   dump_stack
   check_preemption_disabled
   debug_smp_processor_id
   save_microcode_in_initrd_amd
   ? microcode_init
   save_microcode_in_initrd
   ...

because, well, it says it above, we're using smp_processor_id() in
preemptible code.

But passing the CPU number is not really needed. It is only used to
determine whether we're on the BSP, and, if so, to save the microcode
patch for early loading.

 [ We don't absolutely need to do it on the BSP but we do that
   customarily there. ]

Instead, convert that function parameter to a boolean which denotes
whether the patch should be saved or not, thereby avoiding the use of
smp_processor_id() in preemptible code.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170528200414.31305-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/microcode/amd.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 45db4d2ebd01..e9f4d762aa5b 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
 }
 
 static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
 
 int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
 {
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
 	if (!desc.mc)
 		return -EINVAL;
 
-	ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
-				 desc.data, desc.size);
+	ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
 	if (ret != UCODE_OK)
 		return -EINVAL;
 
@@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
 }
 
 static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
 {
 	enum ucode_state ret;
 
@@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
 
 #ifdef CONFIG_X86_32
 	/* save BSP's matching patch for early load */
-	if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
-		struct ucode_patch *p = find_patch(cpu);
+	if (save) {
+		struct ucode_patch *p = find_patch(0);
 		if (p) {
 			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
 			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 {
 	char fw_name[36] = "amd-ucode/microcode_amd.bin";
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
 	enum ucode_state ret = UCODE_NFOUND;
 	const struct firmware *fw;
 
 	/* reload ucode container only on the boot cpu */
-	if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+	if (!refresh_fw || !bsp)
 		return UCODE_OK;
 
 	if (c->x86 >= 0x15)
@@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 		goto fw_release;
 	}
 
-	ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+	ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
 
  fw_release:
 	release_firmware(fw);
-- 
cgit v1.2.1


From 5d9070b1f0fc9a159a9a3240c43004828408444b Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 28 May 2017 11:03:42 +0200
Subject: x86/debug/32: Convert a smp_processor_id() call to raw to avoid
 DEBUG_PREEMPT warning

... to raw_smp_processor_id() to not trip the

  BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1

check. The reasoning behind it is that __warn() already uses the raw_
variants but the show_regs() path on 32-bit doesn't.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170528092212.fiod7kygpjm23m3o@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..ffeae818aa7a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
 	printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
-		smp_processor_id());
+		raw_smp_processor_id());
 
 	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->ax, regs->bx, regs->cx, regs->dx);
-- 
cgit v1.2.1


From cbf712792b6e61317b93dd56dd5c0784363c9ac9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Fri, 19 May 2017 15:48:51 +0200
Subject: KVM: nVMX: fix nested_vmx_check_vmptr failure paths under debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_skip_emulated_instruction() will return 0 if userspace is
single-stepping the guest.

kvm_skip_emulated_instruction() uses return status convention of exit
handler: 0 means "exit to userspace" and 1 means "continue vm entries".
The problem is that nested_vmx_check_vmptr() return status means
something else: 0 is ok, 1 is error.

This means we would continue executing after a failure.  Static checker
noticed it because vmptr was not initialized.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Fixes: 6affcbedcac7 ("KVM: x86: Add kvm_skip_emulated_instruction and use it.")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 140 ++++++++++++++++++++++-------------------------------
 1 file changed, 57 insertions(+), 83 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 880f371705bc..9b4b5d6dcd34 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6914,97 +6914,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-/*
- * This function performs the various checks including
- * - if it's 4KB aligned
- * - No bits beyond the physical address width are set
- * - Returns 0 on success or else 1
- * (Intel SDM Section 30.3)
- */
-static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
-				  gpa_t *vmpointer)
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 {
 	gva_t gva;
-	gpa_t vmptr;
 	struct x86_exception e;
-	struct page *page;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
 		return 1;
 
-	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
-				sizeof(vmptr), &e)) {
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
+				sizeof(*vmpointer), &e)) {
 		kvm_inject_page_fault(vcpu, &e);
 		return 1;
 	}
 
-	switch (exit_reason) {
-	case EXIT_REASON_VMON:
-		/*
-		 * SDM 3: 24.11.5
-		 * The first 4 bytes of VMXON region contain the supported
-		 * VMCS revision identifier
-		 *
-		 * Note - IA32_VMX_BASIC[48] will never be 1
-		 * for the nested case;
-		 * which replaces physical address width with 32
-		 *
-		 */
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		page = nested_get_page(vcpu, vmptr);
-		if (page == NULL) {
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-			kunmap(page);
-			nested_release_page_clean(page);
-			nested_vmx_failInvalid(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		kunmap(page);
-		nested_release_page_clean(page);
-		vmx->nested.vmxon_ptr = vmptr;
-		break;
-	case EXIT_REASON_VMCLEAR:
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMCLEAR_INVALID_ADDRESS);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		if (vmptr == vmx->nested.vmxon_ptr) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMCLEAR_VMXON_POINTER);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		break;
-	case EXIT_REASON_VMPTRLD:
-		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMPTRLD_INVALID_ADDRESS);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-
-		if (vmptr == vmx->nested.vmxon_ptr) {
-			nested_vmx_failValid(vcpu,
-					     VMXERR_VMPTRLD_VMXON_POINTER);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		break;
-	default:
-		return 1; /* shouldn't happen */
-	}
-
-	if (vmpointer)
-		*vmpointer = vmptr;
 	return 0;
 }
 
@@ -7066,6 +6990,8 @@ out_msr_bitmap:
 static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	int ret;
+	gpa_t vmptr;
+	struct page *page;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7095,9 +7021,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
- 
+
+	/*
+	 * SDM 3: 24.11.5
+	 * The first 4 bytes of VMXON region contain the supported
+	 * VMCS revision identifier
+	 *
+	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+	 * which replaces physical address width with 32
+	 */
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	page = nested_get_page(vcpu, vmptr);
+	if (page == NULL) {
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+		kunmap(page);
+		nested_release_page_clean(page);
+		nested_vmx_failInvalid(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+	kunmap(page);
+	nested_release_page_clean(page);
+
+	vmx->nested.vmxon_ptr = vmptr;
 	ret = enter_vmx_operation(vcpu);
 	if (ret)
 		return ret;
@@ -7213,9 +7167,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	if (vmptr == vmx->nested.vmxon_ptr) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
 	if (vmptr == vmx->nested.current_vmptr)
 		nested_release_vmcs12(vmx);
 
@@ -7545,9 +7509,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
+	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 		return 1;
 
+	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+	if (vmptr == vmx->nested.vmxon_ptr) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
 	if (vmx->nested.current_vmptr != vmptr) {
 		struct vmcs12 *new_vmcs12;
 		struct page *page;
-- 
cgit v1.2.1


From 8eae9570d1d3887487be0b355d12656b46fac226 Mon Sep 17 00:00:00 2001
From: Gioh Kim <gi-oh.kim@profitbricks.com>
Date: Tue, 30 May 2017 15:24:45 +0200
Subject: KVM: SVM: ignore type when setting segment registers

Commit 19bca6ab75d8 ("KVM: SVM: Fix cross vendor migration issue with
unusable bit") added checking type when setting unusable.
So unusable can be set if present is 0 OR type is 0.
According to the AMD processor manual, long mode ignores the type value
in segment descriptor. And type can be 0 if it is read-only data segment.
Therefore type value is not related to unusable flag.

This patch is based on linux-next v4.12.0-rc3.

Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 183ddb235fb4..a654372efea1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1807,7 +1807,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 	 * for cross vendor migration purposes by "not present"
 	 */
-	var->unusable = !var->present || (var->type == 0);
+	var->unusable = !var->present;
 
 	switch (seg) {
 	case VCPU_SREG_TR:
-- 
cgit v1.2.1


From d9c1b5431d5f0e07575db785a022bce91051ac1d Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Thu, 1 Jun 2017 10:55:03 +0200
Subject: KVM: SVM: do not zero out segment attributes if segment is unusable
 or not present
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a fix for the problem [1], where VMCB.CPL was set to 0 and interrupt
was taken on userspace stack.  The root cause lies in the specific AMD CPU
behaviour which manifests itself as unusable segment attributes on SYSRET.
The corresponding work around for the kernel is the following:

61f01dd941ba ("x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue")

In other turn virtualization side treated unusable segment incorrectly and
restored CPL from SS attributes, which were zeroed out few lines above.

In current patch it is assured only that P bit is cleared in VMCB.save state
and segment attributes are not zeroed out if segment is not presented or is
unusable, therefore CPL can be safely restored from DPL field.

This is only one part of the fix, since QEMU side should be fixed accordingly
not to zero out attributes on its side.  Corresponding patch will follow.

[1] Message id: CAJrWOzD6Xq==b-zYCDdFLgSRMPM-NkNuTSDFEtX=7MreT45i7Q@mail.gmail.com

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Signed-off-by: Mikhail Sennikovskii <mikhail.sennikovskii@profitbricks.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a654372efea1..ba9891ac5c56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1840,6 +1840,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 		 */
 		if (var->unusable)
 			var->db = 0;
+		/* This is symmetric with svm_set_segment() */
 		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
 		break;
 	}
@@ -1980,18 +1981,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 	s->base = var->base;
 	s->limit = var->limit;
 	s->selector = var->selector;
-	if (var->unusable)
-		s->attrib = 0;
-	else {
-		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
-		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
-		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
-		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
-		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
-		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
-		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
-		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
-	}
+	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 
 	/*
 	 * This is always accurate, except if SYSRET returned to a segment
@@ -2000,7 +1997,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 	 * would entail passing the CPL to userspace and back.
 	 */
 	if (seg == VCPU_SREG_SS)
-		svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+		/* This is symmetric with svm_get_segment() */
+		svm->vmcb->save.cpl = (var->dpl & 3);
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
-- 
cgit v1.2.1


From 47a66eed99e6f231f4a1d261a9d493f4eee94829 Mon Sep 17 00:00:00 2001
From: ZhuangYanying <ann.zhuangyanying@huawei.com>
Date: Fri, 26 May 2017 13:16:48 +0800
Subject: KVM: x86: Fix nmi injection failure when vcpu got blocked

When spin_lock_irqsave() deadlock occurs inside the guest, vcpu threads,
other than the lock-holding one, would enter into S state because of
pvspinlock. Then inject NMI via libvirt API "inject-nmi", the NMI could
not be injected into vm.

The reason is:
1 It sets nmi_queued to 1 when calling ioctl KVM_NMI in qemu, and sets
cpu->kvm_vcpu_dirty to true in do_inject_external_nmi() meanwhile.
2 It sets nmi_queued to 0 in process_nmi(), before entering guest, because
cpu->kvm_vcpu_dirty is true.

It's not enough just to check nmi_queued to decide whether to stay in
vcpu_block() or not. NMI should be injected immediately at any situation.
Add checking nmi_pending, and testing KVM_REQ_NMI replaces nmi_queued
in vm_vcpu_has_events().

Do the same change for SMIs.

Signed-off-by: Zhuang Yanying <ann.zhuangyanying@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02363e37d4a6..a2cd0997343c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8394,10 +8394,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
-	if (atomic_read(&vcpu->arch.nmi_queued))
+	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+	    (vcpu->arch.nmi_pending &&
+	     kvm_x86_ops->nmi_allowed(vcpu)))
 		return true;
 
-	if (kvm_test_request(KVM_REQ_SMI, vcpu))
+	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+	    (vcpu->arch.smi_pending && !is_smm(vcpu)))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
-- 
cgit v1.2.1


From c08d517480ea342cc43acdacc5cf4a795e18151d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 1 Jun 2017 15:52:23 +0200
Subject: Revert "x86/PAT: Fix Xorg regression on CPUs that don't support PAT"

This reverts commit cbed27cdf0e3f7ea3b2259e86b9e34df02be3fe4.

As Andy Lutomirski observed:

 "I think this patch is bogus. pat_enabled() sure looks like it's
  supposed to return true if PAT is *enabled*, and these days PAT is
  'enabled' even if there's no HW PAT support."

Reported-by: Bernhard Held <berny156@gmx.de>
Reported-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: stable@vger.kernel.org # v4.2+
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 83a59a67757a..9b78685b66e6 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -65,11 +65,9 @@ static int __init nopat(char *str)
 }
 early_param("nopat", nopat);
 
-static bool __read_mostly __pat_initialized = false;
-
 bool pat_enabled(void)
 {
-	return __pat_initialized;
+	return !!__pat_enabled;
 }
 EXPORT_SYMBOL_GPL(pat_enabled);
 
@@ -227,14 +225,13 @@ static void pat_bsp_init(u64 pat)
 	}
 
 	wrmsrl(MSR_IA32_CR_PAT, pat);
-	__pat_initialized = true;
 
 	__init_cache_modes(pat);
 }
 
 static void pat_ap_init(u64 pat)
 {
-	if (!this_cpu_has(X86_FEATURE_PAT)) {
+	if (!boot_cpu_has(X86_FEATURE_PAT)) {
 		/*
 		 * If this happens we are on a secondary CPU, but switched to
 		 * PAT on the boot CPU. We have no way to undo PAT.
@@ -309,7 +306,7 @@ void pat_init(void)
 	u64 pat;
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
-	if (!__pat_enabled) {
+	if (!pat_enabled()) {
 		init_cache_modes();
 		return;
 	}
-- 
cgit v1.2.1


From ae1d557d8f30cb097b4d1f2ab04fa294588ee1cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20S=C3=BCnkenberg?=
 <christian.suenkenberg@student.kit.edu>
Date: Sun, 4 Jun 2017 19:18:39 +0200
Subject: x86/cpu/cyrix: Add alternative Device ID of Geode GX1 SoC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A SoC variant of Geode GX1, notably NSC branded SC1100, seems to
report an inverted Device ID in its DIR0 configuration register,
specifically 0xb instead of the expected 0x4.

Catch this presumably quirky version so it's properly recognized
as GX1 and has its cache switched to write-back mode, which provides
a significant performance boost in most workloads.

SC1100's datasheet "Geode™ SC1100 Information Appliance On a Chip",
states in section 1.1.7.1 "Device ID" that device identification
values are specified in SC1100's device errata. These, however,
seem to not have been publicly released.

Wading through a number of boot logs and /proc/cpuinfo dumps found on
pastebin and blogs, this patch should mostly be relevant for a number
of now admittedly aging Soekris NET4801 and PC Engines WRAP devices,
the latter being the platform this issue was discovered on.
Performance impact was verified using "openssl speed", with
write-back caching scaling throughput between -3% and +41%.

Signed-off-by: Christian Sünkenberg <christian.suenkenberg@student.kit.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1496596719.26725.14.camel@student.kit.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/cyrix.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index a70fd61095f8..6f077445647a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
 		break;
 
 	case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+	case 11: /* GX1 with inverted Device ID */
 #ifdef CONFIG_PCI
 	{
 		u32 vendor, device;
-- 
cgit v1.2.1


From bbaf0e2b1c1b4f88abd6ef49576f0efb1734eae5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 26 Apr 2017 16:56:26 +0200
Subject: kvm: async_pf: fix rcu_irq_enter() with irqs enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

native_safe_halt enables interrupts, and you just shouldn't
call rcu_irq_enter() with interrupts enabled.  Reorder the
call with the following local_irq_disable() to respect the
invariant.

Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kernel/kvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index da5c09789984..43e10d6fdbed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token)
 			 */
 			rcu_irq_exit();
 			native_safe_halt();
-			rcu_irq_enter();
 			local_irq_disable();
+			rcu_irq_enter();
 		}
 	}
 	if (!n.halted)
-- 
cgit v1.2.1


From d4912215d1031e4fb3d1038d2e1857218dba0d0a Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Mon, 5 Jun 2017 05:19:09 -0700
Subject: KVM: nVMX: Fix exception injection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 WARNING: CPU: 3 PID: 2840 at arch/x86/kvm/vmx.c:10966 nested_vmx_vmexit+0xdcd/0xde0 [kvm_intel]
 CPU: 3 PID: 2840 Comm: qemu-system-x86 Tainted: G           OE   4.12.0-rc3+ #23
 RIP: 0010:nested_vmx_vmexit+0xdcd/0xde0 [kvm_intel]
 Call Trace:
  ? kvm_check_async_pf_completion+0xef/0x120 [kvm]
  ? rcu_read_lock_sched_held+0x79/0x80
  vmx_queue_exception+0x104/0x160 [kvm_intel]
  ? vmx_queue_exception+0x104/0x160 [kvm_intel]
  kvm_arch_vcpu_ioctl_run+0x1171/0x1ce0 [kvm]
  ? kvm_arch_vcpu_load+0x47/0x240 [kvm]
  ? kvm_arch_vcpu_load+0x62/0x240 [kvm]
  kvm_vcpu_ioctl+0x384/0x7b0 [kvm]
  ? kvm_vcpu_ioctl+0x384/0x7b0 [kvm]
  ? __fget+0xf3/0x210
  do_vfs_ioctl+0xa4/0x700
  ? __fget+0x114/0x210
  SyS_ioctl+0x79/0x90
  do_syscall_64+0x81/0x220
  entry_SYSCALL64_slow_path+0x25/0x25

This is triggered occasionally by running both win7 and win2016 in L2, in
addition, EPT is disabled on both L1 and L2. It can't be reproduced easily.

Commit 0b6ac343fc (KVM: nVMX: Correct handling of exception injection) mentioned
that "KVM wants to inject page-faults which it got to the guest. This function
assumes it is called with the exit reason in vmcs02 being a #PF exception".
Commit e011c663 (KVM: nVMX: Check all exceptions for intercept during delivery to
L2) allows to check all exceptions for intercept during delivery to L2. However,
there is no guarantee the exit reason is exception currently, when there is an
external interrupt occurred on host, maybe a time interrupt for host which should
not be injected to guest, and somewhere queues an exception, then the function
nested_vmx_check_exception() will be called and the vmexit emulation codes will
try to emulate the "Acknowledge interrupt on exit" behavior, the warning is
triggered.

Reusing the exit reason from the L2->L0 vmexit is wrong in this case,
the reason must always be EXCEPTION_NMI when injecting an exception into
L1 as a nested vmexit.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Fixes: e011c663b9c7 ("KVM: nVMX: Check all exceptions for intercept during delivery to L2")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b4b5d6dcd34..ca5d2b93385c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2425,7 +2425,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
 	if (!(vmcs12->exception_bitmap & (1u << nr)))
 		return 0;
 
-	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
+	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 			  vmcs_read32(VM_EXIT_INTR_INFO),
 			  vmcs_readl(EXIT_QUALIFICATION));
 	return 1;
-- 
cgit v1.2.1


From 5b0bc9ac2ce4881ee318a21f31140584ce4dbdad Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Wed, 7 Jun 2017 11:58:19 +0200
Subject: x86/microcode/intel: Clear patch pointer before jettisoning the
 initrd

During early boot, load_ucode_intel_ap() uses __load_ucode_intel()
to obtain a pointer to the relevant microcode patch (embedded in the
initrd), and stores this value in 'intel_ucode_patch' to speed up the
microcode patch application for subsequent CPUs.

On resuming from suspend-to-RAM, however, load_ucode_ap() calls
load_ucode_intel_ap() for each non-boot-CPU. By then the initramfs is
long gone so the pointer stored in 'intel_ucode_patch' no longer points to
a valid microcode patch.

Clear that pointer so that we effectively fall back to the CPU hotplug
notifier callbacks to update the microcode.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
[ Edit and massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org> # 4.10..
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170607095819.9754-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/microcode/intel.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index afdfd237b59f..f522415bf9e5 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -619,6 +619,9 @@ int __init save_microcode_in_initrd_intel(void)
 
 	show_saved_mc();
 
+	/* initrd is going away, clear patch ptr. */
+	intel_ucode_patch = NULL;
+
 	return 0;
 }
 
-- 
cgit v1.2.1


From a3641631d14571242eec0d30c9faa786cbf52d44 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 8 Jun 2017 01:22:07 -0700
Subject: KVM: cpuid: Fix read/write out-of-bounds vulnerability in cpuid
 emulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If "i" is the last element in the vcpu->arch.cpuid_entries[] array, it
potentially can be exploited the vulnerability. this will out-of-bounds
read and write.  Luckily, the effect is small:

	/* when no next entry is found, the current entry[i] is reselected */
	for (j = i + 1; ; j = (j + 1) % nent) {
		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
		if (ej->function == e->function) {

It reads ej->maxphyaddr, which is user controlled.  However...

			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;

After cpuid_entries there is

	int maxphyaddr;
	struct x86_emulate_ctxt emulate_ctxt;  /* 16-byte aligned */

So we have:

- cpuid_entries at offset 1B50 (6992)
- maxphyaddr at offset 27D0 (6992 + 3200 = 10192)
- padding at 27D4...27DF
- emulate_ctxt at 27E0

And it writes in the padding.  Pfew, writing the ops field of emulate_ctxt
would have been much worse.

This patch fixes it by modding the index to avoid the out-of-bounds
access. Worst case, i == j and ej->function == e->function,
the loop can bail out.

Reported-by: Moguofang <moguofang@huawei.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Guofang Mo <moguofang@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/cpuid.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a181ae76c71c..59ca2eea522c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -780,18 +780,20 @@ out:
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-	int j, nent = vcpu->arch.cpuid_nent;
+	struct kvm_cpuid_entry2 *ej;
+	int j = i;
+	int nent = vcpu->arch.cpuid_nent;
 
 	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
 	/* when no next entry is found, the current entry[i] is reselected */
-	for (j = i + 1; ; j = (j + 1) % nent) {
-		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
-		if (ej->function == e->function) {
-			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-			return j;
-		}
-	}
-	return 0; /* silence gcc, even though control never reaches here */
+	do {
+		j = (j + 1) % nent;
+		ej = &vcpu->arch.cpuid_entries[j];
+	} while (ej->function != e->function);
+
+	ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+	return j;
 }
 
 /* find an entry with matching function, matching index (if needed), and that
-- 
cgit v1.2.1


From 47b2c3fff4932e6fc17ce13d51a43c6969714e20 Mon Sep 17 00:00:00 2001
From: Bilal Amarni <bilal.amarni@gmail.com>
Date: Thu, 8 Jun 2017 14:47:26 +0100
Subject: security/keys: add CONFIG_KEYS_COMPAT to Kconfig

CONFIG_KEYS_COMPAT is defined in arch-specific Kconfigs and is missing for
several 64-bit architectures : mips, parisc, tile.

At the moment and for those architectures, calling in 32-bit userspace the
keyctl syscall would return an ENOSYS error.

This patch moves the CONFIG_KEYS_COMPAT option to security/keys/Kconfig, to
make sure the compatibility wrapper is registered by default for any 64-bit
architecture as long as it is configured with CONFIG_COMPAT.

[DH: Modified to remove arm64 compat enablement also as requested by Eric
 Biggers]

Signed-off-by: Bilal Amarni <bilal.amarni@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
cc: Eric Biggers <ebiggers3@gmail.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 arch/x86/Kconfig | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4ccfacc7232a..0efb4c9497bc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2776,10 +2776,6 @@ config COMPAT_FOR_U64_ALIGNMENT
 config SYSVIPC_COMPAT
 	def_bool y
 	depends on SYSVIPC
-
-config KEYS_COMPAT
-	def_bool y
-	depends on KEYS
 endif
 
 endmenu
-- 
cgit v1.2.1


From 9bc1f09f6fa76fdf31eb7d6a4a4df43574725f93 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 8 Jun 2017 20:13:40 -0700
Subject: KVM: async_pf: avoid async pf injection when in guest mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 INFO: task gnome-terminal-:1734 blocked for more than 120 seconds.
       Not tainted 4.12.0-rc4+ #8
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 gnome-terminal- D    0  1734   1015 0x00000000
 Call Trace:
  __schedule+0x3cd/0xb30
  schedule+0x40/0x90
  kvm_async_pf_task_wait+0x1cc/0x270
  ? __vfs_read+0x37/0x150
  ? prepare_to_swait+0x22/0x70
  do_async_page_fault+0x77/0xb0
  ? do_async_page_fault+0x77/0xb0
  async_page_fault+0x28/0x30

This is triggered by running both win7 and win2016 on L1 KVM simultaneously,
and then gives stress to memory on L1, I can observed this hang on L1 when
at least ~70% swap area is occupied on L0.

This is due to async pf was injected to L2 which should be injected to L1,
L2 guest starts receiving pagefault w/ bogus %cr2(apf token from the host
actually), and L1 guest starts accumulating tasks stuck in D state in
kvm_async_pf_task_wait() since missing PAGE_READY async_pfs.

This patch fixes the hang by doing async pf when executing L1 guest.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++++--
 arch/x86/kvm/mmu.h | 1 +
 arch/x86/kvm/x86.c | 3 +--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5d3376f67794..cb8225969255 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3698,12 +3698,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
 		     kvm_event_needs_reinjection(vcpu)))
 		return false;
 
+	if (is_guest_mode(vcpu))
+		return false;
+
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
@@ -3719,7 +3722,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	if (!async)
 		return false; /* *pfn has correct page already */
 
-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && kvm_can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 27975807cc64..330bf3a811fb 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,6 +76,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     bool accessed_dirty);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2cd0997343c..87d3cb901935 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8607,8 +8607,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
 		return true;
 	else
-		return !kvm_event_needs_reinjection(vcpu) &&
-			kvm_x86_ops->interrupt_allowed(vcpu);
+		return kvm_can_do_async_pf(vcpu);
 }
 
 void kvm_arch_start_assignment(struct kvm *kvm)
-- 
cgit v1.2.1