205 files changed, 11556 insertions, 4103 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c70684f859e1..8ec3a1aa4abd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
+	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select SPARSE_IRQ
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
@@ -84,6 +85,7 @@ config X86
 	select GENERIC_IOMAP
 	select DCACHE_WORD_ACCESS
 	select GENERIC_SMP_IDLE_THREAD
+	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
 	select HAVE_ARCH_SECCOMP_FILTER
 	select BUILDTIME_EXTABLE_SORT
 	select GENERIC_CMOS_UPDATE
@@ -1525,7 +1527,7 @@ config SECCOMP
 	  If unsure, say Y. Only embedded should say N here.
 
 config CC_STACKPROTECTOR
-	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
+	bool "Enable -fstack-protector buffer overflow detection"
 	---help---
 	  This option turns on the -fstack-protector GCC feature. This
 	  feature puts, at the beginning of functions, a canary value on
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e46c2147397f..b322f124ee3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -129,6 +129,25 @@ config DOUBLEFAULT
 	  option saves about 4k and might cause you much additional grey
 	  hair.
 
+config DEBUG_TLBFLUSH
+	bool "Set upper limit of TLB entries to flush one-by-one"
+	depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG)
+	---help---
+
+	X86-only for now.
+
+	This option allows the user to tune the amount of TLB entries the
+	kernel flushes one-by-one instead of doing a full TLB flush. In
+	certain situations, the former is cheaper. This is controlled by the
+	tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
+	to -1, the code flushes the whole TLB unconditionally. Otherwise,
+	for positive values of it, the kernel will use single TLB entry
+	invalidating instructions according to the following formula:
+
+	flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
+
+	If in doubt, say "N".
+
 config IOMMU_DEBUG
 	bool "Enable IOMMU debugging"
 	depends on GART_IOMMU && DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1f2521434554..682e9c210baa 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -27,6 +27,10 @@ ifeq ($(CONFIG_X86_32),y)
 
         KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
 
+        # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
+        # with nonstandard options
+        KBUILD_CFLAGS += -fno-pic
+
         # prevent gcc from keeping the stack 16 byte aligned
         KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
 
@@ -49,6 +53,9 @@ else
         KBUILD_AFLAGS += -m64
         KBUILD_CFLAGS += -m64
 
+	# Use -mpreferred-stack-boundary=3 if supported.
+	KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
+
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 5a747dd884db..f7535bedc33f 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -57,7 +57,7 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 		   -Wall -Wstrict-prototypes \
 		   -march=i386 -mregparm=3 \
 		   -include $(srctree)/$(src)/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer \
+		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
 		   $(call cc-option, -ffreestanding) \
 		   $(call cc-option, -fno-toplevel-reorder,\
 			$(call cc-option, -fno-unit-at-a-time)) \
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index cb62f786990d..10f6b1178c68 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,5 +1,7 @@
 #include "misc.h"
 
+#ifdef CONFIG_EARLY_PRINTK
+
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
 {
@@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option)
 {
 	return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
 }
+
+#endif
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb9582..d3d003cb5481 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,9 @@
 #include "misc.h"
 
+#ifdef CONFIG_EARLY_PRINTK
+
 int early_serial_base;
 
 #include "../early_serial_console.c"
+
+#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4e85f5f85837..b3e0227df2c9 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -729,32 +729,68 @@ fail:
  * need to create one ourselves (usually the bootloader would create
  * one for us).
  */
-static efi_status_t make_boot_params(struct boot_params *boot_params,
-				     efi_loaded_image_t *image,
-				     void *handle)
+struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
 {
-	struct efi_info *efi = &boot_params->efi_info;
-	struct apm_bios_info *bi = &boot_params->apm_bios_info;
-	struct sys_desc_table *sdt = &boot_params->sys_desc_table;
-	struct e820entry *e820_map = &boot_params->e820_map[0];
-	struct e820entry *prev = NULL;
-	struct setup_header *hdr = &boot_params->hdr;
-	unsigned long size, key, desc_size, _size;
-	efi_memory_desc_t *mem_map;
-	void *options = image->load_options;
-	u32 load_options_size = image->load_options_size / 2; /* ASCII */
+	struct boot_params *boot_params;
+	struct sys_desc_table *sdt;
+	struct apm_bios_info *bi;
+	struct setup_header *hdr;
+	struct efi_info *efi;
+	efi_loaded_image_t *image;
+	void *options;
+	u32 load_options_size;
+	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
 	int options_size = 0;
 	efi_status_t status;
-	__u32 desc_version;
 	unsigned long cmdline;
-	u8 nr_entries;
 	u16 *s2;
 	u8 *s1;
 	int i;
 
+	sys_table = _table;
+
+	/* Check if we were booted by the EFI firmware */
+	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+		return NULL;
+
+	status = efi_call_phys3(sys_table->boottime->handle_protocol,
+				handle, &proto, (void *)&image);
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
+		return NULL;
+	}
+
+	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+	if (status != EFI_SUCCESS) {
+		efi_printk("Failed to alloc lowmem for boot params\n");
+		return NULL;
+	}
+
+	memset(boot_params, 0x0, 0x4000);
+
+	hdr = &boot_params->hdr;
+	efi = &boot_params->efi_info;
+	bi = &boot_params->apm_bios_info;
+	sdt = &boot_params->sys_desc_table;
+
+	/* Copy the second sector to boot_params */
+	memcpy(&hdr->jump, image->image_base + 512, 512);
+
+	/*
+	 * Fill out some of the header fields ourselves because the
+	 * EFI firmware loader doesn't load the first sector.
+	 */
+	hdr->root_flags = 1;
+	hdr->vid_mode = 0xffff;
+	hdr->boot_flag = 0xAA55;
+
+	hdr->code32_start = (__u64)(unsigned long)image->image_base;
+
 	hdr->type_of_loader = 0x21;
 
 	/* Convert unicode cmdline to ascii */
+	options = image->load_options;
+	load_options_size = image->load_options_size / 2; /* ASCII */
 	cmdline = 0;
 	s2 = (u16 *)options;
 
@@ -791,18 +827,36 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,
 	hdr->ramdisk_image = 0;
 	hdr->ramdisk_size = 0;
 
-	status = handle_ramdisks(image, hdr);
-	if (status != EFI_SUCCESS)
-		goto free_cmdline;
-
-	setup_graphics(boot_params);
-
 	/* Clear APM BIOS info */
 	memset(bi, 0, sizeof(*bi));
 
 	memset(sdt, 0, sizeof(*sdt));
 
-	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+	status = handle_ramdisks(image, hdr);
+	if (status != EFI_SUCCESS)
+		goto fail2;
+
+	return boot_params;
+fail2:
+	if (options_size)
+		low_free(options_size, hdr->cmd_line_ptr);
+fail:
+	low_free(0x4000, (unsigned long)boot_params);
+	return NULL;
+}
+
+static efi_status_t exit_boot(struct boot_params *boot_params,
+			      void *handle)
+{
+	struct efi_info *efi = &boot_params->efi_info;
+	struct e820entry *e820_map = &boot_params->e820_map[0];
+	struct e820entry *prev = NULL;
+	unsigned long size, key, desc_size, _size;
+	efi_memory_desc_t *mem_map;
+	efi_status_t status;
+	__u32 desc_version;
+	u8 nr_entries;
+	int i;
 
 	size = sizeof(*mem_map) * 32;
 
@@ -811,7 +865,7 @@ again:
 	_size = size;
 	status = low_alloc(size, 1, (unsigned long *)&mem_map);
 	if (status != EFI_SUCCESS)
-		goto free_cmdline;
+		return status;
 
 	status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
 				mem_map, &key, &desc_size, &desc_version);
@@ -823,6 +877,7 @@ again:
 	if (status != EFI_SUCCESS)
 		goto free_mem_map;
 
+	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
 	efi->efi_systab = (unsigned long)sys_table;
 	efi->efi_memdesc_size = desc_size;
 	efi->efi_memdesc_version = desc_version;
@@ -906,61 +961,13 @@ again:
 
 free_mem_map:
 	low_free(_size, (unsigned long)mem_map);
-free_cmdline:
-	if (options_size)
-		low_free(options_size, hdr->cmd_line_ptr);
-fail:
 	return status;
 }
 
-/*
- * On success we return a pointer to a boot_params structure, and NULL
- * on failure.
- */
-struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
+static efi_status_t relocate_kernel(struct setup_header *hdr)
 {
-	struct boot_params *boot_params;
 	unsigned long start, nr_pages;
-	struct desc_ptr *gdt, *idt;
-	efi_loaded_image_t *image;
-	struct setup_header *hdr;
 	efi_status_t status;
-	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
-	struct desc_struct *desc;
-
-	sys_table = _table;
-
-	/* Check if we were booted by the EFI firmware */
-	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-		goto fail;
-
-	status = efi_call_phys3(sys_table->boottime->handle_protocol,
-				handle, &proto, (void *)&image);
-	if (status != EFI_SUCCESS) {
-		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
-		goto fail;
-	}
-
-	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
-	if (status != EFI_SUCCESS) {
-		efi_printk("Failed to alloc lowmem for boot params\n");
-		goto fail;
-	}
-
-	memset(boot_params, 0x0, 0x4000);
-
-	hdr = &boot_params->hdr;
-
-	/* Copy the second sector to boot_params */
-	memcpy(&hdr->jump, image->image_base + 512, 512);
-
-	/*
-	 * Fill out some of the header fields ourselves because the
-	 * EFI firmware loader doesn't load the first sector.
-	 */
-	hdr->root_flags = 1;
-	hdr->vid_mode = 0xffff;
-	hdr->boot_flag = 0xAA55;
 
 	/*
 	 * The EFI firmware loader could have placed the kernel image
@@ -978,16 +985,40 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	if (status != EFI_SUCCESS) {
 		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
 				   &start);
-		if (status != EFI_SUCCESS) {
+		if (status != EFI_SUCCESS)
 			efi_printk("Failed to alloc mem for kernel\n");
-			goto fail;
-		}
 	}
 
+	if (status == EFI_SUCCESS)
+		memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
+		       hdr->init_size);
+
+	hdr->pref_address = hdr->code32_start;
 	hdr->code32_start = (__u32)start;
-	hdr->pref_address = (__u64)(unsigned long)image->image_base;
 
-	memcpy((void *)start, image->image_base, image->image_size);
+	return status;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
+			     struct boot_params *boot_params)
+{
+	struct desc_ptr *gdt, *idt;
+	efi_loaded_image_t *image;
+	struct setup_header *hdr = &boot_params->hdr;
+	efi_status_t status;
+	struct desc_struct *desc;
+
+	sys_table = _table;
+
+	/* Check if we were booted by the EFI firmware */
+	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+		goto fail;
+
+	setup_graphics(boot_params);
 
 	status = efi_call_phys3(sys_table->boottime->allocate_pool,
 				EFI_LOADER_DATA, sizeof(*gdt),
@@ -1015,7 +1046,18 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
 	idt->size = 0;
 	idt->address = 0;
 
-	status = make_boot_params(boot_params, image, handle);
+	/*
+	 * If the kernel isn't already loaded at the preferred load
+	 * address, relocate it.
+	 */
+	if (hdr->pref_address != hdr->code32_start) {
+		status = relocate_kernel(hdr);
+
+		if (status != EFI_SUCCESS)
+			goto fail;
+	}
+
+	status = exit_boot(boot_params, handle);
 	if (status != EFI_SUCCESS)
 		goto fail;
 
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index c85e3ac99bba..aa4aaf1b2380 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -42,6 +42,16 @@ ENTRY(startup_32)
 	 */
 	add	$0x4, %esp
 
+	call	make_boot_params
+	cmpl	$0, %eax
+	je	1f
+	movl	0x4(%esp), %esi
+	movl	(%esp), %ecx
+	pushl	%eax
+	pushl	%esi
+	pushl	%ecx
+
+	.org 0x30,0x90
 	call	efi_main
 	cmpl	$0, %eax
 	movl	%eax, %esi
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 87e03a13d8e3..2c4b171eec33 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -209,6 +209,16 @@ ENTRY(startup_64)
 	.org 0x210
 	mov	%rcx, %rdi
 	mov	%rdx, %rsi
+	pushq	%rdi
+	pushq	%rsi
+	call	make_boot_params
+	cmpq	$0,%rax
+	je	1f
+	mov	%rax, %rdx
+	popq	%rsi
+	popq	%rdi
+
+	.org 0x230,0x90
 	call	efi_main
 	movq	%rax,%rsi
 	cmpq	$0,%rax
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 7116dcba0c9e..88f7ff6da404 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -108,8 +108,6 @@ static void error(char *m);
  * This is set up by the setup-routine at boot-time
  */
 struct boot_params *real_mode;		/* Pointer to real-mode data */
-static int quiet;
-static int debug;
 
 void *memset(void *s, int c, size_t n);
 void *memcpy(void *dest, const void *src, size_t n);
@@ -170,15 +168,11 @@ static void serial_putchar(int ch)
 	outb(ch, early_serial_base + TXR);
 }
 
-void __putstr(int error, const char *s)
+void __putstr(const char *s)
 {
 	int x, y, pos;
 	char c;
 
-#ifndef CONFIG_X86_VERBOSE_BOOTUP
-	if (!error)
-		return;
-#endif
 	if (early_serial_base) {
 		const char *str = s;
 		while (*str) {
@@ -265,9 +259,9 @@ void *memcpy(void *dest, const void *src, size_t n)
 
 static void error(char *x)
 {
-	__putstr(1, "\n\n");
-	__putstr(1, x);
-	__putstr(1, "\n\n -- System halted");
+	error_putstr("\n\n");
+	error_putstr(x);
+	error_putstr("\n\n -- System halted");
 
 	while (1)
 		asm("hlt");
@@ -294,8 +288,7 @@ static void parse_elf(void *output)
 		return;
 	}
 
-	if (!quiet)
-		putstr("Parsing ELF... ");
+	debug_putstr("Parsing ELF... ");
 
 	phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
 	if (!phdrs)
@@ -332,11 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 {
 	real_mode = rmode;
 
-	if (cmdline_find_option_bool("quiet"))
-		quiet = 1;
-	if (cmdline_find_option_bool("debug"))
-		debug = 1;
-
 	if (real_mode->screen_info.orig_video_mode == 7) {
 		vidmem = (char *) 0xb0000;
 		vidport = 0x3b4;
@@ -349,8 +337,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	cols = real_mode->screen_info.orig_video_cols;
 
 	console_init();
-	if (debug)
-		putstr("early console in decompress_kernel\n");
+	debug_putstr("early console in decompress_kernel\n");
 
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
@@ -369,11 +356,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 		error("Wrong destination address");
 #endif
 
-	if (!quiet)
-		putstr("\nDecompressing Linux... ");
+	debug_putstr("\nDecompressing Linux... ");
 	decompress(input_data, input_len, NULL, NULL, output, NULL, error);
 	parse_elf(output);
-	if (!quiet)
-		putstr("done.\nBooting the kernel.\n");
+	debug_putstr("done.\nBooting the kernel.\n");
 	return;
 }
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..0e6dc0ee0eea 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -24,9 +24,21 @@
 
 /* misc.c */
 extern struct boot_params *real_mode;		/* Pointer to real-mode data */
-void __putstr(int error, const char *s);
-#define putstr(__x)  __putstr(0, __x)
-#define puts(__x)  __putstr(0, __x)
+void __putstr(const char *s);
+#define error_putstr(__x)  __putstr(__x)
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+
+#define debug_putstr(__x)  __putstr(__x)
+
+#else
+
+static inline void debug_putstr(const char *s)
+{ }
+
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
 
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
@@ -36,4 +48,13 @@ int cmdline_find_option_bool(const char *option);
 extern int early_serial_base;
 void console_init(void);
 
+#else
+
+/* early_serial_console.c */
+static const int early_serial_base;
+static inline void console_init(void)
+{ }
+
+#endif
+
 #endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index efe5acfc79c3..b4e15dd6786a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -283,7 +283,7 @@ _start:
 	# Part 2 of the header, from the old setup.S
 
 		.ascii	"HdrS"		# header signature
-		.word	0x020a		# header version number (>= 0x0105)
+		.word	0x020b		# header version number (>= 0x0105)
 					# or else old loadlin-1.5 will fail)
 		.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
@@ -401,18 +401,13 @@ pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
 #define INIT_SIZE VO_INIT_SIZE
 #endif
 init_size:		.long INIT_SIZE		# kernel initialization size
+handover_offset:	.long 0x30		# offset to the handover
+						# protocol entry point
 
 # End of setup header #####################################################
 
 	.section ".entrytext", "ax"
 start_of_setup:
-#ifdef SAFE_RESET_DISK_CONTROLLER
-# Reset the disk controller.
-	movw	$0x0000, %ax		# Reset disk controller
-	movb	$0x80, %dl		# All disks
-	int	$0x13
-#endif
-
 # Force %es = %ds
 	movw	%ds, %ax
 	movw	%ax, %es
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index e191ac048b59..e908e5de82d3 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,9 @@
 # Arch-specific CryptoAPI modules.
 #
 
+obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
+obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
@@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-
-# enable AVX support only when $(AS) can actually assemble the instructions
-ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes)
-AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT
-CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT
-endif
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c
new file mode 100644
index 000000000000..43282fe04a8b
--- /dev/null
+++ b/arch/x86/crypto/ablk_helper.c
@@ -0,0 +1,149 @@
+/*
+ * Shared async block cipher helpers
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on aesni-intel_glue.c by:
+ *  Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <asm/i387.h>
+#include <asm/crypto/ablk_helper.h>
+
+int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+		 unsigned int key_len)
+{
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ablk_set_key);
+
+int __ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct blkcipher_desc desc;
+
+	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+	desc.info = req->info;
+	desc.flags = 0;
+
+	return crypto_blkcipher_crt(desc.tfm)->encrypt(
+		&desc, req->dst, req->src, req->nbytes);
+}
+EXPORT_SYMBOL_GPL(__ablk_encrypt);
+
+int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		return __ablk_encrypt(req);
+	}
+}
+EXPORT_SYMBOL_GPL(ablk_encrypt);
+
+int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+EXPORT_SYMBOL_GPL(ablk_decrypt);
+
+void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+EXPORT_SYMBOL_GPL(ablk_exit);
+
+int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
+{
+	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ablk_init_common);
+
+int ablk_init(struct crypto_tfm *tfm)
+{
+	char drv_name[CRYPTO_MAX_ALG_NAME];
+
+	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
+					crypto_tfm_alg_driver_name(tfm));
+
+	return ablk_init_common(tfm, drv_name);
+}
+EXPORT_SYMBOL_GPL(ablk_init);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index 8efcf42a9d7e..59b37deb8c8d 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -5,7 +5,7 @@
 
 #include <linux/module.h>
 #include <crypto/aes.h>
-#include <asm/aes.h>
+#include <asm/crypto/aes.h>
 
 asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
 asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ac7f5cd019e8..34fdcff4d2c8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -30,7 +30,8 @@
 #include <crypto/ctr.h>
 #include <asm/cpu_device_id.h>
 #include <asm/i387.h>
-#include <asm/aes.h>
+#include <asm/crypto/aes.h>
+#include <asm/crypto/ablk_helper.h>
 #include <crypto/scatterwalk.h>
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
@@ -52,10 +53,6 @@
 #define HAS_XTS
 #endif
 
-struct async_aes_ctx {
-	struct cryptd_ablkcipher *cryptd_tfm;
-};
-
 /* This data is stored at the end of the crypto_tfm struct.
  * It's a type of per "session" data storage location.
  * This needs to be 16 byte aligned.
@@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,
 }
 #endif
 
-static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
-			unsigned int key_len)
-{
-	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
-	int err;
-
-	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
-				    & CRYPTO_TFM_REQ_MASK);
-	err = crypto_ablkcipher_setkey(child, key, key_len);
-	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
-				    & CRYPTO_TFM_RES_MASK);
-	return err;
-}
-
-static int ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_ablkcipher_encrypt(cryptd_req);
-	} else {
-		struct blkcipher_desc desc;
-		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-		desc.info = req->info;
-		desc.flags = 0;
-		return crypto_blkcipher_crt(desc.tfm)->encrypt(
-			&desc, req->dst, req->src, req->nbytes);
-	}
-}
-
-static int ablk_decrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_ablkcipher_decrypt(cryptd_req);
-	} else {
-		struct blkcipher_desc desc;
-		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-		desc.info = req->info;
-		desc.flags = 0;
-		return crypto_blkcipher_crt(desc.tfm)->decrypt(
-			&desc, req->dst, req->src, req->nbytes);
-	}
-}
-
-static void ablk_exit(struct crypto_tfm *tfm)
-{
-	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-
-static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name)
-{
-	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct cryptd_ablkcipher *cryptd_tfm;
-
-	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
-		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
-	return 0;
-}
-
 static int ablk_ecb_init(struct crypto_tfm *tfm)
 {
 	return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
@@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
 	struct aesni_rfc4106_gcm_ctx *child_ctx =
                                  aesni_rfc4106_gcm_ctx_get(cryptd_child);
-	u8 *new_key_mem = NULL;
+	u8 *new_key_align, *new_key_mem = NULL;
 
 	if (key_len < 4) {
 		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 		if (!new_key_mem)
 			return -ENOMEM;
 
-		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
-		memcpy(new_key_mem, key, key_len);
-		key = new_key_mem;
+		new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_align, key, key_len);
+		key = new_key_align;
 	}
 
 	if (!irq_fpu_usable())
@@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 3306dc0b139e..eeb2b3b743e9 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -5,10 +5,6 @@
  *
  * Camellia parts based on code by:
  *  Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,9 +30,9 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
-#include <crypto/b128ops.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
 
 #define CAMELLIA_MIN_KEY_SIZE	16
 #define CAMELLIA_MAX_KEY_SIZE	32
@@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 				 &tfm->crt_flags);
 }
 
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     void (*fn)(struct camellia_ctx *, u8 *, const u8 *),
-		     void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *))
+static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 {
-	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		/* Process two block batch */
-		if (nbytes >= bsize * 2) {
-			do {
-				fn_2way(ctx, wdst, wsrc);
-
-				wsrc += bsize * 2;
-				wdst += bsize * 2;
-				nbytes -= bsize * 2;
-			} while (nbytes >= bsize * 2);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			fn(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way);
-}
+	u128 iv = *src;
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way);
-}
+	camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
 
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 *iv = (u128 *)walk->iv;
-
-	do {
-		u128_xor(dst, src, iv);
-		camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
-	return nbytes;
+	u128_xor(&dst[1], &dst[1], &iv);
 }
 
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 {
-	struct blkcipher_walk walk;
-	int err;
+	be128 ctrblk;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+	if (dst != src)
+		*dst = *src;
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
 
-	return err;
+	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
 }
 
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
+static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
+				    u128 *iv)
 {
-	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ivs[2 - 1];
-	u128 last_iv;
+	be128 ctrblks[2];
 
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process two block batch */
-	if (nbytes >= bsize * 2) {
-		do {
-			nbytes -= bsize * (2 - 1);
-			src -= 2 - 1;
-			dst -= 2 - 1;
-
-			ivs[0] = src[0];
-
-			camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
-
-			u128_xor(dst + 1, dst + 1, ivs + 0);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			u128_xor(dst, dst, src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * 2);
-
-		if (nbytes < bsize)
-			goto done;
+	if (dst != src) {
+		dst[0] = src[0];
+		dst[1] = src[1];
 	}
 
-	/* Handle leftovers */
-	for (;;) {
-		camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
+	u128_to_be128(&ctrblks[0], iv);
+	u128_inc(iv);
+	u128_to_be128(&ctrblks[1], iv);
+	u128_inc(iv);
 
-		u128_xor(dst, dst, src - 1);
-		src -= 1;
-		dst -= 1;
-	}
-
-done:
-	u128_xor(dst, dst, (u128 *)walk->iv);
-	*(u128 *)walk->iv = last_iv;
-
-	return nbytes;
+	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
 
-	return err;
-}
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
 
-static inline void u128_to_be128(be128 *dst, const u128 *src)
-{
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
-}
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
 
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
 }
 
-static inline void u128_inc(u128 *i)
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
 }
 
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 keystream[CAMELLIA_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-	u128 ctrblk;
-
-	memcpy(keystream, src, nbytes);
-	camellia_enc_blk_xor(ctx, keystream, walk->iv);
-	memcpy(dst, keystream, nbytes);
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-	u128_inc(&ctrblk);
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes);
 }
 
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
-	be128 ctrblocks[2];
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
-	/* Process two block batch */
-	if (nbytes >= bsize * 2) {
-		do {
-			if (dst != src) {
-				dst[0] = src[0];
-				dst[1] = src[1];
-			}
-
-			/* create ctrblks for parallel encrypt */
-			u128_to_be128(&ctrblocks[0], &ctrblk);
-			u128_inc(&ctrblk);
-			u128_to_be128(&ctrblocks[1], &ctrblk);
-			u128_inc(&ctrblk);
-
-			camellia_enc_blk_xor_2way(ctx, (u8 *)dst,
-						 (u8 *)ctrblocks);
-
-			src += 2;
-			dst += 2;
-			nbytes -= bsize * 2;
-		} while (nbytes >= bsize * 2);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		if (dst != src)
-			*dst = *src;
-
-		u128_to_be128(&ctrblocks[0], &ctrblk);
-		u128_inc(&ctrblk);
-
-		camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
-	return nbytes;
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
 }
 
 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		     struct scatterlist *src, unsigned int nbytes)
 {
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE);
-
-	while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) {
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
 }
 
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
new file mode 100644
index 000000000000..4854f0f31e4f
--- /dev/null
+++ b/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,307 @@
+/*
+ * Shared glue code for 128bit block ciphers
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+
+static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+				   struct blkcipher_desc *desc,
+				   struct blkcipher_walk *walk)
+{
+	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = 128 / 8;
+	unsigned int nbytes, i, func_bytes;
+	bool fpu_enabled = false;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+					     desc, fpu_enabled, nbytes);
+
+		for (i = 0; i < gctx->num_funcs; i++) {
+			func_bytes = bsize * gctx->funcs[i].num_blocks;
+
+			/* Process multi-block batch */
+			if (nbytes >= func_bytes) {
+				do {
+					gctx->funcs[i].fn_u.ecb(ctx, wdst,
+								wsrc);
+
+					wsrc += func_bytes;
+					wdst += func_bytes;
+					nbytes -= func_bytes;
+				} while (nbytes >= func_bytes);
+
+				if (nbytes < bsize)
+					goto done;
+			}
+		}
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	glue_fpu_end(fpu_enabled);
+	return err;
+}
+
+int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+			  struct blkcipher_desc *desc, struct scatterlist *dst,
+			  struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return __glue_ecb_crypt_128bit(gctx, desc, &walk);
+}
+EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit);
+
+static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+					      struct blkcipher_desc *desc,
+					      struct blkcipher_walk *walk)
+{
+	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = 128 / 8;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 *iv = (u128 *)walk->iv;
+
+	do {
+		u128_xor(dst, src, iv);
+		fn(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+	return nbytes;
+}
+
+int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+			    struct blkcipher_desc *desc,
+			    struct scatterlist *dst,
+			    struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit);
+
+static unsigned int
+__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+			  struct blkcipher_desc *desc,
+			  struct blkcipher_walk *walk)
+{
+	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = 128 / 8;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 last_iv;
+	unsigned int num_blocks, func_bytes;
+	unsigned int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	for (i = 0; i < gctx->num_funcs; i++) {
+		num_blocks = gctx->funcs[i].num_blocks;
+		func_bytes = bsize * num_blocks;
+
+		/* Process multi-block batch */
+		if (nbytes >= func_bytes) {
+			do {
+				nbytes -= func_bytes - bsize;
+				src -= num_blocks - 1;
+				dst -= num_blocks - 1;
+
+				gctx->funcs[i].fn_u.cbc(ctx, dst, src);
+
+				nbytes -= bsize;
+				if (nbytes < bsize)
+					goto done;
+
+				u128_xor(dst, dst, src - 1);
+				src -= 1;
+				dst -= 1;
+			} while (nbytes >= func_bytes);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+	}
+
+done:
+	u128_xor(dst, dst, (u128 *)walk->iv);
+	*(u128 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+			    struct blkcipher_desc *desc,
+			    struct scatterlist *dst,
+			    struct scatterlist *src, unsigned int nbytes)
+{
+	const unsigned int bsize = 128 / 8;
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+					     desc, fpu_enabled, nbytes);
+		nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	glue_fpu_end(fpu_enabled);
+	return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
+
+static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
+					struct blkcipher_desc *desc,
+					struct blkcipher_walk *walk)
+{
+	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *src = (u8 *)walk->src.virt.addr;
+	u8 *dst = (u8 *)walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+	u128 ctrblk;
+	u128 tmp;
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	memcpy(&tmp, src, nbytes);
+	fn_ctr(ctx, &tmp, &tmp, &ctrblk);
+	memcpy(dst, &tmp, nbytes);
+
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+}
+EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
+
+static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+					    struct blkcipher_desc *desc,
+					    struct blkcipher_walk *walk)
+{
+	const unsigned int bsize = 128 / 8;
+	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ctrblk;
+	unsigned int num_blocks, func_bytes;
+	unsigned int i;
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	/* Process multi-block batch */
+	for (i = 0; i < gctx->num_funcs; i++) {
+		num_blocks = gctx->funcs[i].num_blocks;
+		func_bytes = bsize * num_blocks;
+
+		if (nbytes >= func_bytes) {
+			do {
+				gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
+
+				src += num_blocks;
+				dst += num_blocks;
+				nbytes -= func_bytes;
+			} while (nbytes >= func_bytes);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+	}
+
+done:
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return nbytes;
+}
+
+int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+			  struct blkcipher_desc *desc, struct scatterlist *dst,
+			  struct scatterlist *src, unsigned int nbytes)
+{
+	const unsigned int bsize = 128 / 8;
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, bsize);
+
+	while ((nbytes = walk.nbytes) >= bsize) {
+		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+					     desc, fpu_enabled, nbytes);
+		nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	glue_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		glue_ctr_crypt_final_128bit(
+			gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..504106bf04a2
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,704 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
+ *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-avx-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+  8-way AVX serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define tp  %xmm5
+
+#define RA2 %xmm6
+#define RB2 %xmm7
+#define RC2 %xmm8
+#define RD2 %xmm9
+#define RE2 %xmm10
+
+#define RNOT %xmm11
+
+#define RK0 %xmm12
+#define RK1 %xmm13
+#define RK2 %xmm14
+#define RK3 %xmm15
+
+
+#define S0_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x4; \
+	vpxor		RNOT, x4, x4; \
+	vpxor		x1,   tp, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define S0_2(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x0, x0; \
+	vpor		x0,   x4, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x1,   x2, x2; \
+	vpxor		x2,   x3, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x2,   x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		RNOT, x3, x3; \
+	vpand		tp,   x1, x4; \
+	vpor		tp,   x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x3,   tp, x1;
+#define S1_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpor		x4,   x1, x1; \
+	vpxor		x2,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x2, x2; \
+	vpor		x0,   x1, x1; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x1,   x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, tp; \
+	vpxor		x3,   tp, tp; \
+	vpor		x0,   x3, x3; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpand		tp,   x1, x1;
+#define S2_2(x0, x1, x2, x3, x4)      \
+	vpxor		x2,   tp, tp; \
+	vpand		x3,   x2, x2; \
+	vpor		x1,   x3, x3; \
+	vpxor		RNOT, tp, tp; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x2,   tp, x0; \
+	vpor		x2,   x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x1, tp; \
+	vpor		x0,   x3, x3; \
+	vpand		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpand		x3,   tp, x1; \
+	vpxor		x3,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4;
+#define S3_2(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, x1; \
+	vpand		x3,   x0, x0; \
+	vpand		x4,   x3, x3; \
+	vpxor		x2,   x3, x3; \
+	vpor		x1,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x0, x0; \
+	vpxor		tp,   x3, x4; \
+	vpor		x0,   x2, x2; \
+	vpxor		x1,   x2, x2;
+#define S4_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpand		x2,   x4, x4; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   x4, x4; \
+	vpor		x1,   tp, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x0,   x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x1, tp; \
+	vpxor		tp,   x2, x2; \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x4,   tp, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x0,   x4, x4;
+#define S5_2(x0, x1, x2, x3, x4)      \
+	vpand		x3,   x0, x0; \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x1,   x0, x0; \
+	vpand		x4,   x2, x2; \
+	vpxor		x2,   x1, x1; \
+	vpand		x0,   x2, x2; \
+	vpxor		x2,   x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, tp; \
+	vpxor		x0,   x2, x2; \
+	vpand		x3,   x0, x0; \
+	vpor		x3,   tp, tp; \
+	vpxor		RNOT, x1, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x2,   tp, x1;
+#define S6_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x1, tp; \
+	vpxor		RNOT, x0, x0; \
+	vpand		x2,   tp, x1; \
+	vpxor		x3,   x1, x1; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x3; \
+	vpor		x1,   x0, x0;
+#define S7_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x2, x2; \
+	vpxor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpand		x0,   x3, x3; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x4, x4; \
+	vpxor		x1,   x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x1, x1; \
+	vpor		x1,   x3, tp; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   tp, x3; \
+	vpand		x1,   x0, x0; \
+	vpxor		x2,   x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4)     \
+	vpand		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x3,   x1, x1; \
+	vpand		x0,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, tp; \
+	vpxor		RNOT, x2, x2; \
+	vpor		x1,   x0, x4; \
+	vpxor		x3,   x4, x4; \
+	vpand		x1,   x3, x3; \
+	vpxor		x2,   x1, x1; \
+	vpand		x4,   x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x4, x4; \
+	vpor		x3,   x1, x1; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x2, x2; \
+	vpor		x4,   tp, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x1,   x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpxor		RNOT, x3, tp; \
+	vpor		x2,   tp, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x4; \
+	vpxor		x1,   tp, x3; \
+	vpor		x2,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4)     \
+	vpxor		x4,   x1, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpand		x2,   x1, tp; \
+	vpxor		x0,   tp, tp; \
+	vpor		x1,   x0, x0; \
+	vpxor		x3,   x1, x4; \
+	vpxor		x3,   x0, x0; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x1,   x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x0, tp; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		RNOT, x0, x4; \
+	vpxor		tp,   x1, x1; \
+	vpxor		x2,   tp, x0; \
+	vpand		x4,   x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x0, x0; \
+	vpand		x2,   x3, x3; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x1,   x3, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4)     \
+	vpor		x2,   x1, tp; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x3,   tp, tp; \
+	vpand		x1,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpor		x0,   x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4)     \
+	vpxor		tp,   x1, x4; \
+	vpxor		x4,   x2, x2; \
+	vpand		x0,   x4, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x3,   tp, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x3,   x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4)     \
+	vpxor		x2,   x0, x0; \
+	vpand		x3,   x0, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpand		tp,   x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4)     \
+	vpxor		RNOT, tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpand		x2,   x1, x1; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x2,   x4, x4; \
+	vpxor		x1,   tp, x0; \
+	vpxor		x0,   x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4)     \
+	vpand		x0,   x3, tp; \
+	vpxor		x2,   x0, x0; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpor		tp,   x1, x1; \
+	vpxor		x0,   x4, x4; \
+	vpand		x2,   x0, x0; \
+	vpxor		x1,   x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4)     \
+	vpand		x2,   x1, x1; \
+	vpxor		x2,   tp, x3; \
+	vpxor		x3,   x4, x4; \
+	vpand		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x4,   x3, x3; \
+	vpand		x0,   x4, x4; \
+	vpxor		x2,   x4, x4;
+
+#define get_key(i, j, t) \
+	vbroadcastss (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	vpxor RK0,	x0 ## 1, x0 ## 1; \
+	vpxor RK1,	x1 ## 1, x1 ## 1; \
+	vpxor RK2,	x2 ## 1, x2 ## 1; \
+	vpxor RK3,	x3 ## 1, x3 ## 1; \
+		vpxor RK0,	x0 ## 2, x0 ## 2; \
+		vpxor RK1,	x1 ## 2, x1 ## 2; \
+		vpxor RK2,	x2 ## 2, x2 ## 2; \
+		vpxor RK3,	x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	vpslld $13,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+		vpslld $13,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $1,		x1 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	get_key(i, 1, RK1); \
+		vpslld $1,		x1 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		get_key(i, 3, RK3); \
+	vpslld $7,		x3 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+		vpslld $7,		x3 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		get_key(i, 2, RK2); \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpslld $5,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $22,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpslld $5,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $22,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+	vpsrld $5,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpsrld $22,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;     \
+		vpsrld $5,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpsrld $22,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $1,		x1 ## 1, x4 ## 1;          \
+	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpsrld $1,		x1 ## 2, x4 ## 2;          \
+		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpsrld $7,		x3 ## 1, x4 ## 1;          \
+	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+		vpsrld $7,		x3 ## 2, x4 ## 2;          \
+		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $13,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $3,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+		vpsrld $13,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpsrld $3,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 3, RK3); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq		x1, x0, t0; \
+	vpunpckhdq		x1, x0, t2; \
+	vpunpckldq		x3, x2, t1; \
+	vpunpckhdq		x3, x2, x3; \
+	\
+	vpunpcklqdq		t1, t0, x0; \
+	vpunpckhqdq		t1, t0, x1; \
+	vpunpcklqdq		x3, t2, x2; \
+	vpunpckhqdq		x3, t2, x3;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	vmovdqu (0*4*4)(in),	x0; \
+	vmovdqu (1*4*4)(in),	x1; \
+	vmovdqu (2*4*4)(in),	x2; \
+	vmovdqu (3*4*4)(in),	x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vmovdqu x0,		(0*4*4)(out); \
+	vmovdqu x1,		(1*4*4)(out); \
+	vmovdqu x2,		(2*4*4)(out); \
+	vmovdqu x3,		(3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpxor (0*4*4)(out),	x0, x0;       \
+	vmovdqu x0,		(0*4*4)(out); \
+	vpxor (1*4*4)(out),	x1, x1;       \
+	vmovdqu x1,		(1*4*4)(out); \
+	vpxor (2*4*4)(out),	x2, x2;       \
+	vmovdqu x2,		(2*4*4)(out); \
+	vpxor (3*4*4)(out),	x3, x3;       \
+	vmovdqu x3,		(3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way_avx
+.type   __serpent_enc_blk_8way_avx,@function;
+
+__serpent_enc_blk_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+__enc_xor8:
+	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_8way_avx
+.type   serpent_dec_blk_8way_avx,@function;
+
+serpent_dec_blk_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	leaq (4*4*4)(%rsi), %rax;
+	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 000000000000..b36bdac237eb
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,636 @@
+/*
+ * Glue Code for AVX assembler versions of Serpent Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Glue code based on serpent_sse2_glue.c by:
+ *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	unsigned int j;
+
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		ivs[j] = src[j];
+
+	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+{
+	be128 ctrblk;
+
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
+
+	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+				   u128 *iv)
+{
+	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+	unsigned int i;
+
+	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+		if (dst != src)
+			dst[i] = src[i];
+
+		u128_to_be128(&ctrblks[i], iv);
+		u128_inc(iv);
+	}
+
+	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				     dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
+			      NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
+							SERPENT_BLOCK_SIZE);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen -
+						SERPENT_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
+static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg serpent_algs[10] = { {
+	.cra_name		= "__ecb-serpent-avx",
+	.cra_driver_name	= "__driver-ecb-serpent-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-serpent-avx",
+	.cra_driver_name	= "__driver-cbc-serpent-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-serpent-avx",
+	.cra_driver_name	= "__driver-ctr-serpent-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-serpent-avx",
+	.cra_driver_name	= "__driver-lrw-serpent-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[3].cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-serpent-avx",
+	.cra_driver_name	= "__driver-xts-serpent-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-avx",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-avx",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-avx",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-avx",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-avx",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(serpent_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init serpent_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave) {
+		printk(KERN_INFO "AVX instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		printk(KERN_INFO "AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
+}
+
+static void __exit serpent_exit(void)
+{
+	crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs));
+}
+
+module_init(serpent_init);
+module_exit(serpent_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4b21be85e0a1..d679c8675f4a 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -41,358 +41,145 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/i387.h>
-#include <asm/serpent.h>
-#include <crypto/scatterwalk.h>
-#include <linux/workqueue.h>
-#include <linux/spinlock.h>
-
-struct async_serpent_ctx {
-	struct cryptd_ablkcipher *cryptd_tfm;
-};
+#include <asm/crypto/serpent-sse2.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
 
-static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-	if (fpu_enabled)
-		return true;
-
-	/* SSE2 is only used when chunk to be processed is large enough, so
-	 * do not enable FPU until it is necessary.
-	 */
-	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
-		return false;
-
-	kernel_fpu_begin();
-	return true;
-}
-
-static inline void serpent_fpu_end(bool fpu_enabled)
+static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
 {
-	if (fpu_enabled)
-		kernel_fpu_end();
-}
-
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     bool enc)
-{
-	bool fpu_enabled = false;
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-
-		/* Process multi-block batch */
-		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-			do {
-				if (enc)
-					serpent_enc_blk_xway(ctx, wdst, wsrc);
-				else
-					serpent_dec_blk_xway(ctx, wdst, wsrc);
-
-				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
-				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
-				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
-			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			if (enc)
-				__serpent_encrypt(ctx, wdst, wsrc);
-			else
-				__serpent_decrypt(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	unsigned int j;
 
-	serpent_fpu_end(fpu_enabled);
-	return err;
-}
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		ivs[j] = src[j];
 
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
+	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, true);
+	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
 }
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 {
-	struct blkcipher_walk walk;
+	be128 ctrblk;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, false);
-}
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
 
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 *iv = (u128 *)walk->iv;
-
-	do {
-		u128_xor(dst, src, iv);
-		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
-	return nbytes;
+	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+				   u128 *iv)
 {
-	struct blkcipher_walk walk;
-	int err;
+	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+	unsigned int i;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+		if (dst != src)
+			dst[i] = src[i];
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
+		u128_to_be128(&ctrblks[i], iv);
+		u128_inc(iv);
 	}
 
-	return err;
+	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
 
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
-	u128 last_iv;
-	int i;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-		do {
-			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
-			src -= SERPENT_PARALLEL_BLOCKS - 1;
-			dst -= SERPENT_PARALLEL_BLOCKS - 1;
-
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
-				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 
-			u128_xor(dst, dst, src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
 
-		u128_xor(dst, dst, src - 1);
-		src -= 1;
-		dst -= 1;
-	}
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+	} }
+};
 
-done:
-	u128_xor(dst, dst, (u128 *)walk->iv);
-	*(u128 *)walk->iv = last_iv;
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
 
-	return nbytes;
-}
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = SERPENT_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes)) {
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	serpent_fpu_end(fpu_enabled);
-	return err;
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
 }
 
-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
 }
 
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				     dst, src, nbytes);
 }
 
-static inline void u128_inc(u128 *i)
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
 }
 
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
 {
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[SERPENT_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	__serpent_encrypt(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
 }
 
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
 {
-	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	const unsigned int bsize = SERPENT_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
-	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
-	int i;
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
-	/* Process multi-block batch */
-	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
-		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				u128_to_be128(&ctrblocks[i], &ctrblk);
-				u128_inc(&ctrblk);
-			}
-
-			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
-						 (u8 *)ctrblocks);
-
-			src += SERPENT_PARALLEL_BLOCKS;
-			dst += SERPENT_PARALLEL_BLOCKS;
-			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
-		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		if (dst != src)
-			*dst = *src;
-
-		u128_to_be128(&ctrblocks[0], &ctrblk);
-		u128_inc(&ctrblk);
-
-		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		u128_xor(dst, dst, (u128 *)ctrblocks);
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
-	return nbytes;
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS,
+			      NULL, fpu_enabled, nbytes);
 }
 
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		     struct scatterlist *src, unsigned int nbytes)
+static inline void serpent_fpu_end(bool fpu_enabled)
 {
-	bool fpu_enabled = false;
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
-		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	serpent_fpu_end(fpu_enabled);
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
+	glue_fpu_end(fpu_enabled);
 }
 
 struct crypt_priv {
@@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
-static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
-			unsigned int key_len)
-{
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
-	int err;
-
-	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
-	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
-				    & CRYPTO_TFM_REQ_MASK);
-	err = crypto_ablkcipher_setkey(child, key, key_len);
-	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
-				    & CRYPTO_TFM_RES_MASK);
-	return err;
-}
-
-static int __ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-	struct blkcipher_desc desc;
-
-	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-	desc.info = req->info;
-	desc.flags = 0;
-
-	return crypto_blkcipher_crt(desc.tfm)->encrypt(
-		&desc, req->dst, req->src, req->nbytes);
-}
-
-static int ablk_encrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_encrypt(cryptd_req);
-	} else {
-		return __ablk_encrypt(req);
-	}
-}
-
-static int ablk_decrypt(struct ablkcipher_request *req)
-{
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
-	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct ablkcipher_request *cryptd_req =
-			ablkcipher_request_ctx(req);
-
-		memcpy(cryptd_req, req, sizeof(*req));
-		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-
-		return crypto_ablkcipher_decrypt(cryptd_req);
-	} else {
-		struct blkcipher_desc desc;
-
-		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
-		desc.info = req->info;
-		desc.flags = 0;
-
-		return crypto_blkcipher_crt(desc.tfm)->decrypt(
-			&desc, req->dst, req->src, req->nbytes);
-	}
-}
-
-static void ablk_exit(struct crypto_tfm *tfm)
-{
-	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
-
-	cryptd_free_ablkcipher(ctx->cryptd_tfm);
-}
-
-static int ablk_init(struct crypto_tfm *tfm)
-{
-	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct cryptd_ablkcipher *cryptd_tfm;
-	char drv_name[CRYPTO_MAX_ALG_NAME];
-
-	snprintf(drv_name, sizeof(drv_name), "__driver-%s",
-					crypto_tfm_alg_driver_name(tfm));
-
-	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0);
-	if (IS_ERR(cryptd_tfm))
-		return PTR_ERR(cryptd_tfm);
-
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
-		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
-
-	return 0;
-}
-
 static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ecb-serpent-sse2",
 	.cra_driver_name	= "__driver-ecb-serpent-sse2",
@@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
@@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_priority		= 400,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index b2c2f57d70e8..49d6987a73d9 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -468,7 +468,7 @@ W_PRECALC_SSSE3
  */
 SHA1_VECTOR_ASM     sha1_transform_ssse3
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 
 .macro W_PRECALC_AVX
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f916499d0abe..4a11a9d72451 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -35,7 +35,7 @@
 
 asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
 				     unsigned int rounds);
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 				   unsigned int rounds);
 #endif
@@ -184,7 +184,7 @@ static struct shash_alg alg = {
 	}
 };
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
 	u64 xcr0;
@@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void)
 	if (cpu_has_ssse3)
 		sha1_transform_asm = sha1_transform_ssse3;
 
-#ifdef SHA1_ENABLE_AVX_SUPPORT
+#ifdef CONFIG_AS_AVX
 	/* allow AVX to override SSSE3, it's a little faster */
 	if (avx_usable())
 		sha1_transform_asm = sha1_transform_avx;
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..35f45574390d
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,300 @@
+/*
+ * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "twofish-avx-x86_64-asm_64.S"
+.text
+
+/* structure of crypto context */
+#define s0	0
+#define s1	1024
+#define s2	2048
+#define s3	3072
+#define w	4096
+#define k	4128
+
+/**********************************************************************
+  8-way AVX twofish
+ **********************************************************************/
+#define CTX %rdi
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX %xmm8
+#define RY %xmm9
+
+#define RK1 %xmm10
+#define RK2 %xmm11
+
+#define RID1  %rax
+#define RID1b %al
+#define RID2  %rbx
+#define RID2b %bl
+
+#define RGI1   %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2   %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGS1  %r8
+#define RGS1d %r8d
+#define RGS2  %r9
+#define RGS2d %r9d
+#define RGS3  %r10
+#define RGS3d %r10d
+
+
+#define lookup_32bit(t0, t1, t2, t3, src, dst) \
+	movb		src ## bl,        RID1b;     \
+	movb		src ## bh,        RID2b;     \
+	movl		t0(CTX, RID1, 4), dst ## d;  \
+	xorl		t1(CTX, RID2, 4), dst ## d;  \
+	shrq $16,	src;                         \
+	movb		src ## bl,        RID1b;     \
+	movb		src ## bh,        RID2b;     \
+	xorl		t2(CTX, RID1, 4), dst ## d;  \
+	xorl		t3(CTX, RID2, 4), dst ## d;
+
+#define G(a, x, t0, t1, t2, t3) \
+	vmovq		a,    RGI1;               \
+	vpsrldq $8,	a,    x;                  \
+	vmovq		x,    RGI2;               \
+	\
+	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
+	shrq $16,	RGI1;                     \
+	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
+	shlq $32,	RGS2;                     \
+	orq		RGS1, RGS2;               \
+	\
+	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
+	shrq $16,	RGI2;                     \
+	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
+	shlq $32,	RGS3;                     \
+	orq		RGS1, RGS3;               \
+	\
+	vmovq		RGS2, x;                  \
+	vpinsrq $1,	RGS3, x, x;
+
+#define encround(a, b, c, d, x, y) \
+	G(a, x, s0, s1, s2, s3);           \
+	G(b, y, s1, s2, s3, s0);           \
+	vpaddd			x, y,   x; \
+	vpaddd			y, x,   y; \
+	vpaddd			x, RK1, x; \
+	vpaddd			y, RK2, y; \
+	vpxor			x, c,   c; \
+	vpsrld $1,		c, x;      \
+	vpslld $(32 - 1),	c, c;      \
+	vpor			c, x,   c; \
+	vpslld $1,		d, x;      \
+	vpsrld $(32 - 1),	d, d;      \
+	vpor			d, x,   d; \
+	vpxor			d, y,   d;
+
+#define decround(a, b, c, d, x, y) \
+	G(a, x, s0, s1, s2, s3);           \
+	G(b, y, s1, s2, s3, s0);           \
+	vpaddd			x, y,   x; \
+	vpaddd			y, x,   y; \
+	vpaddd			y, RK2, y; \
+	vpxor			d, y,   d; \
+	vpsrld $1,		d, y;      \
+	vpslld $(32 - 1),	d, d;      \
+	vpor			d, y,   d; \
+	vpslld $1,		c, y;      \
+	vpsrld $(32 - 1),	c, c;      \
+	vpor			c, y,   c; \
+	vpaddd			x, RK1, x; \
+	vpxor			x, c,   c;
+
+#define encrypt_round(n, a, b, c, d) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
+	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
+	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+
+#define decrypt_round(n, a, b, c, d) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
+	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
+	decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+
+#define encrypt_cycle(n) \
+	encrypt_round((2*n), RA, RB, RC, RD);       \
+	encrypt_round(((2*n) + 1), RC, RD, RA, RB);
+
+#define decrypt_cycle(n) \
+	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
+	decrypt_round((2*n), RA, RB, RC, RD);
+
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq		x1, x0, t0; \
+	vpunpckhdq		x1, x0, t2; \
+	vpunpckldq		x3, x2, t1; \
+	vpunpckhdq		x3, x2, x3; \
+	\
+	vpunpcklqdq		t1, t0, x0; \
+	vpunpckhqdq		t1, t0, x1; \
+	vpunpcklqdq		x3, t2, x2; \
+	vpunpckhqdq		x3, t2, x3;
+
+#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
+	vpxor (0*4*4)(in),	wkey, x0; \
+	vpxor (1*4*4)(in),	wkey, x1; \
+	vpxor (2*4*4)(in),	wkey, x2; \
+	vpxor (3*4*4)(in),	wkey, x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpxor		x0, wkey, x0;     \
+	vmovdqu 	x0, (0*4*4)(out); \
+	vpxor		x1, wkey, x1;     \
+	vmovdqu		x1, (1*4*4)(out); \
+	vpxor		x2, wkey, x2;     \
+	vmovdqu		x2, (2*4*4)(out); \
+	vpxor		x3, wkey, x3;     \
+	vmovdqu		x3, (3*4*4)(out);
+
+#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	vpxor		x0, wkey, x0;         \
+	vpxor		(0*4*4)(out), x0, x0; \
+	vmovdqu 	x0, (0*4*4)(out);     \
+	vpxor		x1, wkey, x1;         \
+	vpxor		(1*4*4)(out), x1, x1; \
+	vmovdqu	        x1, (1*4*4)(out);     \
+	vpxor		x2, wkey, x2;         \
+	vpxor           (2*4*4)(out), x2, x2; \
+	vmovdqu		x2, (2*4*4)(out);     \
+	vpxor		x3, wkey, x3;         \
+	vpxor           (3*4*4)(out), x3, x3; \
+	vmovdqu		x3, (3*4*4)(out);
+
+.align 8
+.global __twofish_enc_blk_8way
+.type   __twofish_enc_blk_8way,@function;
+
+__twofish_enc_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	pushq %rbx;
+	pushq %rcx;
+
+	vmovdqu w(CTX), RK1;
+
+	leaq (4*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+
+	xorq RID1, RID1;
+	xorq RID2, RID2;
+
+	encrypt_cycle(0);
+	encrypt_cycle(1);
+	encrypt_cycle(2);
+	encrypt_cycle(3);
+	encrypt_cycle(4);
+	encrypt_cycle(5);
+	encrypt_cycle(6);
+	encrypt_cycle(7);
+
+	vmovdqu (w+4*4)(CTX), RK1;
+
+	popq %rcx;
+	popq %rbx;
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
+	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+
+	ret;
+
+__enc_xor8:
+	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
+	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+
+	ret;
+
+.align 8
+.global twofish_dec_blk_8way
+.type   twofish_dec_blk_8way,@function;
+
+twofish_dec_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %rbx;
+
+	vmovdqu (w+4*4)(CTX), RK1;
+
+	leaq (4*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
+	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+
+	xorq RID1, RID1;
+	xorq RID2, RID2;
+
+	decrypt_cycle(7);
+	decrypt_cycle(6);
+	decrypt_cycle(5);
+	decrypt_cycle(4);
+	decrypt_cycle(3);
+	decrypt_cycle(2);
+	decrypt_cycle(1);
+	decrypt_cycle(0);
+
+	vmovdqu (w)(CTX), RK1;
+
+	popq %rbx;
+
+	leaq (4*4*4)(%rsi), %rax;
+	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+
+	ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 000000000000..782b67ddaf6a
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,624 @@
+/*
+ * Glue Code for AVX assembler version of Twofish Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/twofish.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+#define TWOFISH_PARALLEL_BLOCKS 8
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+/* 8-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
+					    const u8 *src)
+{
+	__twofish_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	twofish_dec_blk_8way(ctx, dst, src);
+}
+
+static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
+	unsigned int j;
+
+	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
+		ivs[j] = src[j];
+
+	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
+		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+				     u128 *iv)
+{
+	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
+	unsigned int i;
+
+	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
+		if (dst != src)
+			dst[i] = src[i];
+
+		u128_to_be128(&ctrblks[i], iv);
+		u128_inc(iv);
+	}
+
+	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL,
+			      fpu_enabled, nbytes);
+}
+
+static inline void twofish_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct twofish_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
+		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+
+	nbytes %= bsize * 3;
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
+		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3)
+		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+
+	nbytes %= bsize * 3;
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TWOFISH_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TWOFISH_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TWOFISH_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TWOFISH_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg twofish_algs[10] = { {
+	.cra_name		= "__ecb-twofish-avx",
+	.cra_driver_name	= "__driver-ecb-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-twofish-avx",
+	.cra_driver_name	= "__driver-cbc-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-twofish-avx",
+	.cra_driver_name	= "__driver-ctr-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-twofish-avx",
+	.cra_driver_name	= "__driver-lrw-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[3].cra_list),
+	.cra_exit		= lrw_twofish_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-twofish-avx",
+	.cra_driver_name	= "__driver-xts-twofish-avx",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-avx",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(twofish_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init twofish_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave) {
+		printk(KERN_INFO "AVX instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		printk(KERN_INFO "AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
+}
+
+static void __exit twofish_exit(void)
+{
+	crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs));
+}
+
+module_init(twofish_init);
+module_exit(twofish_exit);
+
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 922ab24cce31..15f9347316c8 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -3,11 +3,6 @@
  *
  * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -33,20 +28,13 @@
 #include <crypto/algapi.h>
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/glue_helper.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 
-/* regular block cipher functions from twofish_x86_64 module */
-asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
-				const u8 *src);
-asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
-				const u8 *src);
-
-/* 3-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-				       const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-				     const u8 *src);
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, true);
 }
 
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
-		     void (*fn)(struct twofish_ctx *, u8 *, const u8 *),
-		     void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *))
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes;
-	int err;
-
-	err = blkcipher_walk_virt(desc, walk);
-
-	while ((nbytes = walk->nbytes)) {
-		u8 *wsrc = walk->src.virt.addr;
-		u8 *wdst = walk->dst.virt.addr;
-
-		/* Process three block batch */
-		if (nbytes >= bsize * 3) {
-			do {
-				fn_3way(ctx, wdst, wsrc);
-
-				wsrc += bsize * 3;
-				wdst += bsize * 3;
-				nbytes -= bsize * 3;
-			} while (nbytes >= bsize * 3);
-
-			if (nbytes < bsize)
-				goto done;
-		}
-
-		/* Handle leftovers */
-		do {
-			fn(ctx, wdst, wsrc);
-
-			wsrc += bsize;
-			wdst += bsize;
-			nbytes -= bsize;
-		} while (nbytes >= bsize);
-
-done:
-		err = blkcipher_walk_done(desc, walk, nbytes);
-	}
-
-	return err;
-}
-
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
 {
-	struct blkcipher_walk walk;
+	u128 ivs[2];
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way);
-}
+	ivs[0] = src[0];
+	ivs[1] = src[1];
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
+	twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way);
+	u128_xor(&dst[1], &dst[1], &ivs[0]);
+	u128_xor(&dst[2], &dst[2], &ivs[1]);
 }
+EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
 
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 *iv = (u128 *)walk->iv;
-
-	do {
-		u128_xor(dst, src, iv);
-		twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
-		iv = dst;
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
-	return nbytes;
-}
-
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 {
-	struct blkcipher_walk walk;
-	int err;
+	be128 ctrblk;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+	if (dst != src)
+		*dst = *src;
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_encrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+	u128_to_be128(&ctrblk, iv);
+	u128_inc(iv);
 
-	return err;
+	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+	u128_xor(dst, dst, (u128 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
 
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-				  struct blkcipher_walk *walk)
+void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+				     u128 *iv)
 {
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ivs[3 - 1];
-	u128 last_iv;
-
-	/* Start of the last block. */
-	src += nbytes / bsize - 1;
-	dst += nbytes / bsize - 1;
-
-	last_iv = *src;
-
-	/* Process three block batch */
-	if (nbytes >= bsize * 3) {
-		do {
-			nbytes -= bsize * (3 - 1);
-			src -= 3 - 1;
-			dst -= 3 - 1;
-
-			ivs[0] = src[0];
-			ivs[1] = src[1];
-
-			twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
-
-			u128_xor(dst + 1, dst + 1, ivs + 0);
-			u128_xor(dst + 2, dst + 2, ivs + 1);
-
-			nbytes -= bsize;
-			if (nbytes < bsize)
-				goto done;
-
-			u128_xor(dst, dst, src - 1);
-			src -= 1;
-			dst -= 1;
-		} while (nbytes >= bsize * 3);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	for (;;) {
-		twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
-		nbytes -= bsize;
-		if (nbytes < bsize)
-			break;
+	be128 ctrblks[3];
 
-		u128_xor(dst, dst, src - 1);
-		src -= 1;
-		dst -= 1;
+	if (dst != src) {
+		dst[0] = src[0];
+		dst[1] = src[1];
+		dst[2] = src[2];
 	}
 
-done:
-	u128_xor(dst, dst, (u128 *)walk->iv);
-	*(u128 *)walk->iv = last_iv;
+	u128_to_be128(&ctrblks[0], iv);
+	u128_inc(iv);
+	u128_to_be128(&ctrblks[1], iv);
+	u128_inc(iv);
+	u128_to_be128(&ctrblks[2], iv);
+	u128_inc(iv);
 
-	return nbytes;
+	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-		       struct scatterlist *src, unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
 
-	while ((nbytes = walk.nbytes)) {
-		nbytes = __cbc_decrypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
 
-	return err;
-}
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = -1,
+
+	.funcs = { {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
 
-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
 }
 
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
 }
 
-static inline void u128_inc(u128 *i)
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
 }
 
-static void ctr_crypt_final(struct blkcipher_desc *desc,
-			    struct blkcipher_walk *walk)
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
 {
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	u8 *ctrblk = walk->iv;
-	u8 keystream[TF_BLOCK_SIZE];
-	u8 *src = walk->src.virt.addr;
-	u8 *dst = walk->dst.virt.addr;
-	unsigned int nbytes = walk->nbytes;
-
-	twofish_enc_blk(ctx, keystream, ctrblk);
-	crypto_xor(keystream, src, nbytes);
-	memcpy(dst, keystream, nbytes);
-
-	crypto_inc(ctrblk, TF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-				struct blkcipher_walk *walk)
-{
-	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	unsigned int bsize = TF_BLOCK_SIZE;
-	unsigned int nbytes = walk->nbytes;
-	u128 *src = (u128 *)walk->src.virt.addr;
-	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
-	be128 ctrblocks[3];
-
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
-
-	/* Process three block batch */
-	if (nbytes >= bsize * 3) {
-		do {
-			if (dst != src) {
-				dst[0] = src[0];
-				dst[1] = src[1];
-				dst[2] = src[2];
-			}
-
-			/* create ctrblks for parallel encrypt */
-			u128_to_be128(&ctrblocks[0], &ctrblk);
-			u128_inc(&ctrblk);
-			u128_to_be128(&ctrblocks[1], &ctrblk);
-			u128_inc(&ctrblk);
-			u128_to_be128(&ctrblocks[2], &ctrblk);
-			u128_inc(&ctrblk);
-
-			twofish_enc_blk_xor_3way(ctx, (u8 *)dst,
-						 (u8 *)ctrblocks);
-
-			src += 3;
-			dst += 3;
-			nbytes -= bsize * 3;
-		} while (nbytes >= bsize * 3);
-
-		if (nbytes < bsize)
-			goto done;
-	}
-
-	/* Handle leftovers */
-	do {
-		if (dst != src)
-			*dst = *src;
-
-		u128_to_be128(&ctrblocks[0], &ctrblk);
-		u128_inc(&ctrblk);
-
-		twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		u128_xor(dst, dst, (u128 *)ctrblocks);
-
-		src += 1;
-		dst += 1;
-		nbytes -= bsize;
-	} while (nbytes >= bsize);
-
-done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
-	return nbytes;
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
 }
 
 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		     struct scatterlist *src, unsigned int nbytes)
 {
-	struct blkcipher_walk walk;
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE);
-
-	while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) {
-		nbytes = __ctr_crypt(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, nbytes);
-	}
-
-	if (walk.nbytes) {
-		ctr_crypt_final(desc, &walk);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
 }
 
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
@@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
-struct twofish_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct twofish_ctx twofish_ctx;
-};
-
-static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 
 	return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_twofish_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return lrw_crypt(desc, dst, src, nbytes, &req);
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm);
 
-struct twofish_xts_ctx {
-	struct twofish_ctx tweak_ctx;
-	struct twofish_ctx crypt_ctx;
-};
-
-static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
 {
 	struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
 				flags);
 }
+EXPORT_SYMBOL_GPL(xts_twofish_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { {
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_list		= LIST_HEAD_INIT(tf_algs[3].cra_list),
-	.cra_exit		= lrw_exit_tfm,
+	.cra_exit		= lrw_twofish_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 49331bedc158..70780689599a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif	/* CONFIG_SMP */
 
+#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n"
+
+#define b_replacement(number)	"663"#number
+#define e_replacement(number)	"664"#number
+
+#define alt_slen "662b-661b"
+#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+
+#define ALTINSTR_ENTRY(feature, number)					      \
+	" .long 661b - .\n"				/* label           */ \
+	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \
+	" .word " __stringify(feature) "\n"		/* feature bit     */ \
+	" .byte " alt_slen "\n"				/* source len      */ \
+	" .byte " alt_rlen(number) "\n"			/* replacement len */
+
+#define DISCARD_ENTRY(number)				/* rlen <= slen */    \
+	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
+
+#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \
+	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)			\
-									\
-      "661:\n\t" oldinstr "\n662:\n"					\
-      ".section .altinstructions,\"a\"\n"				\
-      "	 .long 661b - .\n"			/* label           */	\
-      "	 .long 663f - .\n"			/* new instruction */	\
-      "	 .word " __stringify(feature) "\n"	/* feature bit     */	\
-      "	 .byte 662b-661b\n"			/* sourcelen       */	\
-      "	 .byte 664f-663f\n"			/* replacementlen  */	\
-      ".previous\n"							\
-      ".section .discard,\"aw\",@progbits\n"				\
-      "	 .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */	\
-      ".previous\n"							\
-      ".section .altinstr_replacement, \"ax\"\n"			\
-      "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
-      ".previous"
+	OLDINSTR(oldinstr)						\
+	".section .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY(feature, 1)					\
+	".previous\n"							\
+	".section .discard,\"aw\",@progbits\n"				\
+	DISCARD_ENTRY(1)						\
+	".previous\n"							\
+	".section .altinstr_replacement, \"ax\"\n"			\
+	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
+	".previous"
+
+#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+	OLDINSTR(oldinstr)						\
+	".section .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY(feature1, 1)					\
+	ALTINSTR_ENTRY(feature2, 2)					\
+	".previous\n"							\
+	".section .discard,\"aw\",@progbits\n"				\
+	DISCARD_ENTRY(1)						\
+	DISCARD_ENTRY(2)						\
+	".previous\n"							\
+	".section .altinstr_replacement, \"ax\"\n"			\
+	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
+	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
+	".previous"
 
 /*
  * This must be included *after* the definition of ALTERNATIVE due to
@@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end)
 		: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
 
 /*
+ * Like alternative_call, but there are two features and respective functions.
+ * If CPU has feature2, function2 is used.
+ * Otherwise, if CPU has feature1, function1 is used.
+ * Otherwise, old function is used.
+ */
+#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
+			   output, input...)				      \
+	asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
+		"call %P[new2]", feature2)				      \
+		: output : [old] "i" (oldfunc), [new1] "i" (newfunc1),	      \
+		[new2] "i" (newfunc2), ## input)
+
+/*
  * use this macro(s) if you need more than one output parameter
  * in alternative_io
  */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 49ad773f4b9f..b3341e9cd8fd 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -26,10 +26,31 @@ struct amd_l3_cache {
 	u8	 subcaches[4];
 };
 
+struct threshold_block {
+	unsigned int		block;
+	unsigned int		bank;
+	unsigned int		cpu;
+	u32			address;
+	u16			interrupt_enable;
+	bool			interrupt_capable;
+	u16			threshold_limit;
+	struct kobject		kobj;
+	struct list_head	miscj;
+};
+
+struct threshold_bank {
+	struct kobject		*kobj;
+	struct threshold_block	*blocks;
+
+	/* initialized to the number of CPUs on the node sharing this bank */
+	atomic_t		cpus;
+};
+
 struct amd_northbridge {
 	struct pci_dev *misc;
 	struct pci_dev *link;
 	struct amd_l3_cache l3_cache;
+	struct threshold_bank *bank4;
 };
 
 struct amd_northbridge_info {
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index eaff4790ed96..f34261296ffb 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -306,7 +306,8 @@ struct apic {
 	unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
 	unsigned long (*check_apicid_present)(int apicid);
 
-	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
+	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
+					 const struct cpumask *mask);
 	void (*init_apic_ldr)(void);
 
 	void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
@@ -331,9 +332,9 @@ struct apic {
 	unsigned long (*set_apic_id)(unsigned int id);
 	unsigned long apic_id_mask;
 
-	unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
-	unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
-					       const struct cpumask *andmask);
+	int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+				      const struct cpumask *andmask,
+				      unsigned int *apicid);
 
 	/* ipi */
 	void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)
 	return apic->safe_wait_icr_idle();
 }
 
+extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
+
 #else /* CONFIG_X86_LOCAL_APIC */
 
 static inline u32 apic_read(u32 reg) { return 0; }
@@ -473,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }
 static inline void apic_icr_write(u32 low, u32 high) { }
 static inline void apic_wait_icr_idle(void) { }
 static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
@@ -537,7 +541,12 @@ static inline const struct cpumask *default_target_cpus(void)
 #endif
 }
 
-DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+static inline const struct cpumask *online_target_cpus(void)
+{
+	return cpu_online_mask;
+}
+
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
 
 
 static inline unsigned int read_apic_id(void)
@@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
 
 #endif
 
-static inline unsigned int
-default_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static inline int
+flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+			    const struct cpumask *andmask,
+			    unsigned int *apicid)
 {
-	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+	unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
+				 cpumask_bits(andmask)[0] &
+				 cpumask_bits(cpu_online_mask)[0] &
+				 APIC_ALL_CPUS;
+
+	if (likely(cpu_mask)) {
+		*apicid = (unsigned int)cpu_mask;
+		return 0;
+	} else {
+		return -EINVAL;
+	}
 }
 
-static inline unsigned int
+extern int
 default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			       const struct cpumask *andmask)
+			       const struct cpumask *andmask,
+			       unsigned int *apicid);
+
+static inline void
+flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
+			      const struct cpumask *mask)
 {
-	unsigned long mask1 = cpumask_bits(cpumask)[0];
-	unsigned long mask2 = cpumask_bits(andmask)[0];
-	unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+	/* Careful. Some cpus do not strictly honor the set of cpus
+	 * specified in the interrupt destination when using lowest
+	 * priority interrupt delivery mode.
+	 *
+	 * In particular there was a hyperthreading cpu observed to
+	 * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt desitination.
+	 */
+	cpumask_clear(retmask);
+	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
 
-	return (unsigned int)(mask1 & mask2 & mask3);
+static inline void
+default_vector_allocation_domain(int cpu, struct cpumask *retmask,
+				 const struct cpumask *mask)
+{
+	cpumask_copy(retmask, cpumask_of(cpu));
 }
 
 static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a6983b277220..72f5009deb5a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
  * This operation is non-atomic and can be reordered.
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
+ *
+ * Note: the operation is performed atomically with respect to
+ * the local CPU, but not other CPUs. Portable code should not
+ * rely on this behaviour.
+ * KVM relies on this behaviour on x86 for modifying memory that is also
+ * accessed from a hypervisor on the same CPU if running in a VM: don't change
+ * this without also updating arch/x86/kernel/kvm.c
  */
 static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 {
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index eb45aa6b1f27..2ad874cb661c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -66,6 +66,7 @@ struct setup_header {
 	__u64	setup_data;
 	__u64	pref_address;
 	__u32	init_size;
+	__u32	handover_offset;
 } __attribute__((packed));
 
 struct sys_desc_table {
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f91e80f4f180..6b7ee5ff6820 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -207,6 +207,8 @@
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	(9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h
new file mode 100644
index 000000000000..4f93df50c23e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/ablk_helper.h
@@ -0,0 +1,31 @@
+/*
+ * Shared async block cipher helpers
+ */
+
+#ifndef _CRYPTO_ABLK_HELPER_H
+#define _CRYPTO_ABLK_HELPER_H
+
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <crypto/cryptd.h>
+
+struct async_helper_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len);
+
+extern int __ablk_encrypt(struct ablkcipher_request *req);
+
+extern int ablk_encrypt(struct ablkcipher_request *req);
+
+extern int ablk_decrypt(struct ablkcipher_request *req);
+
+extern void ablk_exit(struct crypto_tfm *tfm);
+
+extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name);
+
+extern int ablk_init(struct crypto_tfm *tfm);
+
+#endif /* _CRYPTO_ABLK_HELPER_H */
diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h
index 80545a1cbe39..80545a1cbe39 100644
--- a/arch/x86/include/asm/aes.h
+++ b/arch/x86/include/asm/crypto/aes.h
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
new file mode 100644
index 000000000000..3e408bddc96f
--- /dev/null
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -0,0 +1,115 @@
+/*
+ * Shared glue code for 128bit block ciphers
+ */
+
+#ifndef _CRYPTO_GLUE_HELPER_H
+#define _CRYPTO_GLUE_HELPER_H
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <asm/i387.h>
+#include <crypto/b128ops.h>
+
+typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
+typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
+typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
+				       u128 *iv);
+
+#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
+#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
+#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+
+struct common_glue_func_entry {
+	unsigned int num_blocks; /* number of blocks that @fn will process */
+	union {
+		common_glue_func_t ecb;
+		common_glue_cbc_func_t cbc;
+		common_glue_ctr_func_t ctr;
+	} fn_u;
+};
+
+struct common_glue_ctx {
+	unsigned int num_funcs;
+	int fpu_blocks_limit; /* -1 means fpu not needed at all */
+
+	/*
+	 * First funcs entry must have largest num_blocks and last funcs entry
+	 * must have num_blocks == 1!
+	 */
+	struct common_glue_func_entry funcs[];
+};
+
+static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
+				  struct blkcipher_desc *desc,
+				  bool fpu_enabled, unsigned int nbytes)
+{
+	if (likely(fpu_blocks_limit < 0))
+		return false;
+
+	if (fpu_enabled)
+		return true;
+
+	/*
+	 * Vector-registers are only used when chunk to be processed is large
+	 * enough, so do not enable FPU until it is necessary.
+	 */
+	if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
+		return false;
+
+	if (desc) {
+		/* prevent sleeping if FPU is in use */
+		desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	}
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void glue_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes);
+
+extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn,
+				   struct blkcipher_desc *desc,
+				   struct scatterlist *dst,
+				   struct scatterlist *src,
+				   unsigned int nbytes);
+
+extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+				   struct blkcipher_desc *desc,
+				   struct scatterlist *dst,
+				   struct scatterlist *src,
+				   unsigned int nbytes);
+
+extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes);
+
+#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
new file mode 100644
index 000000000000..432deedd2945
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -0,0 +1,32 @@
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					   const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	__serpent_enc_blk_8way_avx(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src)
+{
+	__serpent_enc_blk_8way_avx(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	serpent_dec_blk_8way_avx(ctx, dst, src);
+}
+
+#endif
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h
index d3ef63fe0c81..e6e77dffbdab 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -1,5 +1,5 @@
-#ifndef ASM_X86_SERPENT_H
-#define ASM_X86_SERPENT_H
+#ifndef ASM_X86_SERPENT_SSE2_H
+#define ASM_X86_SERPENT_SSE2_H
 
 #include <linux/crypto.h>
 #include <crypto/serpent.h>
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
new file mode 100644
index 000000000000..9d2c514bd5f9
--- /dev/null
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -0,0 +1,46 @@
+#ifndef ASM_X86_TWOFISH_H
+#define ASM_X86_TWOFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/twofish.h>
+#include <crypto/lrw.h>
+#include <crypto/b128ops.h>
+
+struct twofish_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct twofish_ctx twofish_ctx;
+};
+
+struct twofish_xts_ctx {
+	struct twofish_ctx tweak_ctx;
+	struct twofish_ctx crypt_ctx;
+};
+
+/* regular block cipher functions from twofish_x86_64 module */
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+				const u8 *src);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+				const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+/* helpers from twofish_x86_64-3way module */
+extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
+extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
+				u128 *iv);
+extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+				     u128 *iv);
+
+extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index cc70c1c78ca4..75ce3f47d204 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -4,9 +4,7 @@
 enum reboot_type {
 	BOOT_TRIPLE = 't',
 	BOOT_KBD = 'k',
-#ifdef CONFIG_X86_32
 	BOOT_BIOS = 'b',
-#endif
 	BOOT_ACPI = 'a',
 	BOOT_EFI = 'e',
 	BOOT_CF9 = 'p',
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 0baa628e330c..40afa0005c69 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
-
-.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if NUM_INVALIDATE_TLB_VECTORS > \idx
-BUILD_INTERRUPT3(invalidate_interrupt\idx,
-		 (INVALIDATE_TLB_VECTOR_START)+\idx,
-		 smp_invalidate_interrupt)
-.endif
-.endr
 #endif
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index dbe82a5c5eac..d3d74698dce9 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
 		virtual_dma_residue += virtual_dma_count;
 		virtual_dma_count = 0;
 #ifdef TRACE_FLPY_INT
-		printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+		printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
 		       virtual_dma_count, virtual_dma_residue, calls, bytes,
 		       dma_wait);
 		calls = 0;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 7a15153c675d..b518c7509933 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
 extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
 
 static inline bool hypervisor_x2apic_available(void)
 {
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index dffc38ee6255..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int iommu_pass_through;
-extern int iommu_group_mf;
 
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4b4448761e88..1508e518c7e3 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -119,17 +119,6 @@
  */
 #define LOCAL_TIMER_VECTOR		0xef
 
-/* up to 32 vectors used for spreading out TLB flushes: */
-#if NR_CPUS <= 32
-# define NUM_INVALIDATE_TLB_VECTORS	(NR_CPUS)
-#else
-# define NUM_INVALIDATE_TLB_VECTORS	(32)
-#endif
-
-#define INVALIDATE_TLB_VECTOR_END	(0xee)
-#define INVALIDATE_TLB_VECTOR_START	\
-	(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
-
 #define NR_VECTORS			 256
 
 #define FPU_IRQ				  13
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index e7d1c194d272..246617efd67f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -12,6 +12,7 @@
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1ac46c22dd50..c764f43b71c5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -192,8 +192,8 @@ struct x86_emulate_ops {
 			 struct x86_instruction_info *info,
 			 enum x86_intercept_stage stage);
 
-	bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
-			 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
+			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -280,9 +280,9 @@ struct x86_emulate_ctxt {
 	u8 modrm_seg;
 	bool rip_relative;
 	unsigned long _eip;
+	struct operand memop;
 	/* Fields above regs are cleared together. */
 	unsigned long regs[NR_VCPU_REGS];
-	struct operand memop;
 	struct operand *memopp;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index db7c1f2709a2..09155d64cf7e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,12 +48,13 @@
 
 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
 #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
+#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |	\
 				  0xFFFFFF0000000000ULL)
 #define CR4_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
-			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
+			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
@@ -175,6 +176,13 @@ enum {
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC	0
+/*
+ * The following bit is set with PV-EOI, unset on EOI.
+ * We detect PV-EOI changes by guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING	1
 
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
@@ -313,8 +321,8 @@ struct kvm_pmu {
 	u64 counter_bitmask[2];
 	u64 global_ctrl_mask;
 	u8 version;
-	struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
-	struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
+	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
+	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
 	struct irq_work irq_work;
 	u64 reprogram_pmi;
 };
@@ -484,6 +492,11 @@ struct kvm_vcpu_arch {
 		u64 length;
 		u64 status;
 	} osvw;
+
+	struct {
+		u64 msr_val;
+		struct gfn_to_hva_cache data;
+	} pv_eoi;
 };
 
 struct kvm_lpage_info {
@@ -661,6 +674,7 @@ struct kvm_x86_ops {
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
+	bool (*invpcid_supported)(void);
 	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
+				       int irq_source_id, int level)
+{
+	/* Logical OR for level trig interrupt */
+	if (level)
+		__set_bit(irq_source_id, irq_state);
+	else
+		__clear_bit(irq_source_id, irq_state);
+
+	return !!(*irq_state);
+}
+
+int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
+void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 63ab1661d00e..2f7712e08b1e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -22,6 +22,7 @@
 #define KVM_FEATURE_CLOCKSOURCE2        3
 #define KVM_FEATURE_ASYNC_PF		4
 #define KVM_FEATURE_STEAL_TIME		5
+#define KVM_FEATURE_PV_EOI		6
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 #define MSR_KVM_STEAL_TIME  0x4b564d03
+#define MSR_KVM_PV_EOI_EN      0x4b564d04
 
 struct kvm_steal_time {
 	__u64 steal;
@@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {
 	__u32 enabled;
 };
 
+#define KVM_PV_EOI_BIT 0
+#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
+#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
+#define KVM_PV_EOI_DISABLED 0x0
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 441520e4174f..a3ac52b29cbf 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -33,6 +33,14 @@
 #define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
 #define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
 #define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
+#define MCACOD		  0xffff     /* MCA Error Code */
+
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK	0xfff0
+#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
+#define MCACOD_DATA	0x0134	/* Data Load */
+#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 /* MCi_MISC register defines */
 #define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274cd..813ed103f45e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 
 extern unsigned long long native_read_tsc(void);
 
-extern int native_rdmsr_safe_regs(u32 regs[8]);
-extern int native_wrmsr_safe_regs(u32 regs[8]);
+extern int rdmsr_safe_regs(u32 regs[8]);
+extern int wrmsr_safe_regs(u32 regs[8]);
 
 static __always_inline unsigned long long __native_read_tsc(void)
 {
@@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
-{
-	u32 gprs[8] = { 0 };
-	int err;
-
-	gprs[1] = msr;
-	gprs[7] = 0x9c5a203a;
-
-	err = native_rdmsr_safe_regs(gprs);
-
-	*p = gprs[0] | ((u64)gprs[2] << 32);
-
-	return err;
-}
-
-static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
-{
-	u32 gprs[8] = { 0 };
-
-	gprs[0] = (u32)val;
-	gprs[1] = msr;
-	gprs[2] = val >> 32;
-	gprs[7] = 0x9c5a203a;
-
-	return native_wrmsr_safe_regs(gprs);
-}
-
-static inline int rdmsr_safe_regs(u32 regs[8])
-{
-	return native_rdmsr_safe_regs(regs);
-}
-
-static inline int wrmsr_safe_regs(u32 regs[8])
-{
-	return native_wrmsr_safe_regs(regs);
-}
-
 #define rdtscl(low)						\
 	((low) = (u32)__native_read_tsc())
 
@@ -237,6 +200,8 @@ do {							\
 	(high) = (u32)(_l >> 32);			\
 } while (0)
 
+#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
+
 #define rdtscp(low, high, aux)					\
 do {                                                            \
 	unsigned long long _val = native_read_tscp(&(aux));     \
@@ -248,8 +213,7 @@ do {                                                            \
 
 #endif	/* !CONFIG_PARAVIRT */
 
-
-#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val),		\
+#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val),		\
 					     (u32)((val) >> 32))
 
 #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index dc580c42851c..c0fa356e90de 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,28 +44,14 @@ struct nmiaction {
 	const char		*name;
 };
 
-#define register_nmi_handler(t, fn, fg, n)		\
+#define register_nmi_handler(t, fn, fg, n, init...)	\
 ({							\
-	static struct nmiaction fn##_na = {		\
+	static struct nmiaction init fn##_na = {	\
 		.handler = (fn),			\
 		.name = (n),				\
 		.flags = (fg),				\
 	};						\
-	__register_nmi_handler((t), &fn##_na);	\
-})
-
-/*
- * For special handlers that register/unregister in the
- * init section only.  This should be considered rare.
- */
-#define register_nmi_handler_initonly(t, fn, fg, n)		\
-({							\
-	static struct nmiaction fn##_na __initdata = {		\
-		.handler = (fn),			\
-		.name = (n),				\
-		.flags = (fg),				\
-	};						\
-	__register_nmi_handler((t), &fn##_na);	\
+	__register_nmi_handler((t), &fn##_na);		\
 })
 
 int __register_nmi_handler(unsigned int, struct nmiaction *);
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 87bdbca72f94..72f9adf6eca4 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -100,25 +100,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value);
 
 extern int pci_olpc_init(void);
 
-/* EC related functions */
-
-extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
-		unsigned char *outbuf, size_t outlen);
-
-/* EC commands */
-
-#define EC_FIRMWARE_REV			0x08
-#define EC_WRITE_SCI_MASK		0x1b
-#define EC_WAKE_UP_WLAN			0x24
-#define EC_WLAN_LEAVE_RESET		0x25
-#define EC_READ_EB_MODE			0x2a
-#define EC_SET_SCI_INHIBIT		0x32
-#define EC_SET_SCI_INHIBIT_RELEASE	0x34
-#define EC_WLAN_ENTER_RESET		0x35
-#define EC_WRITE_EXT_SCI_MASK		0x38
-#define EC_SCI_QUERY			0x84
-#define EC_EXT_SCI_QUERY		0x85
-
 /* SCI source values */
 
 #define EC_SCI_SRC_EMPTY	0x00
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6cbbabf52707..a0facf3908d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)
 	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
 }
 
-static inline int paravirt_rdmsr_regs(u32 *regs)
-{
-	return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
-}
-
 static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
 {
 	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
 }
 
-static inline int paravirt_wrmsr_regs(u32 *regs)
-{
-	return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
-}
-
 /* These should all do BUG_ON(_err), but our headers are too tangled. */
 #define rdmsr(msr, val1, val2)			\
 do {						\
@@ -176,9 +166,6 @@ do {						\
 	_err;					\
 })
 
-#define rdmsr_safe_regs(regs)	paravirt_rdmsr_regs(regs)
-#define wrmsr_safe_regs(regs)	paravirt_wrmsr_regs(regs)
-
 static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 {
 	int err;
@@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	*p = paravirt_read_msr(msr, &err);
 	return err;
 }
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
-{
-	u32 gprs[8] = { 0 };
-	int err;
-
-	gprs[1] = msr;
-	gprs[7] = 0x9c5a203a;
-
-	err = paravirt_rdmsr_regs(gprs);
-
-	*p = gprs[0] | ((u64)gprs[2] << 32);
-
-	return err;
-}
-
-static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
-{
-	u32 gprs[8] = { 0 };
-
-	gprs[0] = (u32)val;
-	gprs[1] = msr;
-	gprs[2] = val >> 32;
-	gprs[7] = 0x9c5a203a;
-
-	return paravirt_wrmsr_regs(gprs);
-}
 
 static inline u64 paravirt_read_tsc(void)
 {
@@ -252,6 +213,8 @@ do {						\
 	high = _l >> 32;			\
 } while (0)
 
+#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
+
 static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
 {
 	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
@@ -397,9 +360,10 @@ static inline void __flush_tlb_single(unsigned long addr)
 
 static inline void flush_tlb_others(const struct cpumask *cpumask,
 				    struct mm_struct *mm,
-				    unsigned long va)
+				    unsigned long start,
+				    unsigned long end)
 {
-	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
+	PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
 }
 
 static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8e8b9a4987ee..142236ed83af 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -153,9 +153,7 @@ struct pv_cpu_ops {
 	/* MSR, PMC and TSR operations.
 	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
 	u64 (*read_msr)(unsigned int msr, int *err);
-	int (*rdmsr_regs)(u32 *regs);
 	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
-	int (*wrmsr_regs)(u32 *regs);
 
 	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(int counter);
@@ -250,7 +248,8 @@ struct pv_mmu_ops {
 	void (*flush_tlb_single)(unsigned long addr);
 	void (*flush_tlb_others)(const struct cpumask *cpus,
 				 struct mm_struct *mm,
-				 unsigned long va);
+				 unsigned long start,
+				 unsigned long end);
 
 	/* Hooks for allocating and freeing a pagetable top-level */
 	int  (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b3a531746026..73e8eeff22ee 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -7,9 +7,13 @@
 #undef DEBUG
 
 #ifdef DEBUG
-#define DBG(x...) printk(x)
+#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
 #else
-#define DBG(x...)
+#define DBG(fmt, ...)				\
+do {						\
+	if (0)					\
+		printk(fmt, ##__VA_ARGS__);	\
+} while (0)
 #endif
 
 #define PCI_PROBE_BIOS		0x0001
@@ -100,6 +104,7 @@ struct pci_raw_ops {
 extern const struct pci_raw_ops *raw_pci_ops;
 extern const struct pci_raw_ops *raw_pci_ext_ops;
 
+extern const struct pci_raw_ops pci_mmcfg;
 extern const struct pci_raw_ops pci_direct_conf1;
 extern bool port_cf9_safe;
 
@@ -135,6 +140,12 @@ struct pci_mmcfg_region {
 
 extern int __init pci_mmcfg_arch_init(void);
 extern void __init pci_mmcfg_arch_free(void);
+extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
+extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
+extern int __devinit pci_mmconfig_insert(struct device *dev,
+					 u16 seg, u8 start,
+					 u8 end, phys_addr_t addr);
+extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
 extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
 
 extern struct list_head pci_mmcfg_list;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d9b8e3f7f42a..1104afaba52b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
 				{ [0 ... NR_CPUS-1] = _initvalue };	\
 	__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
 
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue)	\
+	DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue;		\
+	__typeof__(_type) _name##_early_map[NR_CPUS] __initdata =	\
+				{ [0 ... NR_CPUS-1] = _initvalue };	\
+	__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
 #define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
 	EXPORT_PER_CPU_SYMBOL(_name)
 
@@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
 	extern __typeof__(_type) *_name##_early_ptr;		\
 	extern __typeof__(_type)  _name##_early_map[]
 
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name)		\
+	DECLARE_PER_CPU_READ_MOSTLY(_type, _name);		\
+	extern __typeof__(_type) *_name##_early_ptr;		\
+	extern __typeof__(_type)  _name##_early_map[]
+
 #define	early_per_cpu_ptr(_name) (_name##_early_ptr)
 #define	early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
 #define	early_per_cpu(_name, _cpu) 				\
@@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
 #define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)		\
 	DEFINE_PER_CPU(_type, _name) = _initvalue
 
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue)	\
+	DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
+
 #define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\
 	EXPORT_PER_CPU_SYMBOL(_name)
 
 #define DECLARE_EARLY_PER_CPU(_type, _name)			\
 	DECLARE_PER_CPU(_type, _name)
 
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name)		\
+	DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
+
 #define	early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
 #define	early_per_cpu_ptr(_name) NULL
 /* no early_per_cpu_map() */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 588f52ea810e..cb4e43bce98a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,11 +5,10 @@
  * Performance event hw details:
  */
 
-#define X86_PMC_MAX_GENERIC				       32
-#define X86_PMC_MAX_FIXED					3
+#define INTEL_PMC_MAX_GENERIC				       32
+#define INTEL_PMC_MAX_FIXED					3
+#define INTEL_PMC_IDX_FIXED				       32
 
-#define X86_PMC_IDX_GENERIC				        0
-#define X86_PMC_IDX_FIXED				       32
 #define X86_PMC_IDX_MAX					       64
 
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
@@ -48,8 +47,7 @@
 	(X86_RAW_EVENT_MASK          |  \
 	 AMD64_EVENTSEL_EVENT)
 #define AMD64_NUM_COUNTERS				4
-#define AMD64_NUM_COUNTERS_F15H				6
-#define AMD64_NUM_COUNTERS_MAX				AMD64_NUM_COUNTERS_F15H
+#define AMD64_NUM_COUNTERS_CORE				6
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
@@ -121,16 +119,16 @@ struct x86_pmu_capability {
 
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0	0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS	(X86_PMC_IDX_FIXED + 0)
+#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS	(INTEL_PMC_IDX_FIXED + 0)
 
 /* CPU_CLK_Unhalted.Core: */
 #define MSR_ARCH_PERFMON_FIXED_CTR1	0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES	(X86_PMC_IDX_FIXED + 1)
+#define INTEL_PMC_IDX_FIXED_CPU_CYCLES	(INTEL_PMC_IDX_FIXED + 1)
 
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2	0x30b
-#define X86_PMC_IDX_FIXED_REF_CYCLES	(X86_PMC_IDX_FIXED + 2)
-#define X86_PMC_MSK_FIXED_REF_CYCLES	(1ULL << X86_PMC_IDX_FIXED_REF_CYCLES)
+#define INTEL_PMC_IDX_FIXED_REF_CYCLES	(INTEL_PMC_IDX_FIXED + 2)
+#define INTEL_PMC_MSK_FIXED_REF_CYCLES	(1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
 
 /*
  * We model BTS tracing as another fixed-mode PMC.
@@ -139,7 +137,7 @@ struct x86_pmu_capability {
  * values are used by actual fixed events and higher values are used
  * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
  */
-#define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
+#define INTEL_PMC_IDX_FIXED_BTS				(INTEL_PMC_IDX_FIXED + 16)
 
 /*
  * IBS cpuid feature detection
@@ -198,11 +196,16 @@ static inline u32 get_ibs_caps(void) { return 0; }
 extern void perf_events_lapic_init(void);
 
 /*
- * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
- * This flag is otherwise unused and ABI specified to be 0, so nobody should
- * care what we do with it.
+ * Abuse bits {3,5} of the cpu eflags register. These flags are otherwise
+ * unused and ABI specified to be 0, so nobody should care what we do with
+ * them.
+ *
+ * EXACT - the IP points to the exact instruction that triggered the
+ *         event (HW bugs exempt).
+ * VM    - original X86_VM_MASK; see set_linear_ip().
  */
 #define PERF_EFLAGS_EXACT	(1UL << 3)
+#define PERF_EFLAGS_VM		(1UL << 5)
 
 struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
@@ -234,8 +237,9 @@ struct perf_guest_switch_msr {
 
 extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
 extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
+extern void perf_check_microcode(void);
 #else
-static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
 {
 	*nr = 0;
 	return NULL;
@@ -247,6 +251,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 }
 
 static inline void perf_events_lapic_init(void)	{ }
+static inline void perf_check_microcode(void) { }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 98391db840c6..f2b489cf1602 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -2,9 +2,9 @@
 #define _ASM_X86_PGTABLE_2LEVEL_H
 
 #define pte_ERROR(e) \
-	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
+	pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+	pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
 
 /*
  * Certain architectures need to do special things when PTEs
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index cb00ccc7d571..4cc9f2b7cdc3 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -9,13 +9,13 @@
  */
 
 #define pte_ERROR(e)							\
-	printk("%s:%d: bad pte %p(%08lx%08lx).\n",			\
+	pr_err("%s:%d: bad pte %p(%08lx%08lx)\n",			\
 	       __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
 #define pmd_ERROR(e)							\
-	printk("%s:%d: bad pmd %p(%016Lx).\n",				\
+	pr_err("%s:%d: bad pmd %p(%016Lx)\n",				\
 	       __FILE__, __LINE__, &(e), pmd_val(e))
 #define pgd_ERROR(e)							\
-	printk("%s:%d: bad pgd %p(%016Lx).\n",				\
+	pr_err("%s:%d: bad pgd %p(%016Lx)\n",				\
 	       __FILE__, __LINE__, &(e), pgd_val(e))
 
 /* Rules for using set_pte: the pte being assigned *must* be
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..8251be02301e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];
 extern void paging_init(void);
 
 #define pte_ERROR(e)					\
-	printk("%s:%d: bad pte %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pte %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pte_val(e))
 #define pmd_ERROR(e)					\
-	printk("%s:%d: bad pmd %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pmd %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pmd_val(e))
 #define pud_ERROR(e)					\
-	printk("%s:%d: bad pud %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pud %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pud_val(e))
 #define pgd_ERROR(e)					\
-	printk("%s:%d: bad pgd %p(%016lx).\n",		\
+	pr_err("%s:%d: bad pgd %p(%016lx)\n",		\
 	       __FILE__, __LINE__, &(e), pgd_val(e))
 
 struct mm_struct;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 013286a10c2c..db8fec6d2953 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -303,11 +303,9 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
 extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
+extern void native_pagetable_init(void);
 #else
-#define native_pagetable_setup_start x86_init_pgd_noop
-#define native_pagetable_setup_done  x86_init_pgd_noop
+#define native_pagetable_init        paging_init
 #endif
 
 struct seq_file;
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f8ab3eaad128..aea1d1d848c7 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -44,6 +44,7 @@
  */
 #define X86_CR3_PWT	0x00000008 /* Page Write Through */
 #define X86_CR3_PCD	0x00000010 /* Page Cache Disable */
+#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
 
 /*
  * Intel CPU features in CR4
@@ -61,6 +62,7 @@
 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
 #define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */
 #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
+#define X86_CR4_PCIDE	0x00020000 /* enable PCID support */
 #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
 #define X86_CR4_SMEP	0x00100000 /* enable SMEP support */
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 39bc5777211a..d048cad9bcad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -61,6 +61,19 @@ static inline void *current_text_addr(void)
 # define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
+enum tlb_infos {
+	ENTRIES,
+	NR_INFO
+};
+
+extern u16 __read_mostly tlb_lli_4k[NR_INFO];
+extern u16 __read_mostly tlb_lli_2m[NR_INFO];
+extern u16 __read_mostly tlb_lli_4m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4k[NR_INFO];
+extern u16 __read_mostly tlb_lld_2m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4m[NR_INFO];
+extern s8  __read_mostly tlb_flushall_shift;
+
 /*
  *  CPU type and hardware bug flags. Kept separately for each CPU.
  *  Members of this structure are referenced in head.S, so think twice
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fce3f4ae5bd6..fe1ec5bcd846 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -21,8 +21,9 @@ struct real_mode_header {
 	u32	wakeup_header;
 #endif
 	/* APM/BIOS reboot */
-#ifdef CONFIG_X86_32
 	u32	machine_real_restart_asm;
+#ifdef CONFIG_X86_64
+	u32	machine_real_restart_seg;
 #endif
 };
 
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 92f297069e87..a82c4f1b4d83 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,8 +18,8 @@ extern struct machine_ops machine_ops;
 
 void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);
-void machine_real_restart(unsigned int type);
-/* These must match dispatch_table in reboot_32.S */
+void __noreturn machine_real_restart(unsigned int type);
+/* These must match dispatch in arch/x86/realmore/rm/reboot.S */
 #define MRR_BIOS	0
 #define MRR_APM		1
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f48394513c37..4f19a1526037 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void)
 	return has_siblings;
 }
 
-DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 /* cpus sharing the last level cache: */
-DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
-DECLARE_PER_CPU(u16, cpu_llc_id);
-DECLARE_PER_CPU(int, cpu_number);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
 static inline struct cpumask *cpu_sibling_mask(int cpu)
 {
@@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 	return per_cpu(cpu_llc_shared_map, cpu);
 }
 
-DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
-DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
-DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
 #endif
 
 /* Static state in head.S used to set up a CPU */
@@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
 void smp_store_cpu_info(int id);
 #define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu)
 
-/* We don't mark CPUs online until __cpu_up(), so we need another measure */
-static inline int num_booting_cpus(void)
-{
-	return cpumask_weight(cpu_callout_mask);
-}
 #else /* !CONFIG_SMP */
 #define wbinvd_on_cpu(cpu)     wbinvd()
 static inline int wbinvd_on_all_cpus(void)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index b315a33867f2..33692eaabab5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -12,8 +12,7 @@
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
  *
- * These are fair FIFO ticket locks, which are currently limited to 256
- * CPUs.
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
  *
  * (the type definitions are in asm/spinlock_types.h)
  */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9ee..4fef20773b8f 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,14 @@
 #define tlb_start_vma(tlb, vma) do { } while (0)
 #define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+#define tlb_flush(tlb)							\
+{									\
+	if (tlb->fullmm == 0)						\
+		flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL);	\
+	else								\
+		flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL);	\
+}
 
 #include <asm-generic/tlb.h>
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 36a1a2ab87d2..74a44333545a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)
  *  - flush_tlb_page(vma, vmaddr) flushes one page
  *  - flush_tlb_range(vma, start, end) flushes a range of pages
  *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ *  - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
  *
  * ..but the i386 has somewhat limited tlb flushing capabilities,
  * and page-granular flushes are available only on i486 and up.
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
  */
 
 #ifndef CONFIG_SMP
@@ -109,9 +105,17 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 		__flush_tlb();
 }
 
+static inline void flush_tlb_mm_range(struct mm_struct *mm,
+	   unsigned long start, unsigned long end, unsigned long vmflag)
+{
+	if (mm == current->active_mm)
+		__flush_tlb();
+}
+
 static inline void native_flush_tlb_others(const struct cpumask *cpumask,
 					   struct mm_struct *mm,
-					   unsigned long va)
+					   unsigned long start,
+					   unsigned long end)
 {
 }
 
@@ -119,27 +123,35 @@ static inline void reset_lazy_tlbstate(void)
 {
 }
 
+static inline void flush_tlb_kernel_range(unsigned long start,
+					  unsigned long end)
+{
+	flush_tlb_all();
+}
+
 #else  /* SMP */
 
 #include <asm/smp.h>
 
 #define local_flush_tlb() __flush_tlb()
 
+#define flush_tlb_mm(mm)	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+
+#define flush_tlb_range(vma, start, end)	\
+		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+
 extern void flush_tlb_all(void);
 extern void flush_tlb_current_task(void);
-extern void flush_tlb_mm(struct mm_struct *);
 extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned long vmflag);
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 #define flush_tlb()	flush_tlb_current_task()
 
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	flush_tlb_mm(vma->vm_mm);
-}
-
 void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va);
+				struct mm_struct *mm,
+				unsigned long start, unsigned long end);
 
 #define TLBSTATE_OK	1
 #define TLBSTATE_LAZY	2
@@ -159,13 +171,8 @@ static inline void reset_lazy_tlbstate(void)
 #endif	/* SMP */
 
 #ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va)	native_flush_tlb_others(mask, mm, va)
+#define flush_tlb_others(mask, mm, start, end)	\
+	native_flush_tlb_others(mask, mm, start, end)
 #endif
 
-static inline void flush_tlb_kernel_range(unsigned long start,
-					  unsigned long end)
-{
-	flush_tlb_all();
-}
-
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 8e796fbbf9c6..d8def8b3dba0 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -17,6 +17,8 @@
 
 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
+copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
+__must_check unsigned long
 copy_user_generic_string(void *to, const void *from, unsigned len);
 __must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);
@@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len)
 {
 	unsigned ret;
 
-	alternative_call(copy_user_generic_unrolled,
+	/*
+	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
+	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
+	 * Otherwise, use copy_user_generic_unrolled.
+	 */
+	alternative_call_2(copy_user_generic_unrolled,
 			 copy_user_generic_string,
 			 X86_FEATURE_REP_GOOD,
+			 copy_user_enhanced_fast_string,
+			 X86_FEATURE_ERMS,
 			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
 				     "=d" (len)),
 			 "1" (to), "2" (from), "3" (len)
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 4437001d8e3d..0d9776e9e2dc 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -15,7 +15,6 @@
 # ifdef CONFIG_X86_32
 
 #  include <asm/unistd_32.h>
-#  define __ARCH_WANT_IPC_PARSE_VERSION
 #  define __ARCH_WANT_STAT64
 #  define __ARCH_WANT_SYS_IPC
 #  define __ARCH_WANT_SYS_OLD_MMAP
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 1e9bed14f7ae..f3971bbcd1de 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -48,7 +48,7 @@ struct arch_uprobe_task {
 #endif
 };
 
-extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm);
+extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
 extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 3bb9491b7659..b47c2a82ff15 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -15,7 +15,8 @@ extern void uv_nmi_init(void);
 extern void uv_system_init(void);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 						 struct mm_struct *mm,
-						 unsigned long va,
+						 unsigned long start,
+						 unsigned end,
 						 unsigned int cpu);
 
 #else	/* X86_UV */
@@ -26,7 +27,7 @@ static inline void uv_cpu_init(void)	{ }
 static inline void uv_system_init(void)	{ }
 static inline const struct cpumask *
 uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
-		    unsigned long va, unsigned int cpu)
+		    unsigned long start, unsigned long end, unsigned int cpu)
 { return cpumask; }
 
 #endif	/* X86_UV */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 6149b476d9df..a06983cdc125 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -140,6 +140,9 @@
 #define IPI_RESET_LIMIT			1
 /* after this # consecutive successes, bump up the throttle if it was lowered */
 #define COMPLETE_THRESHOLD		5
+/* after this # of giveups (fall back to kernel IPI's) disable the use of
+   the BAU for a period of time */
+#define GIVEUP_LIMIT			100
 
 #define UV_LB_SUBNODEID			0x10
 
@@ -166,7 +169,6 @@
 #define FLUSH_RETRY_TIMEOUT		2
 #define FLUSH_GIVEUP			3
 #define FLUSH_COMPLETE			4
-#define FLUSH_RETRY_BUSYBUG		5
 
 /*
  * tuning the action when the numalink network is extremely delayed
@@ -175,7 +177,7 @@
 						   microseconds */
 #define CONGESTED_REPS			10	/* long delays averaged over
 						   this many broadcasts */
-#define CONGESTED_PERIOD		30	/* time for the bau to be
+#define DISABLED_PERIOD			10	/* time for the bau to be
 						   disabled, in seconds */
 /* see msg_type: */
 #define MSG_NOOP			0
@@ -520,6 +522,12 @@ struct ptc_stats {
 	unsigned long	s_uv2_wars;		/* uv2 workaround, perm. busy */
 	unsigned long	s_uv2_wars_hw;		/* uv2 workaround, hiwater */
 	unsigned long	s_uv2_war_waits;	/* uv2 workaround, long waits */
+	unsigned long	s_overipilimit;		/* over the ipi reset limit */
+	unsigned long	s_giveuplimit;		/* disables, over giveup limit*/
+	unsigned long	s_enters;		/* entries to the driver */
+	unsigned long	s_ipifordisabled;	/* fall back to IPI; disabled */
+	unsigned long	s_plugged;		/* plugged by h/w bug*/
+	unsigned long	s_congested;		/* giveup on long wait */
 	/* destination statistics */
 	unsigned long	d_alltlb;		/* times all tlb's on this
 						   cpu were flushed */
@@ -586,8 +594,8 @@ struct bau_control {
 	int			timeout_tries;
 	int			ipi_attempts;
 	int			conseccompletes;
-	int			baudisabled;
-	int			set_bau_off;
+	short			nobau;
+	short			baudisabled;
 	short			cpu;
 	short			osnode;
 	short			uvhub_cpu;
@@ -596,14 +604,16 @@ struct bau_control {
 	short			cpus_in_socket;
 	short			cpus_in_uvhub;
 	short			partition_base_pnode;
-	short			using_desc; /* an index, like uvhub_cpu */
-	unsigned int		inuse_map;
+	short			busy;       /* all were busy (war) */
 	unsigned short		message_number;
 	unsigned short		uvhub_quiesce;
 	short			socket_acknowledge_count[DEST_Q_SIZE];
 	cycles_t		send_message;
+	cycles_t		period_end;
+	cycles_t		period_time;
 	spinlock_t		uvhub_lock;
 	spinlock_t		queue_lock;
+	spinlock_t		disable_lock;
 	/* tunables */
 	int			max_concurr;
 	int			max_concurr_const;
@@ -614,9 +624,9 @@ struct bau_control {
 	int			complete_threshold;
 	int			cong_response_us;
 	int			cong_reps;
-	int			cong_period;
-	unsigned long		clocks_per_100_usec;
-	cycles_t		period_time;
+	cycles_t		disabled_period;
+	int			period_giveups;
+	int			giveup_limit;
 	long			period_requests;
 	struct hub_and_pnode	*thp;
 };
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 31f180c21ce9..74fcb963595b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -60,6 +60,7 @@
 #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
 #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
+#define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
@@ -281,6 +282,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD		54
 #define EXIT_REASON_XSETBV		55
+#define EXIT_REASON_INVPCID		58
 
 /*
  * Interruption-information format
@@ -404,6 +406,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_AD_BIT					(1ull << 21)
 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
@@ -415,11 +418,14 @@ enum vmcs_field {
 #define VMX_EPT_MAX_GAW				0x4
 #define VMX_EPT_MT_EPTE_SHIFT			3
 #define VMX_EPT_GAW_EPTP_SHIFT			3
+#define VMX_EPT_AD_ENABLE_BIT			(1ull << 6)
 #define VMX_EPT_DEFAULT_MT			0x6ull
 #define VMX_EPT_READABLE_MASK			0x1ull
 #define VMX_EPT_WRITABLE_MASK			0x2ull
 #define VMX_EPT_EXECUTABLE_MASK			0x4ull
 #define VMX_EPT_IPAT_BIT    			(1ull << 6)
+#define VMX_EPT_ACCESS_BIT				(1ull << 8)
+#define VMX_EPT_DIRTY_BIT				(1ull << 9)
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
index 92e54abf89e0..f90f0a587c66 100644
--- a/arch/x86/include/asm/x2apic.h
+++ b/arch/x86/include/asm/x2apic.h
@@ -9,15 +9,6 @@
 #include <asm/ipi.h>
 #include <linux/cpumask.h>
 
-/*
- * Need to use more than cpu 0, because we need more vectors
- * when MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
 static int x2apic_apic_id_valid(int apicid)
 {
 	return 1;
@@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void)
 	return 1;
 }
 
-/*
- * For now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static void
 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
 {
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c090af10ac7d..57693498519c 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -81,12 +81,13 @@ struct x86_init_mapping {
 
 /**
  * struct x86_init_paging - platform specific paging functions
- * @pagetable_setup_start:	platform specific pre paging_init() call
- * @pagetable_setup_done:	platform specific post paging_init() call
+ * @pagetable_init:	platform specific paging initialization call to setup
+ *			the kernel pagetables and prepare accessors functions.
+ *			Callback must call paging_init(). Called once after the
+ *			direct mapping for phys memory is available.
  */
 struct x86_init_paging {
-	void (*pagetable_setup_start)(pgd_t *base);
-	void (*pagetable_setup_done)(pgd_t *base);
+	void (*pagetable_init)(void);
 };
 
 /**
@@ -156,7 +157,6 @@ struct x86_cpuinit_ops {
 /**
  * struct x86_platform_ops - platform specific runtime functions
  * @calibrate_tsc:		calibrate TSC
- * @wallclock_init:		init the wallclock device
  * @get_wallclock:		get time from HW clock like RTC etc.
  * @set_wallclock:		set time back to HW clock
  * @is_untracked_pat_range	exclude from PAT logic
@@ -164,10 +164,10 @@ struct x86_cpuinit_ops {
  * @i8042_detect		pre-detect if i8042 controller exists
  * @save_sched_clock_state:	save state for sched_clock() on suspend
  * @restore_sched_clock_state:	restore state for sched_clock() on resume
+ * @apic_post_init:		adjust apic if neeeded
  */
 struct x86_platform_ops {
 	unsigned long (*calibrate_tsc)(void);
-	void (*wallclock_init)(void);
 	unsigned long (*get_wallclock)(void);
 	int (*set_wallclock)(unsigned long nowtime);
 	void (*iommu_shutdown)(void);
@@ -177,6 +177,7 @@ struct x86_platform_ops {
 	int (*i8042_detect)(void);
 	void (*save_sched_clock_state)(void);
 	void (*restore_sched_clock_state)(void);
+	void (*apic_post_init)(void);
 };
 
 struct pci_dev;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 5728852fb90f..59c226d120cd 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
 
 /*
  * The hypercall asms have to meet several constraints:
@@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
 }
 
 static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+	return _hypercall1(int, mca, mc_op);
+}
+
+static inline int
 HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
 {
 	platform_op->interface_version = XENPF_INTERFACE_VERSION;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 95bf99de9058..1b8e5a03d942 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -25,10 +25,6 @@ unsigned long acpi_realmode_flags;
 static char temp_stack[4096];
 #endif
 
-asmlinkage void acpi_enter_s3(void)
-{
-	acpi_enter_sleep_state(3, wake_sleep_flags);
-}
 /**
  * acpi_suspend_lowlevel - save kernel state
  *
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 5653a5791ec9..67f59f8c6956 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,7 +2,6 @@
  *	Variables and functions used by the code in sleep.c
  */
 
-#include <linux/linkage.h>
 #include <asm/realmode.h>
 
 extern unsigned long saved_video_mode;
@@ -11,7 +10,6 @@ extern long saved_magic;
 extern int wakeup_pmode_return;
 
 extern u8 wake_sleep_flags;
-extern asmlinkage void acpi_enter_s3(void);
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 72610839f03b..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -74,7 +74,9 @@ restore_registers:
 ENTRY(do_suspend_lowlevel)
 	call	save_processor_state
 	call	save_registers
-	call	acpi_enter_s3
+	pushl	$3
+	call	acpi_enter_sleep_state
+	addl	$4, %esp
 
 #	In case of S3 failure, we'll emerge here.  Jump
 # 	to ret_point to recover
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 014d1d28c397..8ea5164cbd04 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -71,7 +71,9 @@ ENTRY(do_suspend_lowlevel)
 	movq	%rsi, saved_rsi
 
 	addq	$8, %rsp
-	call	acpi_enter_s3
+	movl	$3, %edi
+	xorl	%eax, %eax
+	call	acpi_enter_sleep_state
 	/* in case something went wrong, restore the machine status and go on */
 	jmp	resume_point
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f0759..ced4534baed5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
@@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, args...) if (debug_alternative) \
-	printk(KERN_DEBUG fmt, args)
+#define DPRINTK(fmt, ...)				\
+do {							\
+	if (debug_alternative)				\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+} while (0)
 
 /*
  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
@@ -160,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
 #endif
 
 #ifdef P6_NOP1
-static const unsigned char  __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
 {
 	P6_NOP1,
 	P6_NOP2,
@@ -219,7 +224,7 @@ void __init arch_init_ideal_nops(void)
 			ideal_nops = intel_nops;
 #endif
 		}
-
+		break;
 	default:
 #ifdef CONFIG_X86_64
 		ideal_nops = k8_nops;
@@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)
 	 * If this still occurs then you should see a hang
 	 * or crash shortly after this line:
 	 */
-	printk("lockdep: fixing up alternatives.\n");
+	pr_info("lockdep: fixing up alternatives\n");
 #endif
 
 	if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
@@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)
 	if (smp == smp_mode) {
 		/* nothing */
 	} else if (smp) {
-		printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+		pr_info("switching to SMP code\n");
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 		list_for_each_entry(mod, &smp_alt_modules, next)
 			alternatives_smp_lock(mod->locks, mod->locks_end,
 					      mod->text, mod->text_end);
 	} else {
-		printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+		pr_info("switching to UP code\n");
 		set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 		set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 		list_for_each_entry(mod, &smp_alt_modules, next)
@@ -546,7 +551,7 @@ void __init alternative_instructions(void)
 #ifdef CONFIG_SMP
 	if (smp_alt_once) {
 		if (1 == num_possible_cpus()) {
-			printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+			pr_info("switching to UP code\n");
 			set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 			set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 
@@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data)
 	struct text_poke_param *p;
 	int i;
 
-	if (atomic_dec_and_test(&stop_machine_first)) {
+	if (atomic_xchg(&stop_machine_first, 0)) {
 		for (i = 0; i < tpp->nparams; i++) {
 			p = &tpp->params[i];
 			text_poke(p->addr, p->opcode, p->len);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591cc..aadf3359e2a7 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
  * Shared support code for AMD K8 northbridges and derivates.
  * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
 	{}
 };
 EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -258,7 +262,7 @@ void amd_flush_garts(void)
 	}
 	spin_unlock_irqrestore(&gart_lock, flags);
 	if (!flushed)
-		printk("nothing to flush?\n");
+		pr_notice("nothing to flush?\n");
 }
 EXPORT_SYMBOL_GPL(amd_flush_garts);
 
@@ -269,11 +273,10 @@ static __init int init_amd_nbs(void)
 	err = amd_cache_northbridges();
 
 	if (err < 0)
-		printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+		pr_notice("Cannot enumerate AMD northbridges\n");
 
 	if (amd_cache_gart() < 0)
-		printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
-		       "GART support disabled.\n");
+		pr_notice("Cannot initialize GART flush words, GART support disabled\n");
 
 	return err;
 }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 39a222e094af..24deb3082328 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map;
 /*
  * Map cpu index to physical APIC ID
  */
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
@@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  * used for the mapping.  This is where the behaviors of x86_64 and 32
  * actually diverge.  Let's keep it ugly for now.
  */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
 
 /*
  * Knob to control our willingness to enable the local APIC.
@@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void)
 	apic_write(APIC_LDR, val);
 }
 
+int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+				   const struct cpumask *andmask,
+				   unsigned int *apicid)
+{
+	unsigned int cpu;
+
+	for_each_cpu_and(cpu, cpumask, andmask) {
+		if (cpumask_test_cpu(cpu, cpu_online_mask))
+			break;
+	}
+
+	if (likely(cpu < nr_cpu_ids)) {
+		*apicid = per_cpu(x86_cpu_to_apicid, cpu);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Override the generic EOI implementation with an optimized version.
+ * Only called during early boot when only one CPU is active and with
+ * interrupts disabled, so we know this does not race with actual APIC driver
+ * use.
+ */
+void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
+{
+	struct apic **drv;
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		/* Should happen once for each apic */
+		WARN_ON((*drv)->eoi_write == eoi_write);
+		(*drv)->eoi_write = eoi_write;
+	}
+}
+
 /*
  * Power management
  */
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 0e881c46e8c8..00c77cf78e9e 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	return 1;
 }
 
-static const struct cpumask *flat_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	/* Careful. Some cpus do not strictly honor the set of cpus
-	 * specified in the interrupt destination when using lowest
-	 * priority interrupt delivery mode.
-	 *
-	 * In particular there was a hyperthreading cpu observed to
-	 * deliver interrupts to the wrong hyperthread when only one
-	 * hyperthread was specified in the interrupt desitination.
-	 */
-	cpumask_clear(retmask);
-	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
 /*
  * Set up the logical destination ID.
  *
@@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
 }
 
 static void
- flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
 {
 	unsigned long mask = cpumask_bits(cpumask)[0];
 	int cpu = smp_processor_id();
@@ -186,7 +167,7 @@ static struct apic apic_flat =  {
 	.irq_delivery_mode		= dest_LowestPrio,
 	.irq_dest_mode			= 1, /* logical */
 
-	.target_cpus			= flat_target_cpus,
+	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
@@ -210,8 +191,7 @@ static struct apic apic_flat =  {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xFFu << 24,
 
-	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= flat_send_IPI_mask,
 	.send_IPI_mask_allbutself	= flat_send_IPI_mask_allbutself,
@@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	return 0;
 }
 
-static const struct cpumask *physflat_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
 {
 	default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector)
 	physflat_send_IPI_mask(cpu_online_mask, vector);
 }
 
-static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	cpu = cpumask_first(cpumask);
-	if ((unsigned)cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_apicid, cpu);
-	else
-		return BAD_APICID;
-}
-
-static unsigned int
-physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-				const struct cpumask *andmask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
-	}
-	return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
 static int physflat_probe(void)
 {
 	if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -345,13 +282,13 @@ static struct apic apic_physflat =  {
 	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 0, /* physical */
 
-	.target_cpus			= physflat_target_cpus,
+	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= physflat_vector_allocation_domain,
+	.vector_allocation_domain	= default_vector_allocation_domain,
 	/* not needed, but shouldn't hurt: */
 	.init_apic_ldr			= flat_init_apic_ldr,
 
@@ -370,8 +307,7 @@ static struct apic apic_physflat =  {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xFFu << 24,
 
-	.cpu_mask_to_apicid		= physflat_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= physflat_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= physflat_send_IPI_mask,
 	.send_IPI_mask_allbutself	= physflat_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index a6e4c6e06c08..e145f28b4099 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)
 	return physid_isset(bit, phys_cpu_present_map);
 }
 
-static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
+					  const struct cpumask *mask)
 {
 	if (cpu != 0)
 		pr_warning("APIC: Vector allocated for non-BSP cpu\n");
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
+	cpumask_copy(retmask, cpumask_of(cpu));
 }
 
 static u32 noop_apic_read(u32 reg)
@@ -159,8 +159,7 @@ struct apic apic_noop = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0x0F << 24,
 
-	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= noop_send_IPI_mask,
 	.send_IPI_mask_allbutself	= noop_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 6ec6d5d297c3..bc552cff2578 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
 	return initial_apic_id >> index_msb;
 }
 
-static const struct cpumask *numachip_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
 	union numachip_csr_g3_ext_irq_gen int_gen;
@@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector)
 	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
 }
 
-static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	cpu = cpumask_first(cpumask);
-	if (likely((unsigned)cpu < nr_cpu_ids))
-		return per_cpu(x86_cpu_to_apicid, cpu);
-
-	return BAD_APICID;
-}
-
-static unsigned int
-numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-				const struct cpumask *andmask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
-	}
-	return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
 static int __init numachip_probe(void)
 {
 	return apic == &apic_numachip;
@@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = {
 	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 0, /* physical */
 
-	.target_cpus			= numachip_target_cpus,
+	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= numachip_vector_allocation_domain,
+	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= flat_init_apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
@@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xffU << 24,
 
-	.cpu_mask_to_apicid		= numachip_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= numachip_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= numachip_send_IPI_mask,
 	.send_IPI_mask_allbutself	= numachip_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 31fbdbfbf960..d50e3640d5ae 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)
 	return 1;
 }
 
-static const struct cpumask *bigsmp_target_cpus(void)
-{
-#ifdef CONFIG_SMP
-	return cpu_online_mask;
-#else
-	return cpumask_of(0);
-#endif
-}
-
 static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
 {
 	return 0;
@@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
 	return 1;
 }
 
-/* As we are using single CPU as destination, pick only one CPU here */
-static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	int cpu = cpumask_first(cpumask);
-
-	if (cpu < nr_cpu_ids)
-		return cpu_physical_id(cpu);
-	return BAD_APICID;
-}
-
-static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			      const struct cpumask *andmask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			return cpu_physical_id(cpu);
-	}
-	return BAD_APICID;
-}
-
 static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
 {
 	return cpuid_apic >> index_msb;
@@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
 	{ } /* NULL entry stops DMI scanning */
 };
 
-static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static int probe_bigsmp(void)
 {
 	if (def_to_bigsmp)
@@ -205,13 +164,13 @@ static struct apic apic_bigsmp = {
 	/* phys delivery to target CPU: */
 	.irq_dest_mode			= 0,
 
-	.target_cpus			= bigsmp_target_cpus,
+	.target_cpus			= default_target_cpus,
 	.disable_esr			= 1,
 	.dest_logical			= 0,
 	.check_apicid_used		= bigsmp_check_apicid_used,
 	.check_apicid_present		= bigsmp_check_apicid_present,
 
-	.vector_allocation_domain	= bigsmp_vector_allocation_domain,
+	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= bigsmp_init_apic_ldr,
 
 	.ioapic_phys_id_map		= bigsmp_ioapic_phys_id_map,
@@ -229,8 +188,7 @@ static struct apic apic_bigsmp = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0xFF << 24,
 
-	.cpu_mask_to_apicid		= bigsmp_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= bigsmp_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= bigsmp_send_IPI_mask,
 	.send_IPI_mask_allbutself	= NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index db4ab1be3c79..0874799a98c6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void)
 		WARN(1, "Command failed, status = %x\n", mip_status);
 }
 
-static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	/* Careful. Some cpus do not strictly honor the set of cpus
-	 * specified in the interrupt destination when using lowest
-	 * priority interrupt delivery mode.
-	 *
-	 * In particular there was a hyperthreading cpu observed to
-	 * deliver interrupts to the wrong hyperthread when only one
-	 * hyperthread was specified in the interrupt desitination.
-	 */
-	cpumask_clear(retmask);
-	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-
 static void es7000_wait_for_init_deassert(atomic_t *deassert)
 {
 	while (!atomic_read(deassert))
@@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
 	return 1;
 }
 
-static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static inline int
+es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
 {
 	unsigned int round = 0;
-	int cpu, uninitialized_var(apicid);
+	unsigned int cpu, uninitialized_var(apicid);
 
 	/*
 	 * The cpus in the mask must all be on the apic cluster.
 	 */
-	for_each_cpu(cpu, cpumask) {
+	for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
 		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
 		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
 			WARN(1, "Not a valid mask!");
 
-			return BAD_APICID;
+			return -EINVAL;
 		}
-		apicid = new_apicid;
+		apicid |= new_apicid;
 		round++;
 	}
-	return apicid;
+	if (!round)
+		return -EINVAL;
+	*dest_id = apicid;
+	return 0;
 }
 
-static unsigned int
+static int
 es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
-			      const struct cpumask *andmask)
+			      const struct cpumask *andmask,
+			      unsigned int *apicid)
 {
-	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 	cpumask_var_t cpumask;
+	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 
 	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
-		return apicid;
+		return 0;
 
 	cpumask_and(cpumask, inmask, andmask);
-	cpumask_and(cpumask, cpumask, cpu_online_mask);
-	apicid = es7000_cpu_mask_to_apicid(cpumask);
+	es7000_cpu_mask_to_apicid(cpumask, apicid);
 
 	free_cpumask_var(cpumask);
 
-	return apicid;
+	return 0;
 }
 
 static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = {
 	.check_apicid_used		= es7000_check_apicid_used,
 	.check_apicid_present		= es7000_check_apicid_present,
 
-	.vector_allocation_domain	= es7000_vector_allocation_domain,
+	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= es7000_init_apic_ldr_cluster,
 
 	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map,
@@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0xFF << 24,
 
-	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= es7000_send_IPI_mask,
@@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = {
 	.check_apicid_used		= es7000_check_apicid_used,
 	.check_apicid_present		= es7000_check_apicid_present,
 
-	.vector_allocation_domain	= es7000_vector_allocation_domain,
+	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= es7000_init_apic_ldr,
 
 	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map,
@@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0xFF << 24,
 
-	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= es7000_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5f0ff597437c..c265593ec2cd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 
 	entry = alloc_irq_pin_list(node);
 	if (!entry) {
-		printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
-				node, apic, pin);
+		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+		       node, apic, pin);
 		return -ENOMEM;
 	}
 	entry->apic = apic;
@@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	ioapic_mask_entry(apic, pin);
 	entry = ioapic_read_entry(apic, pin);
 	if (entry.irr)
-		printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n",
+		pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
 		       mpc_ioapic_id(apic), pin);
 }
 
@@ -895,7 +895,7 @@ static int irq_polarity(int idx)
 		}
 		case 2: /* reserved */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
 			polarity = 1;
 			break;
 		}
@@ -906,7 +906,7 @@ static int irq_polarity(int idx)
 		}
 		default: /* invalid */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
 			polarity = 1;
 			break;
 		}
@@ -948,7 +948,7 @@ static int irq_trigger(int idx)
 				}
 				default:
 				{
-					printk(KERN_WARNING "broken BIOS!!\n");
+					pr_warn("broken BIOS!!\n");
 					trigger = 1;
 					break;
 				}
@@ -962,7 +962,7 @@ static int irq_trigger(int idx)
 		}
 		case 2: /* reserved */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
 			trigger = 1;
 			break;
 		}
@@ -973,7 +973,7 @@ static int irq_trigger(int idx)
 		}
 		default: /* invalid */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
+			pr_warn("broken BIOS!!\n");
 			trigger = 0;
 			break;
 		}
@@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
 	if (mp_irqs[idx].dstirq != pin)
-		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+		pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
@@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
 	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
-	static int current_offset = VECTOR_OFFSET_START % 8;
-	unsigned int old_vector;
+	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
 	cpumask_var_t tmp_mask;
 
@@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
 		return -ENOMEM;
 
-	old_vector = cfg->vector;
-	if (old_vector) {
-		cpumask_and(tmp_mask, mask, cpu_online_mask);
-		cpumask_and(tmp_mask, cfg->domain, tmp_mask);
-		if (!cpumask_empty(tmp_mask)) {
-			free_cpumask_var(tmp_mask);
-			return 0;
-		}
-	}
-
 	/* Only try and allocate irqs on cpus that are present */
 	err = -ENOSPC;
-	for_each_cpu_and(cpu, mask, cpu_online_mask) {
-		int new_cpu;
-		int vector, offset;
+	cpumask_clear(cfg->old_domain);
+	cpu = cpumask_first_and(mask, cpu_online_mask);
+	while (cpu < nr_cpu_ids) {
+		int new_cpu, vector, offset;
 
-		apic->vector_allocation_domain(cpu, tmp_mask);
+		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+
+		if (cpumask_subset(tmp_mask, cfg->domain)) {
+			err = 0;
+			if (cpumask_equal(tmp_mask, cfg->domain))
+				break;
+			/*
+			 * New cpumask using the vector is a proper subset of
+			 * the current in use mask. So cleanup the vector
+			 * allocation for the members that are not used anymore.
+			 */
+			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
+			cfg->move_in_progress = 1;
+			cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+			break;
+		}
 
 		vector = current_vector;
 		offset = current_offset;
 next:
-		vector += 8;
+		vector += 16;
 		if (vector >= first_system_vector) {
-			/* If out of vectors on large boxen, must share them. */
-			offset = (offset + 1) % 8;
+			offset = (offset + 1) % 16;
 			vector = FIRST_EXTERNAL_VECTOR + offset;
 		}
-		if (unlikely(current_vector == vector))
+
+		if (unlikely(current_vector == vector)) {
+			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
+			cpumask_andnot(tmp_mask, mask, cfg->old_domain);
+			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
 			continue;
+		}
 
 		if (test_bit(vector, used_vectors))
 			goto next;
@@ -1162,7 +1171,7 @@ next:
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (old_vector) {
+		if (cfg->vector) {
 			cfg->move_in_progress = 1;
 			cpumask_copy(cfg->old_domain, cfg->domain);
 		}
@@ -1195,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
-	for_each_cpu(cpu, cfg->domain)
+	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = -1;
 
 	cfg->vector = 0;
@@ -1203,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	if (likely(!cfg->move_in_progress))
 		return;
-	for_each_cpu(cpu, cfg->old_domain) {
+	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 								vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1346,18 +1355,28 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 
 	if (!IO_APIC_IRQ(irq))
 		return;
+
 	/*
-	 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
-	 * controllers like 8259. Now that IO-APIC can handle this irq, update
-	 * the cfg->domain.
+	 * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC
+	 * can handle this irq and the apic driver is finialized at this point,
+	 * update the cfg->domain.
 	 */
-	if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
-		apic->vector_allocation_domain(0, cfg->domain);
+	if (irq < legacy_pic->nr_legacy_irqs &&
+	    cpumask_equal(cfg->domain, cpumask_of(0)))
+		apic->vector_allocation_domain(0, cfg->domain,
+					       apic->target_cpus());
 
 	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
 		return;
 
-	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
+					 &dest)) {
+		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
+			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+		__clear_irq_vector(irq, cfg);
+
+		return;
+	}
 
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1366,7 +1385,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
 		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
 
 	if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
-		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
+		pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
 			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
 		__clear_irq_vector(irq, cfg);
 
@@ -1469,9 +1488,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
  * Set up the timer pin, possibly with the 8259A-master behind.
  */
 static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
-					 unsigned int pin, int vector)
+					unsigned int pin, int vector)
 {
 	struct IO_APIC_route_entry entry;
+	unsigned int dest;
 
 	if (irq_remapping_enabled)
 		return;
@@ -1482,9 +1502,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
 	 */
+	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
+						  apic->target_cpus(), &dest)))
+		dest = BAD_APICID;
+
 	entry.dest_mode = apic->irq_dest_mode;
 	entry.mask = 0;			/* don't mask IRQ for edge */
-	entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+	entry.dest = dest;
 	entry.delivery_mode = apic->irq_delivery_mode;
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -1521,7 +1545,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 		reg_03.raw = io_apic_read(ioapic_idx, 3);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	printk("\n");
 	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1578,7 +1601,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 				i,
 				ir_entry->index
 			);
-			printk("%1d   %1d    %1d    %1d   %1d   "
+			pr_cont("%1d   %1d    %1d    %1d   %1d   "
 				"%1d    %1d     %X    %02X\n",
 				ir_entry->format,
 				ir_entry->mask,
@@ -1598,7 +1621,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 				i,
 				entry.dest
 			);
-			printk("%1d    %1d    %1d   %1d   %1d    "
+			pr_cont("%1d    %1d    %1d   %1d   %1d    "
 				"%1d    %1d    %02X\n",
 				entry.mask,
 				entry.trigger,
@@ -1651,8 +1674,8 @@ __apicdebuginit(void) print_IO_APICs(void)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
 		for_each_irq_pin(entry, cfg->irq_2_pin)
-			printk("-> %d:%d", entry->apic, entry->pin);
-		printk("\n");
+			pr_cont("-> %d:%d", entry->apic, entry->pin);
+		pr_cont("\n");
 	}
 
 	printk(KERN_INFO ".................................... done.\n");
@@ -1665,9 +1688,9 @@ __apicdebuginit(void) print_APIC_field(int base)
 	printk(KERN_DEBUG);
 
 	for (i = 0; i < 8; i++)
-		printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+		pr_cont("%08x", apic_read(base + i*0x10));
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1769,7 +1792,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
 		}
 	}
-	printk("\n");
+	pr_cont("\n");
 }
 
 __apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -2065,7 +2088,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 		reg_00.raw = io_apic_read(ioapic_idx, 0);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 		if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
-			printk("could not set ID!\n");
+			pr_cont("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
 	}
@@ -2210,71 +2233,6 @@ void send_cleanup_vector(struct irq_cfg *cfg)
 	cfg->move_in_progress = 0;
 }
 
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		/*
-		 * With interrupt-remapping, destination information comes
-		 * from interrupt-remapping table entry.
-		 */
-		if (!irq_remapped(cfg))
-			io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-	}
-}
-
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			  unsigned int *dest_id)
-{
-	struct irq_cfg *cfg = data->chip_data;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -1;
-
-	if (assign_irq_vector(data->irq, data->chip_data, mask))
-		return -1;
-
-	cpumask_copy(data->affinity, mask);
-
-	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
-	return 0;
-}
-
-static int
-ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		    bool force)
-{
-	unsigned int dest, irq = data->irq;
-	unsigned long flags;
-	int ret;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	ret = __ioapic_set_affinity(data, mask, &dest);
-	if (!ret) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, data->chip_data);
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	return ret;
-}
-
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
@@ -2362,6 +2320,87 @@ void irq_force_complete_move(int irq)
 static inline void irq_complete_move(struct irq_cfg *cfg) { }
 #endif
 
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
+
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
+		unsigned int reg;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		/*
+		 * With interrupt-remapping, destination information comes
+		 * from interrupt-remapping table entry.
+		 */
+		if (!irq_remapped(cfg))
+			io_apic_write(apic, 0x11 + pin*2, dest);
+		reg = io_apic_read(apic, 0x10 + pin*2);
+		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+		reg |= vector;
+		io_apic_modify(apic, 0x10 + pin*2, reg);
+	}
+}
+
+/*
+ * Either sets data->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
+ * leaves data->affinity untouched.
+ */
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+			  unsigned int *dest_id)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	unsigned int irq = data->irq;
+	int err;
+
+	if (!config_enabled(CONFIG_SMP))
+		return -1;
+
+	if (!cpumask_intersects(mask, cpu_online_mask))
+		return -EINVAL;
+
+	err = assign_irq_vector(irq, cfg, mask);
+	if (err)
+		return err;
+
+	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+	if (err) {
+		if (assign_irq_vector(irq, cfg, data->affinity))
+			pr_err("Failed to recover vector for irq %d\n", irq);
+		return err;
+	}
+
+	cpumask_copy(data->affinity, mask);
+
+	return 0;
+}
+
+static int
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		    bool force)
+{
+	unsigned int dest, irq = data->irq;
+	unsigned long flags;
+	int ret;
+
+	if (!config_enabled(CONFIG_SMP))
+		return -1;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	ret = __ioapic_set_affinity(data, mask, &dest);
+	if (!ret) {
+		/* Only the high 8 bits are valid. */
+		dest = SET_APIC_LOGICAL_ID(dest);
+		__target_IO_APIC_irq(irq, dest, data->chip_data);
+		ret = IRQ_SET_MASK_OK_NOCOPY;
+	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+	return ret;
+}
+
 static void ack_apic_edge(struct irq_data *data)
 {
 	irq_complete_move(data->chip_data);
@@ -2541,9 +2580,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
 	chip->irq_ack = ir_ack_apic_edge;
 	chip->irq_eoi = ir_ack_apic_level;
 
-#ifdef CONFIG_SMP
 	chip->irq_set_affinity = set_remapped_irq_affinity;
-#endif
 }
 #endif /* CONFIG_IRQ_REMAP */
 
@@ -2554,9 +2591,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_unmask		= unmask_ioapic_irq,
 	.irq_ack		= ack_apic_edge,
 	.irq_eoi		= ack_apic_level,
-#ifdef CONFIG_SMP
 	.irq_set_affinity	= ioapic_set_affinity,
-#endif
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
@@ -3038,7 +3073,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	if (err)
 		return err;
 
-	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+	err = apic->cpu_mask_to_apicid_and(cfg->domain,
+					   apic->target_cpus(), &dest);
+	if (err)
+		return err;
 
 	if (irq_remapped(cfg)) {
 		compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
@@ -3072,7 +3110,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 	return err;
 }
 
-#ifdef CONFIG_SMP
 static int
 msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
@@ -3092,9 +3129,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 
 	__write_msi_msg(data->msi_desc, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
-#endif /* CONFIG_SMP */
 
 /*
  * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3105,9 +3141,7 @@ static struct irq_chip msi_chip = {
 	.irq_unmask		= unmask_msi_irq,
 	.irq_mask		= mask_msi_irq,
 	.irq_ack		= ack_apic_edge,
-#ifdef CONFIG_SMP
 	.irq_set_affinity	= msi_set_affinity,
-#endif
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
@@ -3192,7 +3226,6 @@ void native_teardown_msi_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_DMAR_TABLE
-#ifdef CONFIG_SMP
 static int
 dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
 		      bool force)
@@ -3214,19 +3247,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
 
 	dmar_msi_write(irq, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
-#endif /* CONFIG_SMP */
-
 static struct irq_chip dmar_msi_type = {
 	.name			= "DMAR_MSI",
 	.irq_unmask		= dmar_msi_unmask,
 	.irq_mask		= dmar_msi_mask,
 	.irq_ack		= ack_apic_edge,
-#ifdef CONFIG_SMP
 	.irq_set_affinity	= dmar_msi_set_affinity,
-#endif
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
@@ -3247,7 +3276,6 @@ int arch_setup_dmar_msi(unsigned int irq)
 
 #ifdef CONFIG_HPET_TIMER
 
-#ifdef CONFIG_SMP
 static int hpet_msi_set_affinity(struct irq_data *data,
 				 const struct cpumask *mask, bool force)
 {
@@ -3267,19 +3295,15 @@ static int hpet_msi_set_affinity(struct irq_data *data,
 
 	hpet_msi_write(data->handler_data, &msg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
-#endif /* CONFIG_SMP */
-
 static struct irq_chip hpet_msi_type = {
 	.name = "HPET_MSI",
 	.irq_unmask = hpet_msi_unmask,
 	.irq_mask = hpet_msi_mask,
 	.irq_ack = ack_apic_edge,
-#ifdef CONFIG_SMP
 	.irq_set_affinity = hpet_msi_set_affinity,
-#endif
 	.irq_retrigger = ioapic_retrigger_irq,
 };
 
@@ -3314,8 +3338,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
  */
 #ifdef CONFIG_HT_IRQ
 
-#ifdef CONFIG_SMP
-
 static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 {
 	struct ht_irq_msg msg;
@@ -3340,25 +3362,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 		return -1;
 
 	target_ht_irq(data->irq, dest, cfg->vector);
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
-#endif
-
 static struct irq_chip ht_irq_chip = {
 	.name			= "PCI-HT",
 	.irq_mask		= mask_ht_irq,
 	.irq_unmask		= unmask_ht_irq,
 	.irq_ack		= ack_apic_edge,
-#ifdef CONFIG_SMP
 	.irq_set_affinity	= ht_set_affinity,
-#endif
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
 	struct irq_cfg *cfg;
+	struct ht_irq_msg msg;
+	unsigned dest;
 	int err;
 
 	if (disable_apic)
@@ -3366,36 +3386,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
 	cfg = irq_cfg(irq);
 	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (!err) {
-		struct ht_irq_msg msg;
-		unsigned dest;
+	if (err)
+		return err;
+
+	err = apic->cpu_mask_to_apicid_and(cfg->domain,
+					   apic->target_cpus(), &dest);
+	if (err)
+		return err;
 
-		dest = apic->cpu_mask_to_apicid_and(cfg->domain,
-						    apic->target_cpus());
+	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
 
-		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+	msg.address_lo =
+		HT_IRQ_LOW_BASE |
+		HT_IRQ_LOW_DEST_ID(dest) |
+		HT_IRQ_LOW_VECTOR(cfg->vector) |
+		((apic->irq_dest_mode == 0) ?
+			HT_IRQ_LOW_DM_PHYSICAL :
+			HT_IRQ_LOW_DM_LOGICAL) |
+		HT_IRQ_LOW_RQEOI_EDGE |
+		((apic->irq_delivery_mode != dest_LowestPrio) ?
+			HT_IRQ_LOW_MT_FIXED :
+			HT_IRQ_LOW_MT_ARBITRATED) |
+		HT_IRQ_LOW_IRQ_MASKED;
 
-		msg.address_lo =
-			HT_IRQ_LOW_BASE |
-			HT_IRQ_LOW_DEST_ID(dest) |
-			HT_IRQ_LOW_VECTOR(cfg->vector) |
-			((apic->irq_dest_mode == 0) ?
-				HT_IRQ_LOW_DM_PHYSICAL :
-				HT_IRQ_LOW_DM_LOGICAL) |
-			HT_IRQ_LOW_RQEOI_EDGE |
-			((apic->irq_delivery_mode != dest_LowestPrio) ?
-				HT_IRQ_LOW_MT_FIXED :
-				HT_IRQ_LOW_MT_ARBITRATED) |
-			HT_IRQ_LOW_IRQ_MASKED;
+	write_ht_irq_msg(irq, &msg);
 
-		write_ht_irq_msg(irq, &msg);
+	irq_set_chip_and_handler_name(irq, &ht_irq_chip,
+				      handle_edge_irq, "edge");
 
-		irq_set_chip_and_handler_name(irq, &ht_irq_chip,
-					      handle_edge_irq, "edge");
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
 
-		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
-	}
-	return err;
+	return 0;
 }
 #endif /* CONFIG_HT_IRQ */
 
@@ -3563,7 +3584,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
 		/* Sanity check */
 		if (reg_00.bits.ID != apic_id) {
-			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+			pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
+			       ioapic);
 			return -1;
 		}
 	}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index f00a68cca37a..d661ee95cabf 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)
  * We use physical apicids here, not logical, so just return the default
  * physical broadcast to stop people from breaking us
  */
-static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	return 0x0F;
-}
-
-static inline unsigned int
+static int
 numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			     const struct cpumask *andmask)
+			     const struct cpumask *andmask,
+			     unsigned int *apicid)
 {
-	return 0x0F;
+	*apicid = 0x0F;
+	return 0;
 }
 
 /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
@@ -441,20 +438,6 @@ static int probe_numaq(void)
 	return found_numaq;
 }
 
-static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	/* Careful. Some cpus do not strictly honor the set of cpus
-	 * specified in the interrupt destination when using lowest
-	 * priority interrupt delivery mode.
-	 *
-	 * In particular there was a hyperthreading cpu observed to
-	 * deliver interrupts to the wrong hyperthread when only one
-	 * hyperthread was specified in the interrupt desitination.
-	 */
-	cpumask_clear(retmask);
-	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
 static void numaq_setup_portio_remap(void)
 {
 	int num_quads = num_online_nodes();
@@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = {
 	.check_apicid_used		= numaq_check_apicid_used,
 	.check_apicid_present		= numaq_check_apicid_present,
 
-	.vector_allocation_domain	= numaq_vector_allocation_domain,
+	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= numaq_init_apic_ldr,
 
 	.ioapic_phys_id_map		= numaq_ioapic_phys_id_map,
@@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0x0F << 24,
 
-	.cpu_mask_to_apicid		= numaq_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= numaq_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= numaq_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1b291da09e60..eb35ef9ee63f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)
 #endif
 }
 
-static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	/*
-	 * Careful. Some cpus do not strictly honor the set of cpus
-	 * specified in the interrupt destination when using lowest
-	 * priority interrupt delivery mode.
-	 *
-	 * In particular there was a hyperthreading cpu observed to
-	 * deliver interrupts to the wrong hyperthread when only one
-	 * hyperthread was specified in the interrupt desitination.
-	 */
-	cpumask_clear(retmask);
-	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
 /* should be called last. */
 static int probe_default(void)
 {
@@ -105,7 +90,7 @@ static struct apic apic_default = {
 	.check_apicid_used		= default_check_apicid_used,
 	.check_apicid_present		= default_check_apicid_present,
 
-	.vector_allocation_domain	= default_vector_allocation_domain,
+	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= default_init_apic_ldr,
 
 	.ioapic_phys_id_map		= default_ioapic_phys_id_map,
@@ -123,8 +108,7 @@ static struct apic apic_default = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0x0F << 24,
 
-	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= default_send_IPI_mask_logical,
 	.send_IPI_mask_allbutself	= default_send_IPI_mask_allbutself_logical,
@@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void)
 
 	if (apic->setup_apic_routing)
 		apic->setup_apic_routing();
+
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
 }
 
 void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe986698929..1793dba7a741 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
 #include <asm/ipi.h>
 #include <asm/setup.h>
 
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
-{
-	return hard_smp_processor_id() >> index_msb;
-}
-
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
 		}
 	}
 
-	if (is_vsmp_box()) {
-		/* need to update phys_pkg_id */
-		apic->phys_pkg_id = apicid_phys_pkg_id;
-	}
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
 }
 
 /* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 659897c00755..77c95c0e1bf7 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,6 +26,8 @@
  *
  */
 
+#define pr_fmt(fmt) "summit: %s: " fmt, __func__
+
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <asm/io.h>
@@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)
 
 static void summit_setup_apic_routing(void)
 {
-	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n",
-						nr_ioapics);
+	pr_info("Enabling APIC mode:  Summit.  Using %d I/O APICs\n",
+		nr_ioapics);
 }
 
 static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid)
 	return 1;
 }
 
-static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static inline int
+summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
 {
 	unsigned int round = 0;
-	int cpu, apicid = 0;
+	unsigned int cpu, apicid = 0;
 
 	/*
 	 * The cpus in the mask must all be on the apic cluster.
 	 */
-	for_each_cpu(cpu, cpumask) {
+	for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
 		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
 
 		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
-			printk("%s: Not a valid mask!\n", __func__);
-			return BAD_APICID;
+			pr_err("Not a valid mask!\n");
+			return -EINVAL;
 		}
 		apicid |= new_apicid;
 		round++;
 	}
-	return apicid;
+	if (!round)
+		return -EINVAL;
+	*dest_id = apicid;
+	return 0;
 }
 
-static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
-			      const struct cpumask *andmask)
+static int
+summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+			      const struct cpumask *andmask,
+			      unsigned int *apicid)
 {
-	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 	cpumask_var_t cpumask;
+	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
 
 	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
-		return apicid;
+		return 0;
 
 	cpumask_and(cpumask, inmask, andmask);
-	cpumask_and(cpumask, cpumask, cpu_online_mask);
-	apicid = summit_cpu_mask_to_apicid(cpumask);
+	summit_cpu_mask_to_apicid(cpumask, apicid);
 
 	free_cpumask_var(cpumask);
 
-	return apicid;
+	return 0;
 }
 
 /*
@@ -320,20 +327,6 @@ static int probe_summit(void)
 	return 0;
 }
 
-static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	/* Careful. Some cpus do not strictly honor the set of cpus
-	 * specified in the interrupt destination when using lowest
-	 * priority interrupt delivery mode.
-	 *
-	 * In particular there was a hyperthreading cpu observed to
-	 * deliver interrupts to the wrong hyperthread when only one
-	 * hyperthread was specified in the interrupt desitination.
-	 */
-	cpumask_clear(retmask);
-	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
 #ifdef CONFIG_X86_SUMMIT_NUMA
 static struct rio_table_hdr *rio_table_hdr;
 static struct scal_detail   *scal_devs[MAX_NUMNODES];
@@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 		}
 	}
 	if (i == rio_table_hdr->num_rio_dev) {
-		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
+		pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
 		return last_bus;
 	}
 
@@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 		}
 	}
 	if (i == rio_table_hdr->num_scal_dev) {
-		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
+		pr_err("Couldn't find owner Twister for Cyclone!\n");
 		return last_bus;
 	}
 
@@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 		num_buses = 9;
 		break;
 	default:
-		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
+		pr_info("Unsupported Winnipeg type!\n");
 		return last_bus;
 	}
 
@@ -411,13 +404,15 @@ static int build_detail_arrays(void)
 	int i, scal_detail_size, rio_detail_size;
 
 	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
-		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+		pr_warn("MAX_NUMNODES too low!  Defined as %d, but system has %d nodes\n",
+			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
 		return 0;
 	}
 
 	switch (rio_table_hdr->version) {
 	default:
-		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
+		pr_warn("Invalid Rio Grande Table Version: %d\n",
+			rio_table_hdr->version);
 		return 0;
 	case 2:
 		scal_detail_size = 11;
@@ -462,7 +457,7 @@ void setup_summit(void)
 		offset = *((unsigned short *)(ptr + offset));
 	}
 	if (!rio_table_hdr) {
-		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
+		pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
 		return;
 	}
 
@@ -509,7 +504,7 @@ static struct apic apic_summit = {
 	.check_apicid_used		= summit_check_apicid_used,
 	.check_apicid_present		= summit_check_apicid_present,
 
-	.vector_allocation_domain	= summit_vector_allocation_domain,
+	.vector_allocation_domain	= flat_vector_allocation_domain,
 	.init_apic_ldr			= summit_init_apic_ldr,
 
 	.ioapic_phys_id_map		= summit_ioapic_phys_id_map,
@@ -527,7 +522,6 @@ static struct apic apic_summit = {
 	.set_apic_id			= NULL,
 	.apic_id_mask			= 0xFF << 24,
 
-	.cpu_mask_to_apicid		= summit_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= summit_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= summit_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ff35cff0e1a7..c88baa4ff0e5 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 }
 
 static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
 	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
 }
@@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector)
 	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+			      const struct cpumask *andmask,
+			      unsigned int *apicid)
 {
-	/*
-	 * We're using fixed IRQ delivery, can only return one logical APIC ID.
-	 * May as well be the first.
-	 */
-	int cpu = cpumask_first(cpumask);
+	u32 dest = 0;
+	u16 cluster;
+	int i;
 
-	if ((unsigned)cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_logical_apicid, cpu);
-	else
-		return BAD_APICID;
-}
+	for_each_cpu_and(i, cpumask, andmask) {
+		if (!cpumask_test_cpu(i, cpu_online_mask))
+			continue;
+		dest = per_cpu(x86_cpu_to_logical_apicid, i);
+		cluster = x2apic_cluster(i);
+		break;
+	}
 
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			      const struct cpumask *andmask)
-{
-	int cpu;
+	if (!dest)
+		return -EINVAL;
 
-	/*
-	 * We're using fixed IRQ delivery, can only return one logical APIC ID.
-	 * May as well be the first.
-	 */
-	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
+	for_each_cpu_and(i, cpumask, andmask) {
+		if (!cpumask_test_cpu(i, cpu_online_mask))
+			continue;
+		if (cluster != x2apic_cluster(i))
+			continue;
+		dest |= per_cpu(x86_cpu_to_logical_apicid, i);
 	}
 
-	return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	*apicid = dest;
+
+	return 0;
 }
 
 static void init_x2apic_ldr(void)
@@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void)
 		return 0;
 }
 
+static const struct cpumask *x2apic_cluster_target_cpus(void)
+{
+	return cpu_all_mask;
+}
+
+/*
+ * Each x2apic cluster is an allocation domain.
+ */
+static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
+					     const struct cpumask *mask)
+{
+	/*
+	 * To minimize vector pressure, default case of boot, device bringup
+	 * etc will use a single cpu for the interrupt destination.
+	 *
+	 * On explicit migration requests coming from irqbalance etc,
+	 * interrupts will be routed to the x2apic cluster (cluster-id
+	 * derived from the first cpu in the mask) members specified
+	 * in the mask.
+	 */
+	if (mask == x2apic_cluster_target_cpus())
+		cpumask_copy(retmask, cpumask_of(cpu));
+	else
+		cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
+}
+
 static struct apic apic_x2apic_cluster = {
 
 	.name				= "cluster x2apic",
@@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = {
 	.irq_delivery_mode		= dest_LowestPrio,
 	.irq_dest_mode			= 1, /* logical */
 
-	.target_cpus			= x2apic_target_cpus,
+	.target_cpus			= x2apic_cluster_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= x2apic_vector_allocation_domain,
+	.vector_allocation_domain	= cluster_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
@@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = {
 	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
-	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= x2apic_send_IPI_mask,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c17e982db275..e03a1e180e81 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector)
 	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	int cpu = cpumask_first(cpumask);
-
-	if ((unsigned)cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_apicid, cpu);
-	else
-		return BAD_APICID;
-}
-
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			      const struct cpumask *andmask)
-{
-	int cpu;
-
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	for_each_cpu_and(cpu, cpumask, andmask) {
-		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
-	}
-
-	return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
 static void init_x2apic_ldr(void)
 {
 }
@@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = {
 	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 0, /* physical */
 
-	.target_cpus			= x2apic_target_cpus,
+	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= 0,
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= x2apic_vector_allocation_domain,
+	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= init_x2apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
@@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = {
 	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
-	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,
-	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and,
+	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= x2apic_send_IPI_mask,
 	.send_IPI_mask_allbutself	= x2apic_send_IPI_mask_allbutself,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c6d03f7a4401..8cfade9510a4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
 unsigned long sn_rtc_cycles_per_second;
 EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 
-static const struct cpumask *uv_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
 static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
 #ifdef CONFIG_SMP
@@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void)
 {
 }
 
-static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-	/*
-	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
-	 * May as well be the first.
-	 */
-	int cpu = cpumask_first(cpumask);
-
-	if ((unsigned)cpu < nr_cpu_ids)
-		return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
-	else
-		return BAD_APICID;
-}
-
-static unsigned int
+static int
 uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-			  const struct cpumask *andmask)
+			  const struct cpumask *andmask,
+			  unsigned int *apicid)
 {
-	int cpu;
+	int unsigned cpu;
 
 	/*
 	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 		if (cpumask_test_cpu(cpu, cpu_online_mask))
 			break;
 	}
-	return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+
+	if (likely(cpu < nr_cpu_ids)) {
+		*apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+		return 0;
+	}
+
+	return -EINVAL;
 }
 
 static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.irq_delivery_mode		= dest_Fixed,
 	.irq_dest_mode			= 0, /* physical */
 
-	.target_cpus			= uv_target_cpus,
+	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
 	.dest_logical			= APIC_DEST_LOGICAL,
 	.check_apicid_used		= NULL,
 	.check_apicid_present		= NULL,
 
-	.vector_allocation_domain	= uv_vector_allocation_domain,
+	.vector_allocation_domain	= default_vector_allocation_domain,
 	.init_apic_ldr			= uv_init_apic_ldr,
 
 	.ioapic_phys_id_map		= NULL,
@@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.set_apic_id			= set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
-	.cpu_mask_to_apicid		= uv_cpu_mask_to_apicid,
 	.cpu_mask_to_apicid_and		= uv_cpu_mask_to_apicid_and,
 
 	.send_IPI_mask			= uv_send_IPI_mask,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 07b0c0db466c..d65464e43503 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
  *    http://www.microsoft.com/whdc/archive/amp_12.mspx]
  */
 
+#define pr_fmt(fmt) "apm: " fmt
+
 #include <linux/module.h>
 
 #include <linux/poll.h>
@@ -485,11 +487,11 @@ static void apm_error(char *str, int err)
 		if (error_table[i].key == err)
 			break;
 	if (i < ERROR_COUNT)
-		printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+		pr_notice("%s: %s\n", str, error_table[i].msg);
 	else if (err < 0)
-		printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+		pr_notice("%s: linux error code %i\n", str, err);
 	else
-		printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+		pr_notice("%s: unknown error code %#2.2x\n",
 		       str, err);
 }
 
@@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
 			static int notified;
 
 			if (notified++ == 0)
-			    printk(KERN_ERR "apm: an event queue overflowed\n");
+				pr_err("an event queue overflowed\n");
 			if (++as->event_tail >= APM_MAX_EVENTS)
 				as->event_tail = 0;
 		}
@@ -1447,7 +1449,7 @@ static void apm_mainloop(void)
 static int check_apm_user(struct apm_user *as, const char *func)
 {
 	if (as == NULL || as->magic != APM_BIOS_MAGIC) {
-		printk(KERN_ERR "apm: %s passed bad filp\n", func);
+		pr_err("%s passed bad filp\n", func);
 		return 1;
 	}
 	return 0;
@@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)
 		     as1 = as1->next)
 			;
 		if (as1 == NULL)
-			printk(KERN_ERR "apm: filp not in user list\n");
+			pr_err("filp not in user list\n");
 		else
 			as1->next = as->next;
 	}
@@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)
 	struct apm_user *as;
 
 	as = kmalloc(sizeof(*as), GFP_KERNEL);
-	if (as == NULL) {
-		printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
-		       sizeof(*as));
+	if (as == NULL)
 		return -ENOMEM;
-	}
+
 	as->magic = APM_BIOS_MAGIC;
 	as->event_tail = as->event_head = 0;
 	as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2313,16 @@ static int __init apm_init(void)
 	}
 
 	if (apm_info.disabled) {
-		printk(KERN_NOTICE "apm: disabled on user request.\n");
+		pr_notice("disabled on user request.\n");
 		return -ENODEV;
 	}
 	if ((num_online_cpus() > 1) && !power_off && !smp) {
-		printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+		pr_notice("disabled - APM is not SMP safe.\n");
 		apm_info.disabled = 1;
 		return -ENODEV;
 	}
 	if (!acpi_disabled) {
-		printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+		pr_notice("overridden by ACPI.\n");
 		apm_info.disabled = 1;
 		return -ENODEV;
 	}
@@ -2356,8 +2356,7 @@ static int __init apm_init(void)
 
 	kapmd_task = kthread_create(apm, NULL, "kapmd");
 	if (IS_ERR(kapmd_task)) {
-		printk(KERN_ERR "apm: disabled - Unable to start kernel "
-				"thread.\n");
+		pr_err("disabled - Unable to start kernel thread\n");
 		err = PTR_ERR(kapmd_task);
 		kapmd_task = NULL;
 		remove_proc_entry("apm", NULL);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6ab6aa2fdfdd..d30a6a9a0121 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
+obj-y			+= vmware.o hypervisor.o mshyperv.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
 
@@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
 
 ifdef CONFIG_PERF_EVENTS
 obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_p4.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o
 endif
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 146bb6218eec..9d92e19039f0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,39 @@
 
 #include "cpu.h"
 
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+	u32 gprs[8] = { 0 };
+	int err;
+
+	WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
+
+	gprs[1] = msr;
+	gprs[7] = 0x9c5a203a;
+
+	err = rdmsr_safe_regs(gprs);
+
+	*p = gprs[0] | ((u64)gprs[2] << 32);
+
+	return err;
+}
+
+static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+{
+	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+	u32 gprs[8] = { 0 };
+
+	WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
+
+	gprs[0] = (u32)val;
+	gprs[1] = msr;
+	gprs[2] = val >> 32;
+	gprs[7] = 0x9c5a203a;
+
+	return wrmsr_safe_regs(gprs);
+}
+
 #ifdef CONFIG_X86_32
 /*
  *	B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	    !cpu_has(c, X86_FEATURE_TOPOEXT)) {
 		u64 val;
 
-		if (!rdmsrl_amd_safe(0xc0011005, &val)) {
+		if (!rdmsrl_safe(0xc0011005, &val)) {
 			val |= 1ULL << 54;
-			wrmsrl_amd_safe(0xc0011005, val);
+			wrmsrl_safe(0xc0011005, val);
 			rdmsrl(0xc0011005, val);
 			if (val & (1ULL << 54)) {
 				set_cpu_cap(c, X86_FEATURE_TOPOEXT);
@@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
 		if (err == 0) {
 			mask |= (1 << 10);
-			checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+			wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
 		}
 	}
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62ba..c97bb7b5a9f8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
 
 	if (!boot_cpu_data.hard_math) {
 #ifndef CONFIG_MATH_EMULATION
-		printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
-		printk(KERN_EMERG "Giving up.\n");
+		pr_emerg("No coprocessor found and no math emulation present\n");
+		pr_emerg("Giving up\n");
 		for (;;) ;
 #endif
 		return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 	boot_cpu_data.fdiv_bug = fdiv_bug;
 	if (boot_cpu_data.fdiv_bug)
-		printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
+		pr_warn("Hmm, FPU with FDIV bug\n");
 }
 
 static void __init check_hlt(void)
@@ -94,16 +94,16 @@ static void __init check_hlt(void)
 	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
-	printk(KERN_INFO "Checking 'hlt' instruction... ");
+	pr_info("Checking 'hlt' instruction... ");
 	if (!boot_cpu_data.hlt_works_ok) {
-		printk("disabled\n");
+		pr_cont("disabled\n");
 		return;
 	}
 	halt();
 	halt();
 	halt();
 	halt();
-	printk(KERN_CONT "OK.\n");
+	pr_cont("OK\n");
 }
 
 /*
@@ -116,7 +116,7 @@ static void __init check_popad(void)
 #ifndef CONFIG_X86_POPAD_OK
 	int res, inp = (int) &res;
 
-	printk(KERN_INFO "Checking for popad bug... ");
+	pr_info("Checking for popad bug... ");
 	__asm__ __volatile__(
 	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
 	  : "=&a" (res)
@@ -127,9 +127,9 @@ static void __init check_popad(void)
 	 * CPU hard. Too bad.
 	 */
 	if (res != 12345678)
-		printk(KERN_CONT "Buggy.\n");
+		pr_cont("Buggy\n");
 	else
-		printk(KERN_CONT "OK.\n");
+		pr_cont("OK\n");
 #endif
 }
 
@@ -161,7 +161,7 @@ void __init check_bugs(void)
 {
 	identify_boot_cpu();
 #ifndef CONFIG_SMP
-	printk(KERN_INFO "CPU: ");
+	pr_info("CPU: ");
 	print_cpu_info(&boot_cpu_data);
 #endif
 	check_config();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6b9333b429ba..a5fbc3c5fccc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -144,6 +144,8 @@ static int __init x86_xsave_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
 	return 1;
 }
 __setup("noxsave", x86_xsave_setup);
@@ -452,6 +454,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
 	c->x86_cache_size = l2size;
 }
 
+u16 __read_mostly tlb_lli_4k[NR_INFO];
+u16 __read_mostly tlb_lli_2m[NR_INFO];
+u16 __read_mostly tlb_lli_4m[NR_INFO];
+u16 __read_mostly tlb_lld_4k[NR_INFO];
+u16 __read_mostly tlb_lld_2m[NR_INFO];
+u16 __read_mostly tlb_lld_4m[NR_INFO];
+
+/*
+ * tlb_flushall_shift shows the balance point in replacing cr3 write
+ * with multiple 'invlpg'. It will do this replacement when
+ *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
+ * If tlb_flushall_shift is -1, means the replacement will be disabled.
+ */
+s8  __read_mostly tlb_flushall_shift = -1;
+
+void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+	if (this_cpu->c_detect_tlb)
+		this_cpu->c_detect_tlb(c);
+
+	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \
+		"tlb_flushall_shift is 0x%x\n",
+		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
+		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
+		tlb_flushall_shift);
+}
+
 void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_HT
@@ -911,6 +942,8 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
+	if (boot_cpu_data.cpuid_level >= 2)
+		cpu_detect_tlb(&boot_cpu_data);
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -947,7 +980,7 @@ static void __cpuinit __print_cpu_msr(void)
 		index_max = msr_range_array[i].max;
 
 		for (index = index_min; index < index_max; index++) {
-			if (rdmsrl_amd_safe(index, &val))
+			if (rdmsrl_safe(index, &val))
 				continue;
 			printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
 		}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 8bacc7826fb3..4041c24ae7db 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -20,10 +20,19 @@ struct cpu_dev {
 	void		(*c_bsp_init)(struct cpuinfo_x86 *);
 	void		(*c_init)(struct cpuinfo_x86 *);
 	void		(*c_identify)(struct cpuinfo_x86 *);
+	void		(*c_detect_tlb)(struct cpuinfo_x86 *);
 	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
 	int		c_x86_vendor;
 };
 
+struct _tlb_table {
+	unsigned char descriptor;
+	char tlb_type;
+	unsigned int entries;
+	/* unsigned int ways; */
+	char info[128];
+};
+
 #define cpu_dev_register(cpu_devX) \
 	static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
 	__attribute__((__section__(".x86_cpu_dev.init"))) = \
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb0743..a8f8fa9769d6 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 #endif
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+	&x86_hyper_kvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3e6ff6cbf42a..0a4ce2980a5a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
 }
 #endif
 
+#define TLB_INST_4K	0x01
+#define TLB_INST_4M	0x02
+#define TLB_INST_2M_4M	0x03
+
+#define TLB_INST_ALL	0x05
+#define TLB_INST_1G	0x06
+
+#define TLB_DATA_4K	0x11
+#define TLB_DATA_4M	0x12
+#define TLB_DATA_2M_4M	0x13
+#define TLB_DATA_4K_4M	0x14
+
+#define TLB_DATA_1G	0x16
+
+#define TLB_DATA0_4K	0x21
+#define TLB_DATA0_4M	0x22
+#define TLB_DATA0_2M_4M	0x23
+
+#define STLB_4K		0x41
+
+static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
+	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0x02, TLB_INST_4M,		2,	" TLB_INST 4 MByte pages, full associative" },
+	{ 0x03, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way set associative" },
+	{ 0x04, TLB_DATA_4M,		8,	" TLB_DATA 4 MByte pages, 4-way set associative" },
+	{ 0x05, TLB_DATA_4M,		32,	" TLB_DATA 4 MByte pages, 4-way set associative" },
+	{ 0x0b, TLB_INST_4M,		4,	" TLB_INST 4 MByte pages, 4-way set associative" },
+	{ 0x4f, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages */" },
+	{ 0x50, TLB_INST_ALL,		64,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x51, TLB_INST_ALL,		128,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x52, TLB_INST_ALL,		256,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x55, TLB_INST_2M_4M,		7,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+	{ 0x56, TLB_DATA0_4M,		16,	" TLB_DATA0 4 MByte pages, 4-way set associative" },
+	{ 0x57, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, 4-way associative" },
+	{ 0x59, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, fully associative" },
+	{ 0x5a, TLB_DATA0_2M_4M,	32,	" TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
+	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
+	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },
+	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
+	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },
+	{ 0x00, 0, 0 }
+};
+
+static void __cpuinit intel_tlb_lookup(const unsigned char desc)
+{
+	unsigned char k;
+	if (desc == 0)
+		return;
+
+	/* look up this descriptor in the table */
+	for (k = 0; intel_tlb_table[k].descriptor != desc && \
+			intel_tlb_table[k].descriptor != 0; k++)
+		;
+
+	if (intel_tlb_table[k].tlb_type == 0)
+		return;
+
+	switch (intel_tlb_table[k].tlb_type) {
+	case STLB_4K:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_ALL:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_4K:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_4M:
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_2M_4M:
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_4K:
+	case TLB_DATA0_4K:
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_4M:
+	case TLB_DATA0_4M:
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_2M_4M:
+	case TLB_DATA0_2M_4M:
+		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_4K_4M:
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	}
+}
+
+static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
+{
+	if (!cpu_has_invlpg) {
+		tlb_flushall_shift = -1;
+		return;
+	}
+	switch ((c->x86 << 8) + c->x86_model) {
+	case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 0x61d: /* six-core 45 nm xeon "Dunnington" */
+		tlb_flushall_shift = -1;
+		break;
+	case 0x61a: /* 45 nm nehalem, "Bloomfield" */
+	case 0x61e: /* 45 nm nehalem, "Lynnfield" */
+	case 0x625: /* 32 nm nehalem, "Clarkdale" */
+	case 0x62c: /* 32 nm nehalem, "Gulftown" */
+	case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
+	case 0x62f: /* 32 nm Xeon E7 */
+		tlb_flushall_shift = 6;
+		break;
+	case 0x62a: /* SandyBridge */
+	case 0x62d: /* SandyBridge, "Romely-EP" */
+		tlb_flushall_shift = 5;
+		break;
+	case 0x63a: /* Ivybridge */
+		tlb_flushall_shift = 1;
+		break;
+	default:
+		tlb_flushall_shift = 6;
+	}
+}
+
+static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+	int i, j, n;
+	unsigned int regs[4];
+	unsigned char *desc = (unsigned char *)regs;
+	/* Number of times to iterate */
+	n = cpuid_eax(2) & 0xFF;
+
+	for (i = 0 ; i < n ; i++) {
+		cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+
+		/* If bit 31 is set, this is an unknown format */
+		for (j = 0 ; j < 3 ; j++)
+			if (regs[j] & (1 << 31))
+				regs[j] = 0;
+
+		/* Byte 0 is level count, not a descriptor */
+		for (j = 1 ; j < 16 ; j++)
+			intel_tlb_lookup(desc[j]);
+	}
+	intel_tlb_flushall_shift_set(c);
+}
+
 static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
@@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
 	},
 	.c_size_cache	= intel_size_cache,
 #endif
+	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
 	.c_x86_vendor	= X86_VENDOR_INTEL,
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 413c2ced887c..13017626f9a8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -55,13 +55,6 @@ static struct severity {
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
-#define MCACOD 0xffff
-/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
-#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
-#define MCACOD_SCRUBMSK	0xfff0
-#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
-#define MCACOD_DATA	0x0134	/* Data Load */
-#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
 
 	MCESEV(
 		NO, "Invalid",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index da27c5d2168a..292d0258311c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
  * Copyright 2008 Intel Corporation
  * Author: Andi Kleen
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/thread_info.h>
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
@@ -57,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 int mce_disabled __read_mostly;
 
-#define MISC_MCELOG_MINOR	227
-
 #define SPINUNIT 100	/* 100ns */
 
 atomic_t mce_entry;
@@ -102,6 +103,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -210,7 +213,7 @@ static void drain_mcelog_buffer(void)
 				cpu_relax();
 
 				if (!m->finished && retries >= 4) {
-					pr_err("MCE: skipping error being logged currently!\n");
+					pr_err("skipping error being logged currently!\n");
 					break;
 				}
 			}
@@ -649,14 +652,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
  * Do a quick check if any of the events requires a panic.
  * This decides if we keep the events around or clear them.
  */
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+			  struct pt_regs *regs)
 {
 	int i, ret = 0;
 
 	for (i = 0; i < banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
-		if (m->status & MCI_STATUS_VAL)
+		if (m->status & MCI_STATUS_VAL) {
 			__set_bit(i, validp);
+			if (quirk_no_way_out)
+				quirk_no_way_out(i, m, regs);
+		}
 		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 			ret = 1;
 	}
@@ -1039,7 +1046,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	*final = m;
 
 	memset(valid_banks, 0, sizeof(valid_banks));
-	no_way_out = mce_no_way_out(&m, &msg, valid_banks);
+	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
 
 	barrier();
 
@@ -1167,8 +1174,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 {
 	/* mce_severity() should not hand us an ACTION_REQUIRED error */
 	BUG_ON(flags & MF_ACTION_REQUIRED);
-	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
-		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+	       pfn);
 
 	return 0;
 }
@@ -1186,6 +1194,7 @@ void mce_notify_process(void)
 {
 	unsigned long pfn;
 	struct mce_info *mi = mce_find_info();
+	int flags = MF_ACTION_REQUIRED;
 
 	if (!mi)
 		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@ -1200,8 +1209,9 @@ void mce_notify_process(void)
 	 * doomed. We still need to mark the page as poisoned and alert any
 	 * other users of the page.
 	 */
-	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
-			   mi->restartable == 0) {
+	if (!mi->restartable)
+		flags |= MF_MUST_KILL;
+	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
 		pr_err("Memory error not recovered");
 		force_sig(SIGBUS, current);
 	}
@@ -1358,11 +1368,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 
 	b = cap & MCG_BANKCNT_MASK;
 	if (!banks)
-		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+		pr_info("CPU supports %d MCE banks\n", b);
 
 	if (b > MAX_NR_BANKS) {
-		printk(KERN_WARNING
-		       "MCE: Using only %u machine check banks out of %u\n",
+		pr_warn("Using only %u machine check banks out of %u\n",
 			MAX_NR_BANKS, b);
 		b = MAX_NR_BANKS;
 	}
@@ -1415,11 +1424,39 @@ static void __mcheck_cpu_init_generic(void)
 	}
 }
 
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+	if (bank != 0)
+		return;
+	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+		return;
+	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+			  MCACOD)) !=
+			 (MCI_STATUS_UC|MCI_STATUS_EN|
+			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+			  MCI_STATUS_AR|MCACOD_INSTR))
+		return;
+
+	m->mcgstatus |= MCG_STATUS_EIPV;
+	m->ip = regs->ip;
+	m->cs = regs->cs;
+}
+
 /* Add per CPU specific workarounds here */
 static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
-		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+		pr_info("unknown CPU type - not enabling MCE support\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -1512,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 */
 		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
 			mce_bootlog = 0;
+
+		if (c->x86 == 6 && c->x86_model == 45)
+			quirk_no_way_out = quirk_sandybridge_ifu;
 	}
 	if (monarch_timeout < 0)
 		monarch_timeout = 0;
@@ -1574,7 +1614,7 @@ static void __mcheck_cpu_init_timer(void)
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
 	       smp_processor_id());
 }
 
@@ -1893,8 +1933,7 @@ static int __init mcheck_enable(char *str)
 			get_option(&str, &monarch_timeout);
 		}
 	} else {
-		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
-		       str);
+		pr_info("mce argument %s ignored. Please use /sys\n", str);
 		return 0;
 	}
 	return 1;
@@ -2342,7 +2381,7 @@ static __init int mcheck_init_device(void)
 
 	return err;
 }
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
 
 /*
  * Old style boot options parsing. Only for compatibility.
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f4873a64f46d..c4e916d77378 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,15 +1,17 @@
 /*
- *  (c) 2005, 2006 Advanced Micro Devices, Inc.
+ *  (c) 2005-2012 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
  *
  *  Written by Jacob Shin - AMD, Inc.
  *
- *  Support : jacob.shin@amd.com
+ *  Support: borislav.petkov@amd.com
  *
  *  April 2006
  *     - added support for AMD Family 0x10 processors
+ *  May 2012
+ *     - major scrubbing
  *
  *  All MC4_MISCi registers are shared between multi-cores
  */
@@ -25,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
+#include <asm/amd_nb.h>
 #include <asm/apic.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
@@ -45,23 +48,15 @@
 #define MASK_BLKPTR_LO    0xFF000000
 #define MCG_XBLK_ADDR     0xC0000400
 
-struct threshold_block {
-	unsigned int		block;
-	unsigned int		bank;
-	unsigned int		cpu;
-	u32			address;
-	u16			interrupt_enable;
-	bool			interrupt_capable;
-	u16			threshold_limit;
-	struct kobject		kobj;
-	struct list_head	miscj;
+static const char * const th_names[] = {
+	"load_store",
+	"insn_fetch",
+	"combined_unit",
+	"",
+	"northbridge",
+	"execution_unit",
 };
 
-struct threshold_bank {
-	struct kobject		*kobj;
-	struct threshold_block	*blocks;
-	cpumask_var_t		cpus;
-};
 static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
 
 static unsigned char shared_bank[NR_BANKS] = {
@@ -84,6 +79,26 @@ struct thresh_restart {
 	u16			old_limit;
 };
 
+static const char * const bank4_names(struct threshold_block *b)
+{
+	switch (b->address) {
+	/* MSR4_MISC0 */
+	case 0x00000413:
+		return "dram";
+
+	case 0xc0000408:
+		return "ht_links";
+
+	case 0xc0000409:
+		return "l3_cache";
+
+	default:
+		WARN(1, "Funny MSR: 0x%08x\n", b->address);
+		return "";
+	}
+};
+
+
 static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
 {
 	/*
@@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 
 			if (!block)
 				per_cpu(bank_map, cpu) |= (1 << bank);
-			if (shared_bank[bank] && c->cpu_core_id)
-				break;
 
 			memset(&b, 0, sizeof(b));
 			b.cpu			= cpu;
@@ -326,7 +339,7 @@ struct threshold_attr {
 #define SHOW_FIELDS(name)						\
 static ssize_t show_ ## name(struct threshold_block *b, char *buf)	\
 {									\
-	return sprintf(buf, "%lx\n", (unsigned long) b->name);		\
+	return sprintf(buf, "%lu\n", (unsigned long) b->name);		\
 }
 SHOW_FIELDS(interrupt_enable)
 SHOW_FIELDS(threshold_limit)
@@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
 	return size;
 }
 
-struct threshold_block_cross_cpu {
-	struct threshold_block	*tb;
-	long			retval;
-};
-
-static void local_error_count_handler(void *_tbcc)
-{
-	struct threshold_block_cross_cpu *tbcc = _tbcc;
-	struct threshold_block *b = tbcc->tb;
-	u32 low, high;
-
-	rdmsr(b->address, low, high);
-	tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
-}
-
 static ssize_t show_error_count(struct threshold_block *b, char *buf)
 {
-	struct threshold_block_cross_cpu tbcc = { .tb = b, };
+	u32 lo, hi;
 
-	smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
-	return sprintf(buf, "%lx\n", tbcc.retval);
-}
-
-static ssize_t store_error_count(struct threshold_block *b,
-				 const char *buf, size_t count)
-{
-	struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+	rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
 
-	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-	return 1;
+	return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+				     (THRESHOLD_MAX - b->threshold_limit)));
 }
 
+static struct threshold_attr error_count = {
+	.attr = {.name = __stringify(error_count), .mode = 0444 },
+	.show = show_error_count,
+};
+
 #define RW_ATTR(val)							\
 static struct threshold_attr val = {					\
 	.attr	= {.name = __stringify(val), .mode = 0644 },		\
@@ -418,7 +414,6 @@ static struct threshold_attr val = {					\
 
 RW_ATTR(interrupt_enable);
 RW_ATTR(threshold_limit);
-RW_ATTR(error_count);
 
 static struct attribute *default_attrs[] = {
 	&threshold_limit.attr,
@@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
 
 	err = kobject_init_and_add(&b->kobj, &threshold_ktype,
 				   per_cpu(threshold_banks, cpu)[bank]->kobj,
-				   "misc%i", block);
+				   (bank == 4 ? bank4_names(b) : th_names[bank]));
 	if (err)
 		goto out_free;
 recurse:
@@ -548,98 +543,91 @@ out_free:
 	return err;
 }
 
-static __cpuinit long
-local_allocate_threshold_blocks(int cpu, unsigned int bank)
+static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
 {
-	return allocate_threshold_blocks(cpu, bank, 0,
-					 MSR_IA32_MC0_MISC + bank * 4);
+	struct list_head *head = &b->blocks->miscj;
+	struct threshold_block *pos = NULL;
+	struct threshold_block *tmp = NULL;
+	int err = 0;
+
+	err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+	if (err)
+		return err;
+
+	list_for_each_entry_safe(pos, tmp, head, miscj) {
+
+		err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+		if (err) {
+			list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+				kobject_del(&pos->kobj);
+
+			return err;
+		}
+	}
+	return err;
 }
 
-/* symlinks sibling shared banks to first core.  first core owns dir/files. */
 static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
-	int i, err = 0;
-	struct threshold_bank *b = NULL;
 	struct device *dev = per_cpu(mce_device, cpu);
-	char name[32];
+	struct amd_northbridge *nb = NULL;
+	struct threshold_bank *b = NULL;
+	const char *name = th_names[bank];
+	int err = 0;
 
-	sprintf(name, "threshold_bank%i", bank);
+	if (shared_bank[bank]) {
 
-#ifdef CONFIG_SMP
-	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */
-		i = cpumask_first(cpu_llc_shared_mask(cpu));
+		nb = node_to_amd_nb(amd_get_nb_id(cpu));
+		WARN_ON(!nb);
 
-		/* first core not up yet */
-		if (cpu_data(i).cpu_core_id)
-			goto out;
+		/* threshold descriptor already initialized on this node? */
+		if (nb->bank4) {
+			/* yes, use it */
+			b = nb->bank4;
+			err = kobject_add(b->kobj, &dev->kobj, name);
+			if (err)
+				goto out;
 
-		/* already linked */
-		if (per_cpu(threshold_banks, cpu)[bank])
-			goto out;
+			per_cpu(threshold_banks, cpu)[bank] = b;
+			atomic_inc(&b->cpus);
 
-		b = per_cpu(threshold_banks, i)[bank];
+			err = __threshold_add_blocks(b);
 
-		if (!b)
 			goto out;
-
-		err = sysfs_create_link(&dev->kobj, b->kobj, name);
-		if (err)
-			goto out;
-
-		cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
-		per_cpu(threshold_banks, cpu)[bank] = b;
-
-		goto out;
+		}
 	}
-#endif
 
 	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
 	if (!b) {
 		err = -ENOMEM;
 		goto out;
 	}
-	if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
-		kfree(b);
-		err = -ENOMEM;
-		goto out;
-	}
 
 	b->kobj = kobject_create_and_add(name, &dev->kobj);
-	if (!b->kobj)
+	if (!b->kobj) {
+		err = -EINVAL;
 		goto out_free;
-
-#ifndef CONFIG_SMP
-	cpumask_setall(b->cpus);
-#else
-	cpumask_set_cpu(cpu, b->cpus);
-#endif
+	}
 
 	per_cpu(threshold_banks, cpu)[bank] = b;
 
-	err = local_allocate_threshold_blocks(cpu, bank);
-	if (err)
-		goto out_free;
-
-	for_each_cpu(i, b->cpus) {
-		if (i == cpu)
-			continue;
-
-		dev = per_cpu(mce_device, i);
-		if (dev)
-			err = sysfs_create_link(&dev->kobj,b->kobj, name);
-		if (err)
-			goto out;
+	if (shared_bank[bank]) {
+		atomic_set(&b->cpus, 1);
 
-		per_cpu(threshold_banks, i)[bank] = b;
+		/* nb is already initialized, see above */
+		WARN_ON(nb->bank4);
+		nb->bank4 = b;
 	}
 
-	goto out;
+	err = allocate_threshold_blocks(cpu, bank, 0,
+					MSR_IA32_MC0_MISC + bank * 4);
+	if (!err)
+		goto out;
 
-out_free:
-	per_cpu(threshold_banks, cpu)[bank] = NULL;
-	free_cpumask_var(b->cpus);
+ out_free:
 	kfree(b);
-out:
+
+ out:
 	return err;
 }
 
@@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
 	return err;
 }
 
-/*
- * let's be hotplug friendly.
- * in case of multiple core processors, the first core always takes ownership
- *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
- */
-
 static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
@@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu,
 	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
 }
 
+static void __threshold_remove_blocks(struct threshold_bank *b)
+{
+	struct threshold_block *pos = NULL;
+	struct threshold_block *tmp = NULL;
+
+	kobject_del(b->kobj);
+
+	list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+		kobject_del(&pos->kobj);
+}
+
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
+	struct amd_northbridge *nb;
 	struct threshold_bank *b;
-	struct device *dev;
-	char name[32];
-	int i = 0;
 
 	b = per_cpu(threshold_banks, cpu)[bank];
 	if (!b)
 		return;
+
 	if (!b->blocks)
 		goto free_out;
 
-	sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
-	/* sibling symlink */
-	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		dev = per_cpu(mce_device, cpu);
-		sysfs_remove_link(&dev->kobj, name);
-		per_cpu(threshold_banks, cpu)[bank] = NULL;
-
-		return;
-	}
-#endif
-
-	/* remove all sibling symlinks before unregistering */
-	for_each_cpu(i, b->cpus) {
-		if (i == cpu)
-			continue;
-
-		dev = per_cpu(mce_device, i);
-		if (dev)
-			sysfs_remove_link(&dev->kobj, name);
-		per_cpu(threshold_banks, i)[bank] = NULL;
+	if (shared_bank[bank]) {
+		if (!atomic_dec_and_test(&b->cpus)) {
+			__threshold_remove_blocks(b);
+			per_cpu(threshold_banks, cpu)[bank] = NULL;
+			return;
+		} else {
+			/*
+			 * the last CPU on this node using the shared bank is
+			 * going away, remove that bank now.
+			 */
+			nb = node_to_amd_nb(amd_get_nb_id(cpu));
+			nb->bank4 = NULL;
+		}
 	}
 
 	deallocate_threshold_block(cpu, bank);
@@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 free_out:
 	kobject_del(b->kobj);
 	kobject_put(b->kobj);
-	free_cpumask_var(b->cpus);
 	kfree(b);
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
@@ -777,4 +759,24 @@ static __init int threshold_init_device(void)
 
 	return 0;
 }
-device_initcall(threshold_init_device);
+/*
+ * there are 3 funcs which need to be _initcalled in a logic sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before
+ * native mce_chrdev_device registration if running under xen platform;
+ *
+ * mcheck_init_device should be inited before threshold_init_device to
+ * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
+ *
+ * so we use following _initcalls
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * when running under xen, the initcall order is 1,2,3;
+ * on baremetal, we skip 1 and we do only 2 and 3.
+ */
+late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index bdda2e6c673b..35ffda5d0727 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 
 		/* Compute the maximum size with which we can make a range: */
 		if (range_startk)
-			max_align = ffs(range_startk) - 1;
+			max_align = __ffs(range_startk);
 		else
-			max_align = 32;
+			max_align = BITS_PER_LONG - 1;
 
-		align = fls(range_sizek) - 1;
+		align = __fls(range_sizek);
 		if (align > max_align)
 			align = max_align;
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 75772ae6c65f..e9fe907cd249 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -361,11 +361,7 @@ static void __init print_mtrr_state(void)
 	}
 	pr_debug("MTRR variable ranges %sabled:\n",
 		 mtrr_state.enabled & 2 ? "en" : "dis");
-	if (size_or_mask & 0xffffffffUL)
-		high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
-	else
-		high_width = ffs(size_or_mask>>32) + 32 - 1;
-	high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
+	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
 
 	for (i = 0; i < num_var_ranges; ++i) {
 		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c4706cf9c011..915b876edd1e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -32,20 +32,11 @@
 #include <asm/smp.h>
 #include <asm/alternative.h>
 #include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
 
 #include "perf_event.h"
 
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) 					\
-do {								\
-	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
-			(unsigned long)(val));			\
-	native_write_msr((msr), (u32)((u64)(val)), 		\
-			(u32)((u64)(val) >> 32));		\
-} while (0)
-#endif
-
 struct x86_pmu x86_pmu __read_mostly;
 
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -74,7 +65,7 @@ u64 x86_perf_event_update(struct perf_event *event)
 	int idx = hwc->idx;
 	s64 delta;
 
-	if (idx == X86_PMC_IDX_FIXED_BTS)
+	if (idx == INTEL_PMC_IDX_FIXED_BTS)
 		return 0;
 
 	/*
@@ -86,7 +77,7 @@ u64 x86_perf_event_update(struct perf_event *event)
 	 */
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	rdmsrl(hwc->event_base, new_raw_count);
+	rdpmcl(hwc->event_base_rdpmc, new_raw_count);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 					new_raw_count) != prev_raw_count)
@@ -189,7 +180,7 @@ static void release_pmc_hardware(void) {}
 
 static bool check_hw_exists(void)
 {
-	u64 val, val_new = 0;
+	u64 val, val_new = ~0;
 	int i, reg, ret = 0;
 
 	/*
@@ -222,8 +213,9 @@ static bool check_hw_exists(void)
 	 * that don't trap on the MSR access and always return 0s.
 	 */
 	val = 0xabcdUL;
-	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
-	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
+	reg = x86_pmu_event_addr(0);
+	ret = wrmsrl_safe(reg, val);
+	ret |= rdmsrl_safe(reg, &val_new);
 	if (ret || val != val_new)
 		goto msr_fail;
 
@@ -240,6 +232,7 @@ bios_fail:
 
 msr_fail:
 	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+	printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
 
 	return false;
 }
@@ -388,7 +381,7 @@ int x86_pmu_hw_config(struct perf_event *event)
 		int precise = 0;
 
 		/* Support for constant skid */
-		if (x86_pmu.pebs_active) {
+		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
 			precise++;
 
 			/* Support for IP fixup */
@@ -637,8 +630,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	c = sched->constraints[sched->state.event];
 
 	/* Prefer fixed purpose counters */
-	if (x86_pmu.num_counters_fixed) {
-		idx = X86_PMC_IDX_FIXED;
+	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+		idx = INTEL_PMC_IDX_FIXED;
 		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 			if (!__test_and_set_bit(idx, sched->state.used))
 				goto done;
@@ -646,7 +639,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	}
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
-	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
 		if (!__test_and_set_bit(idx, sched->state.used))
 			goto done;
 	}
@@ -704,8 +697,8 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 /*
  * Assign a counter for each event.
  */
-static int perf_assign_events(struct event_constraint **constraints, int n,
-			      int wmin, int wmax, int *assign)
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int *assign)
 {
 	struct perf_sched sched;
 
@@ -824,15 +817,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 	hwc->last_cpu = smp_processor_id();
 	hwc->last_tag = ++cpuc->tags[i];
 
-	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
 		hwc->config_base = 0;
 		hwc->event_base	= 0;
-	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
+		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
 	} else {
 		hwc->config_base = x86_pmu_config_addr(hwc->idx);
 		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
+		hwc->event_base_rdpmc = hwc->idx;
 	}
 }
 
@@ -930,7 +925,7 @@ int x86_perf_event_set_period(struct perf_event *event)
 	s64 period = hwc->sample_period;
 	int ret = 0, idx = hwc->idx;
 
-	if (idx == X86_PMC_IDX_FIXED_BTS)
+	if (idx == INTEL_PMC_IDX_FIXED_BTS)
 		return 0;
 
 	/*
@@ -1316,7 +1311,6 @@ static struct attribute_group x86_pmu_format_group = {
 static int __init init_hw_perf_events(void)
 {
 	struct x86_pmu_quirk *quirk;
-	struct event_constraint *c;
 	int err;
 
 	pr_info("Performance Events: ");
@@ -1347,21 +1341,8 @@ static int __init init_hw_perf_events(void)
 	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
 		quirk->func();
 
-	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
-		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
-		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-	}
-	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
-	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
-		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
-		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-	}
-
-	x86_pmu.intel_ctrl |=
-		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+	if (!x86_pmu.intel_ctrl)
+		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
 
 	perf_events_lapic_init();
 	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
@@ -1370,22 +1351,6 @@ static int __init init_hw_perf_events(void)
 		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
 				   0, x86_pmu.num_counters, 0);
 
-	if (x86_pmu.event_constraints) {
-		/*
-		 * event on fixed counter2 (REF_CYCLES) only works on this
-		 * counter, so do not extend mask to generic counters
-		 */
-		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if (c->cmask != X86_RAW_EVENT_MASK
-			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
-				continue;
-			}
-
-			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-			c->weight += x86_pmu.num_counters;
-		}
-	}
-
 	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
 	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
 
@@ -1620,8 +1585,8 @@ static int x86_pmu_event_idx(struct perf_event *event)
 	if (!x86_pmu.attr_rdpmc)
 		return 0;
 
-	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
-		idx -= X86_PMC_IDX_FIXED;
+	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+		idx -= INTEL_PMC_IDX_FIXED;
 		idx |= 1 << 30;
 	}
 
@@ -1649,7 +1614,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
 			      struct device_attribute *attr,
 			      const char *buf, size_t count)
 {
-	unsigned long val = simple_strtoul(buf, NULL, 0);
+	unsigned long val;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
 
 	if (!!val != !!x86_pmu.attr_rdpmc) {
 		x86_pmu.attr_rdpmc = !!val;
@@ -1682,13 +1652,20 @@ static void x86_pmu_flush_branch_stack(void)
 		x86_pmu.flush_branch_stack();
 }
 
+void perf_check_microcode(void)
+{
+	if (x86_pmu.check_microcode)
+		x86_pmu.check_microcode();
+}
+EXPORT_SYMBOL_GPL(perf_check_microcode);
+
 static struct pmu pmu = {
 	.pmu_enable		= x86_pmu_enable,
 	.pmu_disable		= x86_pmu_disable,
 
-	.attr_groups	= x86_pmu_attr_groups,
+	.attr_groups		= x86_pmu_attr_groups,
 
-	.event_init	= x86_pmu_event_init,
+	.event_init		= x86_pmu_event_init,
 
 	.add			= x86_pmu_add,
 	.del			= x86_pmu_del,
@@ -1696,11 +1673,11 @@ static struct pmu pmu = {
 	.stop			= x86_pmu_stop,
 	.read			= x86_pmu_read,
 
-	.start_txn	= x86_pmu_start_txn,
-	.cancel_txn	= x86_pmu_cancel_txn,
-	.commit_txn	= x86_pmu_commit_txn,
+	.start_txn		= x86_pmu_start_txn,
+	.cancel_txn		= x86_pmu_cancel_txn,
+	.commit_txn		= x86_pmu_commit_txn,
 
-	.event_idx	= x86_pmu_event_idx,
+	.event_idx		= x86_pmu_event_idx,
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
@@ -1763,6 +1740,29 @@ valid_user_frame(const void __user *fp, unsigned long size)
 	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
+static unsigned long get_segment_base(unsigned int segment)
+{
+	struct desc_struct *desc;
+	int idx = segment >> 3;
+
+	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+		if (idx > LDT_ENTRIES)
+			return 0;
+
+		if (idx > current->active_mm->context.size)
+			return 0;
+
+		desc = current->active_mm->context.ldt;
+	} else {
+		if (idx > GDT_ENTRIES)
+			return 0;
+
+		desc = __this_cpu_ptr(&gdt_page.gdt[0]);
+	}
+
+	return get_desc_base(desc + idx);
+}
+
 #ifdef CONFIG_COMPAT
 
 #include <asm/compat.h>
@@ -1771,13 +1771,17 @@ static inline int
 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
+	unsigned long ss_base, cs_base;
 	struct stack_frame_ia32 frame;
 	const void __user *fp;
 
 	if (!test_thread_flag(TIF_IA32))
 		return 0;
 
-	fp = compat_ptr(regs->bp);
+	cs_base = get_segment_base(regs->cs);
+	ss_base = get_segment_base(regs->ss);
+
+	fp = compat_ptr(ss_base + regs->bp);
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
@@ -1790,8 +1794,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		if (!valid_user_frame(fp, sizeof(frame)))
 			break;
 
-		perf_callchain_store(entry, frame.return_address);
-		fp = compat_ptr(frame.next_frame);
+		perf_callchain_store(entry, cs_base + frame.return_address);
+		fp = compat_ptr(ss_base + frame.next_frame);
 	}
 	return 1;
 }
@@ -1814,6 +1818,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 	}
 
+	/*
+	 * We don't know what to do with VM86 stacks.. ignore them for now.
+	 */
+	if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+		return;
+
 	fp = (void __user *)regs->bp;
 
 	perf_callchain_store(entry, regs->ip);
@@ -1841,16 +1851,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	}
 }
 
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ *   VM86 - the good olde 16 bit days, where the linear address is
+ *          20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ *          to figure out what the 32bit base address is.
+ *
+ *    X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
 {
-	unsigned long ip;
+	/*
+	 * If we are in VM86 mode, add the segment offset to convert to a
+	 * linear address.
+	 */
+	if (regs->flags & X86_VM_MASK)
+		return 0x10 * regs->cs;
+
+	/*
+	 * For IA32 we look at the GDT/LDT segment base to convert the
+	 * effective IP to a linear address.
+	 */
+#ifdef CONFIG_X86_32
+	if (user_mode(regs) && regs->cs != __USER_CS)
+		return get_segment_base(regs->cs);
+#else
+	if (test_thread_flag(TIF_IA32)) {
+		if (user_mode(regs) && regs->cs != __USER32_CS)
+			return get_segment_base(regs->cs);
+	}
+#endif
+	return 0;
+}
 
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		ip = perf_guest_cbs->get_guest_ip();
-	else
-		ip = instruction_pointer(regs);
+		return perf_guest_cbs->get_guest_ip();
 
-	return ip;
+	return regs->ip + code_segment_base(regs);
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7241e2fc3c17..6605a81ba339 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,6 +14,18 @@
 
 #include <linux/perf_event.h>
 
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) 						\
+do {									\
+	unsigned int _msr = (msr);					\
+	u64 _val = (val);						\
+	trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr),		\
+			(unsigned long long)(_val));			\
+	native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32));	\
+} while (0)
+#endif
+
 /*
  *          |   NHM/WSM    |      SNB     |
  * register -------------------------------
@@ -57,7 +69,7 @@ struct amd_nb {
 };
 
 /* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		4
+#define MAX_PEBS_EVENTS		8
 
 /*
  * A debug store configuration.
@@ -349,6 +361,8 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+
+	void		(*check_microcode)(void);
 	void		(*flush_branch_stack)(void);
 
 	/*
@@ -360,12 +374,16 @@ struct x86_pmu {
 	/*
 	 * Intel DebugStore bits
 	 */
-	int		bts, pebs;
-	int		bts_active, pebs_active;
+	unsigned int	bts		:1,
+			bts_active	:1,
+			pebs		:1,
+			pebs_active	:1,
+			pebs_broken	:1;
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
+	int 		max_pebs_events;
 
 	/*
 	 * Intel LBR
@@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 
 void x86_pmu_enable_all(int added);
 
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int *assign);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
@@ -496,6 +516,26 @@ static inline bool kernel_ip(unsigned long ip)
 #endif
 }
 
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+	regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+	if (regs->flags & X86_VM_MASK)
+		regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+	regs->ip = ip;
+}
+
 #ifdef CONFIG_CPU_SUP_AMD
 
 int amd_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 11a4eb9131d5..4528ae7b6ec4 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu)
 
 	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
 
-	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
+	if (boot_cpu_data.x86_max_cores < 2)
 		return;
 
 	nb_id = amd_get_nb_id(cpu);
@@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = {
 	NULL,
 };
 
-static __initconst const struct x86_pmu amd_pmu = {
-	.name			= "AMD",
-	.handle_irq		= x86_pmu_handle_irq,
-	.disable_all		= x86_pmu_disable_all,
-	.enable_all		= x86_pmu_enable_all,
-	.enable			= x86_pmu_enable_event,
-	.disable		= x86_pmu_disable_event,
-	.hw_config		= amd_pmu_hw_config,
-	.schedule_events	= x86_schedule_events,
-	.eventsel		= MSR_K7_EVNTSEL0,
-	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= amd_pmu_event_map,
-	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_counters		= AMD64_NUM_COUNTERS,
-	.cntval_bits		= 48,
-	.cntval_mask		= (1ULL << 48) - 1,
-	.apic			= 1,
-	/* use highest bit to detect overflow */
-	.max_period		= (1ULL << 47) - 1,
-	.get_event_constraints	= amd_get_event_constraints,
-	.put_event_constraints	= amd_put_event_constraints,
-
-	.format_attrs		= amd_format_attr,
-
-	.cpu_prepare		= amd_pmu_cpu_prepare,
-	.cpu_starting		= amd_pmu_cpu_starting,
-	.cpu_dead		= amd_pmu_cpu_dead,
-};
-
 /* AMD Family 15h */
 
 #define AMD_EVENT_TYPE_MASK	0x000000F0ULL
@@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
 	}
 }
 
-static __initconst const struct x86_pmu amd_pmu_f15h = {
-	.name			= "AMD Family 15h",
+static __initconst const struct x86_pmu amd_pmu = {
+	.name			= "AMD",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
 	.enable_all		= x86_pmu_enable_all,
@@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
 	.disable		= x86_pmu_disable_event,
 	.hw_config		= amd_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
-	.eventsel		= MSR_F15H_PERF_CTL,
-	.perfctr		= MSR_F15H_PERF_CTR,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_counters		= AMD64_NUM_COUNTERS_F15H,
+	.num_counters		= AMD64_NUM_COUNTERS,
 	.cntval_bits		= 48,
 	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
-	.get_event_constraints	= amd_get_event_constraints_f15h,
-	/* nortbridge counters not yet implemented: */
-#if 0
+	.get_event_constraints	= amd_get_event_constraints,
 	.put_event_constraints	= amd_put_event_constraints,
 
+	.format_attrs		= amd_format_attr,
+
 	.cpu_prepare		= amd_pmu_cpu_prepare,
-	.cpu_dead		= amd_pmu_cpu_dead,
-#endif
 	.cpu_starting		= amd_pmu_cpu_starting,
-	.format_attrs		= amd_format_attr,
+	.cpu_dead		= amd_pmu_cpu_dead,
 };
 
+static int setup_event_constraints(void)
+{
+	if (boot_cpu_data.x86 >= 0x15)
+		x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+	return 0;
+}
+
+static int setup_perfctr_core(void)
+{
+	if (!cpu_has_perfctr_core) {
+		WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h,
+		     KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!");
+		return -ENODEV;
+	}
+
+	WARN(x86_pmu.get_event_constraints == amd_get_event_constraints,
+	     KERN_ERR "hw perf events core counters need constraints handler!");
+
+	/*
+	 * If core performance counter extensions exists, we must use
+	 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+	 * x86_pmu_addr_offset().
+	 */
+	x86_pmu.eventsel	= MSR_F15H_PERF_CTL;
+	x86_pmu.perfctr		= MSR_F15H_PERF_CTR;
+	x86_pmu.num_counters	= AMD64_NUM_COUNTERS_CORE;
+
+	printk(KERN_INFO "perf: AMD core performance counters detected\n");
+
+	return 0;
+}
+
 __init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
 	if (boot_cpu_data.x86 < 6)
 		return -ENODEV;
 
-	/*
-	 * If core performance counter extensions exists, it must be
-	 * family 15h, otherwise fail. See x86_pmu_addr_offset().
-	 */
-	switch (boot_cpu_data.x86) {
-	case 0x15:
-		if (!cpu_has_perfctr_core)
-			return -ENODEV;
-		x86_pmu = amd_pmu_f15h;
-		break;
-	default:
-		if (cpu_has_perfctr_core)
-			return -ENODEV;
-		x86_pmu = amd_pmu;
-		break;
-	}
+	x86_pmu = amd_pmu;
+
+	setup_event_constraints();
+	setup_perfctr_core();
 
 	/* Events are common for all AMDs */
 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index da9bcdcd9856..7bfb5bec8630 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -13,6 +13,8 @@
 
 #include <asm/apic.h>
 
+#include "perf_event.h"
+
 static u32 ibs_caps;
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
@@ -536,7 +538,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 	if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 	} else {
-		instruction_pointer_set(&regs, ibs_data.regs[1]);
+		set_linear_ip(&regs, ibs_data.regs[1]);
 		regs.flags |= PERF_EFLAGS_EXACT;
 	}
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 187c294bc658..7f2739e03e79 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,6 +5,8 @@
  * among events on a single PMU.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/stddef.h>
 #include <linux/types.h>
 #include <linux/init.h>
@@ -21,14 +23,14 @@
  */
 static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
 {
-  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
-  [PERF_COUNT_HW_REF_CPU_CYCLES]	= 0x0300, /* pseudo-encoding */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+	[PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
+	[PERF_COUNT_HW_REF_CPU_CYCLES]		= 0x0300, /* pseudo-encoding */
 };
 
 static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -136,6 +138,84 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+#define SNB_DMND_DATA_RD	(1ULL << 0)
+#define SNB_DMND_RFO		(1ULL << 1)
+#define SNB_DMND_IFETCH		(1ULL << 2)
+#define SNB_DMND_WB		(1ULL << 3)
+#define SNB_PF_DATA_RD		(1ULL << 4)
+#define SNB_PF_RFO		(1ULL << 5)
+#define SNB_PF_IFETCH		(1ULL << 6)
+#define SNB_LLC_DATA_RD		(1ULL << 7)
+#define SNB_LLC_RFO		(1ULL << 8)
+#define SNB_LLC_IFETCH		(1ULL << 9)
+#define SNB_BUS_LOCKS		(1ULL << 10)
+#define SNB_STRM_ST		(1ULL << 11)
+#define SNB_OTHER		(1ULL << 15)
+#define SNB_RESP_ANY		(1ULL << 16)
+#define SNB_NO_SUPP		(1ULL << 17)
+#define SNB_LLC_HITM		(1ULL << 18)
+#define SNB_LLC_HITE		(1ULL << 19)
+#define SNB_LLC_HITS		(1ULL << 20)
+#define SNB_LLC_HITF		(1ULL << 21)
+#define SNB_LOCAL		(1ULL << 22)
+#define SNB_REMOTE		(0xffULL << 23)
+#define SNB_SNP_NONE		(1ULL << 31)
+#define SNB_SNP_NOT_NEEDED	(1ULL << 32)
+#define SNB_SNP_MISS		(1ULL << 33)
+#define SNB_NO_FWD		(1ULL << 34)
+#define SNB_SNP_FWD		(1ULL << 35)
+#define SNB_HITM		(1ULL << 36)
+#define SNB_NON_DRAM		(1ULL << 37)
+
+#define SNB_DMND_READ		(SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE		(SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH	(SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+				 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+				 SNB_HITM)
+
+#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS		SNB_RESP_ANY
+#define SNB_L3_MISS		(SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_L3_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_L3_MISS,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+		[ C(RESULT_MISS)   ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+		[ C(RESULT_MISS)   ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+		[ C(RESULT_MISS)   ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+	},
+ },
+};
+
 static __initconst const u64 snb_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -233,16 +313,16 @@ static __initconst const u64 snb_hw_cache_event_ids
  },
  [ C(NODE) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = -1,
-		[ C(RESULT_MISS)   ] = -1,
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
  },
 
@@ -747,7 +827,7 @@ static void intel_pmu_disable_all(void)
 
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
-	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
 
 	intel_pmu_pebs_disable_all();
@@ -763,9 +843,9 @@ static void intel_pmu_enable_all(int added)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
 			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
-	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
 		struct perf_event *event =
-			cpuc->events[X86_PMC_IDX_FIXED_BTS];
+			cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
 
 		if (WARN_ON_ONCE(!event))
 			return;
@@ -871,7 +951,7 @@ static inline void intel_pmu_ack_status(u64 ack)
 
 static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
-	int idx = hwc->idx - X86_PMC_IDX_FIXED;
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
 
 	mask = 0xfULL << (idx * 4);
@@ -886,7 +966,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
 		intel_pmu_disable_bts();
 		intel_pmu_drain_bts_buffer();
 		return;
@@ -915,7 +995,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
 static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
-	int idx = hwc->idx - X86_PMC_IDX_FIXED;
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
 
 	/*
@@ -949,7 +1029,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
 		if (!__this_cpu_read(cpu_hw_events.enabled))
 			return;
 
@@ -1000,14 +1080,14 @@ static void intel_pmu_reset(void)
 
 	local_irq_save(flags);
 
-	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+	pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
-		checking_wrmsrl(x86_pmu_event_addr(idx),  0ull);
+		wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+		wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
 	}
 	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
-		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+		wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
 
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
@@ -1442,8 +1522,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
 	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
 	arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
 	arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+	/*
+	 * If PMU counter has PEBS enabled it is not enough to disable counter
+	 * on a guest entry since PEBS memory write can overshoot guest entry
+	 * and corrupt guest memory. Disabling PEBS solves the problem.
+	 */
+	arr[1].msr = MSR_IA32_PEBS_ENABLE;
+	arr[1].host = cpuc->pebs_enabled;
+	arr[1].guest = 0;
 
-	*nr = 1;
+	*nr = 2;
 	return arr;
 }
 
@@ -1707,16 +1795,61 @@ static __init void intel_clovertown_quirk(void)
 	 * But taken together it might just make sense to not enable PEBS on
 	 * these chips.
 	 */
-	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	pr_warn("PEBS disabled due to CPU errata\n");
 	x86_pmu.pebs = 0;
 	x86_pmu.pebs_constraints = NULL;
 }
 
+static int intel_snb_pebs_broken(int cpu)
+{
+	u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+	switch (cpu_data(cpu).x86_model) {
+	case 42: /* SNB */
+		rev = 0x28;
+		break;
+
+	case 45: /* SNB-EP */
+		switch (cpu_data(cpu).x86_mask) {
+		case 6: rev = 0x618; break;
+		case 7: rev = 0x70c; break;
+		}
+	}
+
+	return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+	int pebs_broken = 0;
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+			break;
+	}
+	put_online_cpus();
+
+	if (pebs_broken == x86_pmu.pebs_broken)
+		return;
+
+	/*
+	 * Serialized by the microcode lock..
+	 */
+	if (x86_pmu.pebs_broken) {
+		pr_info("PEBS enabled due to microcode update\n");
+		x86_pmu.pebs_broken = 0;
+	} else {
+		pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+		x86_pmu.pebs_broken = 1;
+	}
+}
+
 static __init void intel_sandybridge_quirk(void)
 {
-	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
-	x86_pmu.pebs = 0;
-	x86_pmu.pebs_constraints = NULL;
+	x86_pmu.check_microcode = intel_snb_check_microcode;
+	intel_snb_check_microcode();
 }
 
 static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -1736,8 +1869,8 @@ static __init void intel_arch_events_quirk(void)
 	/* disable event that reported as not presend by cpuid */
 	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
 		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
-		printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
-				intel_arch_events_map[bit].name);
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			intel_arch_events_map[bit].name);
 	}
 }
 
@@ -1756,7 +1889,7 @@ static __init void intel_nehalem_quirk(void)
 		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
 		ebx.split.no_branch_misses_retired = 0;
 		x86_pmu.events_maskl = ebx.full;
-		printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+		pr_info("CPU erratum AAJ80 worked around\n");
 	}
 }
 
@@ -1765,6 +1898,7 @@ __init int intel_pmu_init(void)
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
 	union cpuid10_ebx ebx;
+	struct event_constraint *c;
 	unsigned int unused;
 	int version;
 
@@ -1800,6 +1934,8 @@ __init int intel_pmu_init(void)
 	x86_pmu.events_maskl		= ebx.full;
 	x86_pmu.events_mask_len		= eax.split.mask_length;
 
+	x86_pmu.max_pebs_events		= min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
 	 * assume at least 3 events:
@@ -1914,6 +2050,8 @@ __init int intel_pmu_init(void)
 	case 58: /* IvyBridge */
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
 
@@ -1951,5 +2089,37 @@ __init int intel_pmu_init(void)
 		}
 	}
 
+	if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+		     x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+		x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+	}
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+	if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+		     x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+		x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+	}
+
+	x86_pmu.intel_ctrl |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		/*
+		 * event on fixed counter2 (REF_CYCLES) only works on this
+		 * counter, so do not extend mask to generic counters
+		 */
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if (c->cmask != X86_RAW_EVENT_MASK
+			    || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+				continue;
+			}
+
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35e2192df9f4..e38d97bf4259 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -248,7 +248,7 @@ void reserve_ds_buffers(void)
  */
 
 struct event_constraint bts_constraint =
-	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+	EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
 
 void intel_pmu_enable_bts(u64 config)
 {
@@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void)
 		u64	to;
 		u64	flags;
 	};
-	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+	struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
 	struct bts_record *at, *top;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
@@ -499,7 +499,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	 * We sampled a branch insn, rewind using the LBR stack
 	 */
 	if (ip == to) {
-		regs->ip = from;
+		set_linear_ip(regs, from);
 		return 1;
 	}
 
@@ -529,7 +529,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	} while (to < ip);
 
 	if (to == ip) {
-		regs->ip = old_to;
+		set_linear_ip(regs, old_to);
 		return 1;
 	}
 
@@ -569,7 +569,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
 	 */
 	regs = *iregs;
-	regs.ip = pebs->ip;
+	regs.flags = pebs->flags;
+	set_linear_ip(&regs, pebs->ip);
 	regs.bp = pebs->bp;
 	regs.sp = pebs->sp;
 
@@ -620,7 +621,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	 * Should not happen, we program the threshold at 1 and do not
 	 * set a reset value.
 	 */
-	WARN_ON_ONCE(n > 1);
+	WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
 	at += n - 1;
 
 	__intel_pmu_pebs_event(event, iregs, at);
@@ -651,10 +652,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	 * Should not happen, we program the threshold at 1 and do not
 	 * set a reset value.
 	 */
-	WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+	WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n);
 
 	for ( ; at < top; at++) {
-		for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+		for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) {
 			event = cpuc->events[bit];
 			if (!test_bit(bit, cpuc->active_mask))
 				continue;
@@ -670,7 +671,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			break;
 		}
 
-		if (!event || bit >= MAX_PEBS_EVENTS)
+		if (!event || bit >= x86_pmu.max_pebs_events)
 			continue;
 
 		__intel_pmu_pebs_event(event, iregs, at);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 000000000000..0a5571080e74
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,2915 @@
+#include "perf_event_intel_uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+static struct intel_uncore_type **msr_uncores = empty_uncore;
+static struct intel_uncore_type **pci_uncores = empty_uncore;
+/* pci bus to socket mapping */
+static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+
+static DEFINE_RAW_SPINLOCK(uncore_box_lock);
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint constraint_fixed =
+	EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+static struct event_constraint constraint_empty =
+	EVENT_CONSTRAINT(0, 0, 0);
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+
+static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	u64 count;
+
+	rdmsrl(event->hw.event_base, count);
+
+	return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+static struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_extra_reg *er;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	unsigned long flags;
+	bool ok = false;
+
+	/*
+	 * reg->alloc can be set due to existing state, so for fake box we
+	 * need to ignore this, otherwise we might fail to allocate proper
+	 * fake state for this extra reg constraint.
+	 */
+	if (reg1->idx == EXTRA_REG_NONE ||
+	    (!uncore_box_is_fake(box) && reg1->alloc))
+		return NULL;
+
+	er = &box->shared_regs[reg1->idx];
+	raw_spin_lock_irqsave(&er->lock, flags);
+	if (!atomic_read(&er->ref) ||
+	    (er->config1 == reg1->config && er->config2 == reg2->config)) {
+		atomic_inc(&er->ref);
+		er->config1 = reg1->config;
+		er->config2 = reg2->config;
+		ok = true;
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	if (ok) {
+		if (!uncore_box_is_fake(box))
+			reg1->alloc = 1;
+		return NULL;
+	}
+
+	return &constraint_empty;
+}
+
+static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_extra_reg *er;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+	/*
+	 * Only put constraint if extra reg was actually allocated. Also
+	 * takes care of event which do not use an extra shared reg.
+	 *
+	 * Also, if this is a fake box we shouldn't touch any event state
+	 * (reg->alloc) and we don't care about leaving inconsistent box
+	 * state either since it will be thrown out.
+	 */
+	if (uncore_box_is_fake(box) || !reg1->alloc)
+		return;
+
+	er = &box->shared_regs[reg1->idx];
+	atomic_dec(&er->ref);
+	reg1->alloc = 0;
+}
+
+/* Sandy Bridge-EP uncore support */
+static struct intel_uncore_type snbep_uncore_cbox;
+static struct intel_uncore_type snbep_uncore_pcu;
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 config;
+
+	pci_read_config_dword(pdev, box_ctl, &config);
+	config |= SNBEP_PMON_BOX_CTL_FRZ;
+	pci_write_config_dword(pdev, box_ctl, config);
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 config;
+
+	pci_read_config_dword(pdev, box_ctl, &config);
+	config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+	pci_write_config_dword(pdev, box_ctl, config);
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 count;
+
+	pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+	pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+	return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+
+	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	u64 config;
+	unsigned msr;
+
+	msr = uncore_msr_box_ctl(box);
+	if (msr) {
+		rdmsrl(msr, config);
+		config |= SNBEP_PMON_BOX_CTL_FRZ;
+		wrmsrl(msr, config);
+	}
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	u64 config;
+	unsigned msr;
+
+	msr = uncore_msr_box_ctl(box);
+	if (msr) {
+		rdmsrl(msr, config);
+		config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+		wrmsrl(msr, config);
+	}
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE)
+		wrmsrl(reg1->reg, reg1->config);
+
+	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+					struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+
+	if (msr)
+		wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+	if (box->pmu->type == &snbep_uncore_cbox) {
+		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+		reg1->config = event->attr.config1 &
+			SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK;
+	} else {
+		if (box->pmu->type == &snbep_uncore_pcu) {
+			reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+			reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK;
+		} else {
+			return 0;
+		}
+	}
+	reg1->idx = 0;
+
+	return 0;
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_filter_tid.attr,
+	&format_attr_filter_nid.attr,
+	&format_attr_filter_state.attr,
+	&format_attr_filter_opc.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_occ_sel.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	&format_attr_occ_invert.attr,
+	&format_attr_occ_edge.attr,
+	&format_attr_filter_band0.attr,
+	&format_attr_filter_band1.attr,
+	&format_attr_filter_band2.attr,
+	&format_attr_filter_band3.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+	&format_attr_event_ext.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
+	INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+	INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+	{ /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
+	INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+	INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x02,umask=0x08"),
+	INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x03,umask=0x04"),
+	{ /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+	.init_box	= snbep_uncore_msr_init_box,
+	.disable_box	= snbep_uncore_msr_disable_box,
+	.enable_box	= snbep_uncore_msr_enable_box,
+	.disable_event	= snbep_uncore_msr_disable_event,
+	.enable_event	= snbep_uncore_msr_enable_event,
+	.read_counter	= uncore_msr_read_counter,
+	.get_constraint = uncore_get_constraint,
+	.put_constraint = uncore_put_constraint,
+	.hw_config	= snbep_uncore_hw_config,
+};
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+	.init_box	= snbep_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= snbep_uncore_pci_disable_event,
+	.enable_event	= snbep_uncore_pci_enable_event,
+	.read_counter	= snbep_uncore_pci_read_counter,
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+	EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+	.name		= "ubox",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.fixed_ctr_bits	= 48,
+	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0,
+	.event_ctl	= SNBEP_U_MSR_PMON_CTL0,
+	.event_mask	= SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.ops		= &snbep_uncore_msr_ops,
+	.format_group	= &snbep_uncore_ubox_format_group,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 44,
+	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0,
+	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= SNBEP_CBO_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= snbep_uncore_cbox_constraints,
+	.ops			= &snbep_uncore_msr_ops,
+	.format_group		= &snbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+	.name			= "pcu",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
+	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &snbep_uncore_msr_ops,
+	.format_group		= &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+	&snbep_uncore_ubox,
+	&snbep_uncore_cbox,
+	&snbep_uncore_pcu,
+	NULL,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()				\
+	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
+	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
+	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,		\
+	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
+	.ops		= &snbep_uncore_pci_ops,		\
+	.format_group	= &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+	.name		= "ha",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 4,
+	.num_boxes	= 4,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= snbep_uncore_imc_events,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+	.name		= "qpi",
+	.num_counters   = 4,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 48,
+	.perf_ctr	= SNBEP_PCI_PMON_CTR0,
+	.event_ctl	= SNBEP_PCI_PMON_CTL0,
+	.event_mask	= SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,
+	.ops		= &snbep_uncore_pci_ops,
+	.event_descs	= snbep_uncore_qpi_events,
+	.format_group	= &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+	.name		= "r2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r2pcie_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+	.name		= "r3qpi",
+	.num_counters   = 3,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r3qpi_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+	&snbep_uncore_ha,
+	&snbep_uncore_imc,
+	&snbep_uncore_qpi,
+	&snbep_uncore_r2pcie,
+	&snbep_uncore_r3qpi,
+	NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
+	{ /* Home Agent */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+		.driver_data = (unsigned long)&snbep_uncore_ha,
+	},
+	{ /* MC Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+		.driver_data = (unsigned long)&snbep_uncore_imc,
+	},
+	{ /* MC Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+		.driver_data = (unsigned long)&snbep_uncore_imc,
+	},
+	{ /* MC Channel 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+		.driver_data = (unsigned long)&snbep_uncore_imc,
+	},
+	{ /* MC Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+		.driver_data = (unsigned long)&snbep_uncore_imc,
+	},
+	{ /* QPI Port 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+		.driver_data = (unsigned long)&snbep_uncore_qpi,
+	},
+	{ /* QPI Port 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+		.driver_data = (unsigned long)&snbep_uncore_qpi,
+	},
+	{ /* P2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+		.driver_data = (unsigned long)&snbep_uncore_r2pcie,
+	},
+	{ /* R3QPI Link 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+		.driver_data = (unsigned long)&snbep_uncore_r3qpi,
+	},
+	{ /* R3QPI Link 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+		.driver_data = (unsigned long)&snbep_uncore_r3qpi,
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+	.name		= "snbep_uncore",
+	.id_table	= snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static void snbep_pci2phy_map_init(void)
+{
+	struct pci_dev *ubox_dev = NULL;
+	int i, bus, nodeid;
+	u32 config;
+
+	while (1) {
+		/* find the UBOX device */
+		ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
+					PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX,
+					ubox_dev);
+		if (!ubox_dev)
+			break;
+		bus = ubox_dev->bus->number;
+		/* get the Node ID of the local register */
+		pci_read_config_dword(ubox_dev, 0x40, &config);
+		nodeid = config;
+		/* get the Node ID mapping */
+		pci_read_config_dword(ubox_dev, 0x54, &config);
+		/*
+		 * every three bits in the Node ID mapping register maps
+		 * to a particular node.
+		 */
+		for (i = 0; i < 8; i++) {
+			if (nodeid == ((config >> (3 * i)) & 0x7)) {
+				pcibus_to_physid[bus] = i;
+				break;
+			}
+		}
+	};
+	return;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+	else
+		wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->pmu_idx == 0) {
+		wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+			SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+	}
+}
+
+static struct attribute *snb_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask5.attr,
+	NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+	.name		= "format",
+	.attrs		= snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+	.init_box	= snb_uncore_msr_init_box,
+	.disable_event	= snb_uncore_msr_disable_event,
+	.enable_event	= snb_uncore_msr_enable_event,
+	.read_counter	= uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+	.name		= "cbox",
+	.num_counters   = 2,
+	.num_boxes	= 4,
+	.perf_ctr_bits	= 44,
+	.fixed_ctr_bits	= 48,
+	.perf_ctr	= SNB_UNC_CBO_0_PER_CTR0,
+	.event_ctl	= SNB_UNC_CBO_0_PERFEVTSEL0,
+	.fixed_ctr	= SNB_UNC_FIXED_CTR,
+	.fixed_ctl	= SNB_UNC_FIXED_CTR_CTRL,
+	.single_fixed	= 1,
+	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
+	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET,
+	.constraints	= snb_uncore_cbox_constraints,
+	.ops		= &snb_uncore_msr_ops,
+	.format_group	= &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+	&snb_uncore_cbox,
+	NULL,
+};
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+	else
+		wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask8.attr,
+	NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+	.name = "format",
+	.attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
+	INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
+	INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+	.disable_box	= nhm_uncore_msr_disable_box,
+	.enable_box	= nhm_uncore_msr_enable_box,
+	.disable_event	= snb_uncore_msr_disable_event,
+	.enable_event	= nhm_uncore_msr_enable_event,
+	.read_counter	= uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+	.name		= "",
+	.num_counters   = 8,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.event_ctl	= NHM_UNC_PERFEVTSEL0,
+	.perf_ctr	= NHM_UNC_UNCORE_PMC0,
+	.fixed_ctr	= NHM_UNC_FIXED_CTR,
+	.fixed_ctl	= NHM_UNC_FIXED_CTR_CTRL,
+	.event_mask	= NHM_UNC_RAW_EVENT_MASK,
+	.event_descs	= nhm_uncore_events,
+	.ops		= &nhm_uncore_msr_ops,
+	.format_group	= &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+	&nhm_uncore,
+	NULL,
+};
+/* end of Nehalem uncore support */
+
+/* Nehalem-EX uncore support */
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+				((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 config;
+
+	if (msr) {
+		rdmsrl(msr, config);
+		config &= ~((1ULL << uncore_num_counters(box)) - 1);
+		/* WBox has a fixed counter */
+		if (uncore_msr_fixed_ctl(box))
+			config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+		wrmsrl(msr, config);
+	}
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 config;
+
+	if (msr) {
+		rdmsrl(msr, config);
+		config |= (1ULL << uncore_num_counters(box)) - 1;
+		/* WBox has a fixed counter */
+		if (uncore_msr_fixed_ctl(box))
+			config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+		wrmsrl(msr, config);
+	}
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+	else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+	else
+		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT()				\
+	.init_box	= nhmex_uncore_msr_init_box,		\
+	.disable_box	= nhmex_uncore_msr_disable_box,		\
+	.enable_box	= nhmex_uncore_msr_enable_box,		\
+	.disable_event	= nhmex_uncore_msr_disable_event,	\
+	.read_counter	= uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event	= nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_edge.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+	.name		= "format",
+	.attrs		= nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+	.name		= "ubox",
+	.num_counters	= 1,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.event_ctl	= NHMEX_U_MSR_PMON_EV_SEL,
+	.perf_ctr	= NHMEX_U_MSR_PMON_CTR,
+	.event_mask	= NHMEX_U_PMON_RAW_EVENT_MASK,
+	.box_ctl	= NHMEX_U_MSR_PMON_GLOBAL_CTL,
+	.ops		= &nhmex_uncore_ops,
+	.format_group	= &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+	0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 6,
+	.num_boxes		= 10,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_C0_MSR_PMON_EV_SEL0,
+	.perf_ctr		= NHMEX_C0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+	.msr_offsets		= nhmex_cbox_msr_offsets,
+	.pair_ctr_ctl		= 1,
+	.ops			= &nhmex_uncore_ops,
+	.format_group		= &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+	.name			= "wbox",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_W_MSR_PMON_CNT0,
+	.perf_ctr		= NHMEX_W_MSR_PMON_EVT_SEL0,
+	.fixed_ctr		= NHMEX_W_MSR_PMON_FIXED_CTR,
+	.fixed_ctl		= NHMEX_W_MSR_PMON_FIXED_CTL,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_W_MSR_GLOBAL_CTL,
+	.pair_ctr_ctl		= 1,
+	.event_descs		= nhmex_uncore_wbox_events,
+	.ops			= &nhmex_uncore_ops,
+	.format_group		= &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int ctr, ev_sel;
+
+	ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+		NHMEX_B_PMON_CTR_SHIFT;
+	ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+		  NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+	/* events that do not use the match/mask registers */
+	if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+	    (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+		return 0;
+
+	if (box->pmu->pmu_idx == 0)
+		reg1->reg = NHMEX_B0_MSR_MATCH;
+	else
+		reg1->reg = NHMEX_B1_MSR_MATCH;
+	reg1->idx = 0;
+	reg1->config = event->attr.config1;
+	reg2->config = event->attr.config2;
+	return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg1->reg, reg1->config);
+		wrmsrl(reg1->reg + 1, reg2->config);
+	}
+	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+		(hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+	EVENT_CONSTRAINT(0 , 1, 0xc0),
+	EVENT_CONSTRAINT(0x40, 2, 0xc0),
+	EVENT_CONSTRAINT(0x80, 4, 0xc0),
+	EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+	EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+	&format_attr_event5.attr,
+	&format_attr_counter.attr,
+	&format_attr_match.attr,
+	&format_attr_mask.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_bbox_msr_enable_event,
+	.hw_config		= nhmex_bbox_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+	.name			= "bbox",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_B0_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_B0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_B_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+	.msr_offset		= NHMEX_B_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 1,
+	.constraints		= nhmex_uncore_bbox_constraints,
+	.ops			= &nhmex_uncore_bbox_ops,
+	.format_group		= &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	/* only TO_R_PROG_EV event uses the match/mask register */
+	if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+	    NHMEX_S_EVENT_TO_R_PROG_EV)
+		return 0;
+
+	if (box->pmu->pmu_idx == 0)
+		reg1->reg = NHMEX_S0_MSR_MM_CFG;
+	else
+		reg1->reg = NHMEX_S1_MSR_MM_CFG;
+	reg1->idx = 0;
+	reg1->config = event->attr.config1;
+	reg2->config = event->attr.config2;
+	return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg1->reg, 0);
+		wrmsrl(reg1->reg + 1, reg1->config);
+		wrmsrl(reg1->reg + 2, reg2->config);
+		wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+	}
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_match.attr,
+	&format_attr_mask.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+	.name			= "format",
+	.attrs			= nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_sbox_msr_enable_event,
+	.hw_config		= nhmex_sbox_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+	.name			= "sbox",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_S0_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_S0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+	.msr_offset		= NHMEX_S_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 1,
+	.ops			= &nhmex_uncore_sbox_ops,
+	.format_group		= &nhmex_uncore_sbox_format_group
+};
+
+enum {
+	EXTRA_REG_NHMEX_M_FILTER,
+	EXTRA_REG_NHMEX_M_DSP,
+	EXTRA_REG_NHMEX_M_ISS,
+	EXTRA_REG_NHMEX_M_MAP,
+	EXTRA_REG_NHMEX_M_MSC_THR,
+	EXTRA_REG_NHMEX_M_PGT,
+	EXTRA_REG_NHMEX_M_PLD,
+	EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+	MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+	MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+	MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+	MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+	/* event 0xa uses two extra registers */
+	MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+	MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+	MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+	/* events 0xd ~ 0x10 use the same extra register */
+	MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+	EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	bool ret = false;
+	u64 mask;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[idx];
+		raw_spin_lock_irqsave(&er->lock, flags);
+		if (!atomic_read(&er->ref) || er->config == config) {
+			atomic_inc(&er->ref);
+			er->config = config;
+			ret = true;
+		}
+		raw_spin_unlock_irqrestore(&er->lock, flags);
+
+		return ret;
+	}
+	/*
+	 * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+	 * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+	 * fields which are shared.
+	 */
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (WARN_ON_ONCE(idx >= 4))
+		return false;
+
+	/* mask of the shared fields */
+	if (uncore_nhmex)
+		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	else
+		mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+	raw_spin_lock_irqsave(&er->lock, flags);
+	/* add mask of the non-shared field if it's in use */
+	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+		if (uncore_nhmex)
+			mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	}
+
+	if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+		atomic_add(1 << (idx * 8), &er->ref);
+		if (uncore_nhmex)
+			mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		er->config &= ~mask;
+		er->config |= (config & mask);
+		ret = true;
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[idx];
+		atomic_dec(&er->ref);
+		return;
+	}
+
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+	atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+	u64 config = reg1->config;
+
+	/* get the non-shared control bits and shift them */
+	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (uncore_nhmex)
+		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	else
+		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (new_idx > orig_idx) {
+		idx = new_idx - orig_idx;
+		config <<= 3 * idx;
+	} else {
+		idx = orig_idx - new_idx;
+		config >>= 3 * idx;
+	}
+
+	/* add the shared control bits back */
+	if (uncore_nhmex)
+		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	else
+		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	if (modify) {
+		/* adjust the main event selector */
+		if (new_idx > orig_idx)
+			hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		else
+			hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		reg1->config = config;
+		reg1->idx = ~0xff | new_idx;
+	}
+	return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int i, idx[2], alloc = 0;
+	u64 config1 = reg1->config;
+
+	idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+	idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+	for (i = 0; i < 2; i++) {
+		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+			idx[i] = 0xff;
+
+		if (idx[i] == 0xff)
+			continue;
+
+		if (!nhmex_mbox_get_shared_reg(box, idx[i],
+				__BITS_VALUE(config1, i, 32)))
+			goto fail;
+		alloc |= (0x1 << i);
+	}
+
+	/* for the match/mask registers */
+	if (reg2->idx != EXTRA_REG_NONE &&
+	    (uncore_box_is_fake(box) || !reg2->alloc) &&
+	    !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+		goto fail;
+
+	/*
+	 * If it's a fake box -- as per validate_{group,event}() we
+	 * shouldn't touch event state and we can avoid doing so
+	 * since both will only call get_event_constraints() once
+	 * on each event, this avoids the need for reg->alloc.
+	 */
+	if (!uncore_box_is_fake(box)) {
+		if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+			nhmex_mbox_alter_er(event, idx[0], true);
+		reg1->alloc |= alloc;
+		if (reg2->idx != EXTRA_REG_NONE)
+			reg2->alloc = 1;
+	}
+	return NULL;
+fail:
+	if (idx[0] != 0xff && !(alloc & 0x1) &&
+	    idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		/*
+		 * events 0xd ~ 0x10 are functional identical, but are
+		 * controlled by different fields in the ZDP_CTL_FVC
+		 * register. If we failed to take one field, try the
+		 * rest 3 choices.
+		 */
+		BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+		idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		idx[0] = (idx[0] + 1) % 4;
+		idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+			config1 = nhmex_mbox_alter_er(event, idx[0], false);
+			goto again;
+		}
+	}
+
+	if (alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, idx[0]);
+	if (alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, idx[1]);
+	return &constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	if (reg1->alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+	if (reg1->alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+	reg1->alloc = 0;
+
+	if (reg2->alloc) {
+		nhmex_mbox_put_shared_reg(box, reg2->idx);
+		reg2->alloc = 0;
+	}
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+	if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		return er->idx;
+	return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_type *type = box->pmu->type;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	struct extra_reg *er;
+	unsigned msr;
+	int reg_idx = 0;
+	/*
+	 * The mbox events may require 2 extra MSRs at the most. But only
+	 * the lower 32 bits in these MSRs are significant, so we can use
+	 * config1 to pass two MSRs' config.
+	 */
+	for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		if (event->attr.config1 & ~er->valid_mask)
+			return -EINVAL;
+
+		msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+		if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+			return -EINVAL;
+
+		/* always use the 32~63 bits to pass the PLD config */
+		if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+			reg_idx = 1;
+		else if (WARN_ON_ONCE(reg_idx > 0))
+			return -EINVAL;
+
+		reg1->idx &= ~(0xff << (reg_idx * 8));
+		reg1->reg &= ~(0xffff << (reg_idx * 16));
+		reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+		reg1->reg |= msr << (reg_idx * 16);
+		reg1->config = event->attr.config1;
+		reg_idx++;
+	}
+	/*
+	 * The mbox only provides ability to perform address matching
+	 * for the PLD events.
+	 */
+	if (reg_idx == 2) {
+		reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+		if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+			reg2->config = event->attr.config2;
+		else
+			reg2->config = ~0ULL;
+		if (box->pmu->pmu_idx == 0)
+			reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+		else
+			reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+	}
+	return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	u64 config;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		return box->shared_regs[idx].config;
+
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+	raw_spin_lock_irqsave(&er->lock, flags);
+	config = er->config;
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+	return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int idx;
+
+	idx = __BITS_VALUE(reg1->idx, 0, 8);
+	if (idx != 0xff)
+		wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+			nhmex_mbox_shared_reg_config(box, idx));
+	idx = __BITS_VALUE(reg1->idx, 1, 8);
+	if (idx != 0xff)
+		wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+			nhmex_mbox_shared_reg_config(box, idx));
+
+	if (reg2->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg2->reg, 0);
+		if (reg2->config != ~0ULL) {
+			wrmsrl(reg2->reg + 1,
+				reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+			wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+				(reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+			wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+		}
+	}
+
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode,		count_mode,	"config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode,		storage_mode,	"config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,		wrap_mode,	"config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode,		flag_mode,	"config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel,		inc_sel,	"config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,		set_flag_sel,	"config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,	filter_cfg_en,	"config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match,		filter_match,	"config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask,		filter_mask,	"config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp,			dsp,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr,			thr,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc,			fvc,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt,			pgt,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map,			map,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss,			iss,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld,			pld,		"config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+	&format_attr_count_mode.attr,
+	&format_attr_storage_mode.attr,
+	&format_attr_wrap_mode.attr,
+	&format_attr_flag_mode.attr,
+	&format_attr_inc_sel.attr,
+	&format_attr_set_flag_sel.attr,
+	&format_attr_filter_cfg_en.attr,
+	&format_attr_filter_match.attr,
+	&format_attr_filter_mask.attr,
+	&format_attr_dsp.attr,
+	&format_attr_thr.attr,
+	&format_attr_fvc.attr,
+	&format_attr_pgt.attr,
+	&format_attr_map.attr,
+	&format_attr_iss.attr,
+	&format_attr_pld.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+	.name		= "format",
+	.attrs		= nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+	{ /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event	= nhmex_mbox_msr_enable_event,
+	.hw_config	= nhmex_mbox_hw_config,
+	.get_constraint	= nhmex_mbox_get_constraint,
+	.put_constraint	= nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+	.name			= "mbox",
+	.num_counters		= 6,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_M0_MSR_PMU_CTL0,
+	.perf_ctr		= NHMEX_M0_MSR_PMU_CNT0,
+	.event_mask		= NHMEX_M_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_M0_MSR_GLOBAL_CTL,
+	.msr_offset		= NHMEX_M_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 8,
+	.event_descs		= nhmex_uncore_mbox_events,
+	.ops			= &nhmex_uncore_mbox_ops,
+	.format_group		= &nhmex_uncore_mbox_format_group,
+};
+
+void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int port;
+
+	/* adjust the main event selector and extra register index */
+	if (reg1->idx % 2) {
+		reg1->idx--;
+		hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	} else {
+		reg1->idx++;
+		hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	}
+
+	/* adjust extra register config */
+	port = reg1->idx / 6 + box->pmu->pmu_idx * 4;
+	switch (reg1->idx % 6) {
+	case 2:
+		/* shift the 8~15 bits to the 0~7 bits */
+		reg1->config >>= 8;
+		break;
+	case 3:
+		/* shift the 0~7 bits to the 8~15 bits */
+		reg1->config <<= 8;
+		break;
+	};
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	int idx, er_idx;
+	u64 config1;
+	bool ok = false;
+
+	if (!uncore_box_is_fake(box) && reg1->alloc)
+		return NULL;
+
+	idx = reg1->idx % 6;
+	config1 = reg1->config;
+again:
+	er_idx = idx;
+	/* the 3rd and 4th events use the same extra register */
+	if (er_idx > 2)
+		er_idx--;
+	er_idx += (reg1->idx / 6) * 5;
+
+	er = &box->shared_regs[er_idx];
+	raw_spin_lock_irqsave(&er->lock, flags);
+	if (idx < 2) {
+		if (!atomic_read(&er->ref) || er->config == reg1->config) {
+			atomic_inc(&er->ref);
+			er->config = reg1->config;
+			ok = true;
+		}
+	} else if (idx == 2 || idx == 3) {
+		/*
+		 * these two events use different fields in a extra register,
+		 * the 0~7 bits and the 8~15 bits respectively.
+		 */
+		u64 mask = 0xff << ((idx - 2) * 8);
+		if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+				!((er->config ^ config1) & mask)) {
+			atomic_add(1 << ((idx - 2) * 8), &er->ref);
+			er->config &= ~mask;
+			er->config |= config1 & mask;
+			ok = true;
+		}
+	} else {
+		if (!atomic_read(&er->ref) ||
+				(er->config == (hwc->config >> 32) &&
+				 er->config1 == reg1->config &&
+				 er->config2 == reg2->config)) {
+			atomic_inc(&er->ref);
+			er->config = (hwc->config >> 32);
+			er->config1 = reg1->config;
+			er->config2 = reg2->config;
+			ok = true;
+		}
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	if (!ok) {
+		/*
+		 * The Rbox events are always in pairs. The paired
+		 * events are functional identical, but use different
+		 * extra registers. If we failed to take an extra
+		 * register, try the alternative.
+		 */
+		if (idx % 2)
+			idx--;
+		else
+			idx++;
+		if (idx != reg1->idx % 6) {
+			if (idx == 2)
+				config1 >>= 8;
+			else if (idx == 3)
+				config1 <<= 8;
+			goto again;
+		}
+	} else {
+		if (!uncore_box_is_fake(box)) {
+			if (idx != reg1->idx % 6)
+				nhmex_rbox_alter_er(box, event);
+			reg1->alloc = 1;
+		}
+		return NULL;
+	}
+	return &constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_extra_reg *er;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	int idx, er_idx;
+
+	if (uncore_box_is_fake(box) || !reg1->alloc)
+		return;
+
+	idx = reg1->idx % 6;
+	er_idx = idx;
+	if (er_idx > 2)
+		er_idx--;
+	er_idx += (reg1->idx / 6) * 5;
+
+	er = &box->shared_regs[er_idx];
+	if (idx == 2 || idx == 3)
+		atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+	else
+		atomic_dec(&er->ref);
+
+	reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int idx;
+
+	idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+		NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	if (idx >= 0x18)
+		return -EINVAL;
+
+	reg1->idx = idx;
+	reg1->config = event->attr.config1;
+
+	switch (idx % 6) {
+	case 4:
+	case 5:
+		hwc->config |= event->attr.config & (~0ULL << 32);
+		reg2->config = event->attr.config2;
+		break;
+	};
+	return 0;
+}
+
+static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	u64 config;
+
+	er = &box->shared_regs[idx];
+
+	raw_spin_lock_irqsave(&er->lock, flags);
+	config = er->config;
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	return config;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int idx, port;
+
+	idx = reg1->idx;
+	port = idx / 6 + box->pmu->pmu_idx * 4;
+
+	switch (idx % 6) {
+	case 0:
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+		break;
+	case 1:
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+		break;
+	case 2:
+	case 3:
+		wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+			nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5));
+		break;
+	case 4:
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+		break;
+	case 5:
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+		break;
+	};
+
+	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+		(hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+	&format_attr_event5.attr,
+	&format_attr_xbr_mm_cfg.attr,
+	&format_attr_xbr_match.attr,
+	&format_attr_xbr_mask.attr,
+	&format_attr_qlx_cfg.attr,
+	&format_attr_iperf_cfg.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,		"event=0x0,iperf_cfg=0x80000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,		"event=0x6,iperf_cfg=0x80000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,		"event=0x0,iperf_cfg=0x40000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,		"event=0x6,iperf_cfg=0x40000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi0_date_response,	"event=0x0,iperf_cfg=0xc4"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_date_response,	"event=0x6,iperf_cfg=0xc4"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_rbox_msr_enable_event,
+	.hw_config		= nhmex_rbox_hw_config,
+	.get_constraint		= nhmex_rbox_get_constraint,
+	.put_constraint		= nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+	.name			= "rbox",
+	.num_counters		= 8,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_R_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_R_MSR_PMON_CNT0,
+	.event_mask		= NHMEX_R_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_R_MSR_GLOBAL_CTL,
+	.msr_offset		= NHMEX_R_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 20,
+	.event_descs		= nhmex_uncore_rbox_events,
+	.ops			= &nhmex_uncore_rbox_ops,
+	.format_group		= &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+	&nhmex_uncore_ubox,
+	&nhmex_uncore_cbox,
+	&nhmex_uncore_bbox,
+	&nhmex_uncore_sbox,
+	&nhmex_uncore_mbox,
+	&nhmex_uncore_rbox,
+	&nhmex_uncore_wbox,
+	NULL,
+};
+/* end of Nehalem-EX uncore support */
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hwc->idx = idx;
+	hwc->last_tag = ++box->tags[idx];
+
+	if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+		hwc->event_base = uncore_fixed_ctr(box);
+		hwc->config_base = uncore_fixed_ctl(box);
+		return;
+	}
+
+	hwc->config_base = uncore_event_ctl(box, hwc->idx);
+	hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
+}
+
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+	u64 prev_count, new_count, delta;
+	int shift;
+
+	if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+		shift = 64 - uncore_fixed_ctr_bits(box);
+	else
+		shift = 64 - uncore_perf_ctr_bits(box);
+
+	/* the hrtimer might modify the previous event value */
+again:
+	prev_count = local64_read(&event->hw.prev_count);
+	new_count = uncore_read_counter(box, event);
+	if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+		goto again;
+
+	delta = (new_count << shift) - (prev_count << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP, is broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+	struct intel_uncore_box *box;
+	unsigned long flags;
+	int bit;
+
+	box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+	if (!box->n_active || box->cpu != smp_processor_id())
+		return HRTIMER_NORESTART;
+	/*
+	 * disable local interrupt to prevent uncore_pmu_event_start/stop
+	 * to interrupt the update process
+	 */
+	local_irq_save(flags);
+
+	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+		uncore_perf_event_update(box, box->events[bit]);
+
+	local_irq_restore(flags);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
+	return HRTIMER_RESTART;
+}
+
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+	__hrtimer_start_range_ns(&box->hrtimer,
+			ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
+			HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+	hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+	hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu)
+{
+	struct intel_uncore_box *box;
+	int i, size;
+
+	size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
+
+	box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
+	if (!box)
+		return NULL;
+
+	for (i = 0; i < type->num_shared_regs; i++)
+		raw_spin_lock_init(&box->shared_regs[i].lock);
+
+	uncore_pmu_init_hrtimer(box);
+	atomic_set(&box->refcnt, 1);
+	box->cpu = -1;
+	box->phys_id = -1;
+
+	return box;
+}
+
+static struct intel_uncore_box *
+uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+	static struct intel_uncore_box *box;
+
+	box = *per_cpu_ptr(pmu->box, cpu);
+	if (box)
+		return box;
+
+	raw_spin_lock(&uncore_box_lock);
+	list_for_each_entry(box, &pmu->box_list, list) {
+		if (box->phys_id == topology_physical_package_id(cpu)) {
+			atomic_inc(&box->refcnt);
+			*per_cpu_ptr(pmu->box, cpu) = box;
+			break;
+		}
+	}
+	raw_spin_unlock(&uncore_box_lock);
+
+	return *per_cpu_ptr(pmu->box, cpu);
+}
+
+static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+	return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+	/*
+	 * perf core schedules event on the basis of cpu, uncore events are
+	 * collected by one of the cpus inside a physical package.
+	 */
+	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
+{
+	struct perf_event *event;
+	int n, max_count;
+
+	max_count = box->pmu->type->num_counters;
+	if (box->pmu->type->fixed_ctl)
+		max_count++;
+
+	if (box->n_events >= max_count)
+		return -EINVAL;
+
+	n = box->n_events;
+	box->event_list[n] = leader;
+	n++;
+	if (!dogrp)
+		return n;
+
+	list_for_each_entry(event, &leader->sibling_list, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
+			continue;
+
+		if (n >= max_count)
+			return -EINVAL;
+
+		box->event_list[n] = event;
+		n++;
+	}
+	return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_type *type = box->pmu->type;
+	struct event_constraint *c;
+
+	if (type->ops->get_constraint) {
+		c = type->ops->get_constraint(box, event);
+		if (c)
+			return c;
+	}
+
+	if (event->hw.config == ~0ULL)
+		return &constraint_fixed;
+
+	if (type->constraints) {
+		for_each_event_constraint(c, type->constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	if (box->pmu->type->ops->put_constraint)
+		box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+	struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
+	int i, wmin, wmax, ret = 0;
+	struct hw_perf_event *hwc;
+
+	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+		c = uncore_get_event_constraint(box, box->event_list[i]);
+		constraints[i] = c;
+		wmin = min(wmin, c->weight);
+		wmax = max(wmax, c->weight);
+	}
+
+	/* fastpath, try to reuse previous register */
+	for (i = 0; i < n; i++) {
+		hwc = &box->event_list[i]->hw;
+		c = constraints[i];
+
+		/* never assigned */
+		if (hwc->idx == -1)
+			break;
+
+		/* constraint still honored */
+		if (!test_bit(hwc->idx, c->idxmsk))
+			break;
+
+		/* not already used */
+		if (test_bit(hwc->idx, used_mask))
+			break;
+
+		__set_bit(hwc->idx, used_mask);
+		if (assign)
+			assign[i] = hwc->idx;
+	}
+	/* slow path */
+	if (i != n)
+		ret = perf_assign_events(constraints, n, wmin, wmax, assign);
+
+	if (!assign || ret) {
+		for (i = 0; i < n; i++)
+			uncore_put_event_constraint(box, box->event_list[i]);
+	}
+	return ret ? -EINVAL : 0;
+}
+
+static void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	int idx = event->hw.idx;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+		return;
+
+	event->hw.state = 0;
+	box->events[idx] = event;
+	box->n_active++;
+	__set_bit(idx, box->active_mask);
+
+	local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+	uncore_enable_event(box, event);
+
+	if (box->n_active == 1) {
+		uncore_enable_box(box);
+		uncore_pmu_start_hrtimer(box);
+	}
+}
+
+static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+		uncore_disable_event(box, event);
+		box->n_active--;
+		box->events[hwc->idx] = NULL;
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+
+		if (box->n_active == 0) {
+			uncore_disable_box(box);
+			uncore_pmu_cancel_hrtimer(box);
+		}
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		uncore_perf_event_update(box, event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+	int assign[UNCORE_PMC_IDX_MAX];
+	int i, n, ret;
+
+	if (!box)
+		return -ENODEV;
+
+	ret = n = uncore_collect_events(box, event, false);
+	if (ret < 0)
+		return ret;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	ret = uncore_assign_events(box, assign, n);
+	if (ret)
+		return ret;
+
+	/* save events moving to new counters */
+	for (i = 0; i < box->n_events; i++) {
+		event = box->event_list[i];
+		hwc = &event->hw;
+
+		if (hwc->idx == assign[i] &&
+			hwc->last_tag == box->tags[assign[i]])
+			continue;
+		/*
+		 * Ensure we don't accidentally enable a stopped
+		 * counter simply because we rescheduled.
+		 */
+		if (hwc->state & PERF_HES_STOPPED)
+			hwc->state |= PERF_HES_ARCH;
+
+		uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+	}
+
+	/* reprogram moved events into new counters */
+	for (i = 0; i < n; i++) {
+		event = box->event_list[i];
+		hwc = &event->hw;
+
+		if (hwc->idx != assign[i] ||
+			hwc->last_tag != box->tags[assign[i]])
+			uncore_assign_hw_event(box, event, assign[i]);
+		else if (i < box->n_events)
+			continue;
+
+		if (hwc->state & PERF_HES_ARCH)
+			continue;
+
+		uncore_pmu_event_start(event, 0);
+	}
+	box->n_events = n;
+
+	return 0;
+}
+
+static void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	int i;
+
+	uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+	for (i = 0; i < box->n_events; i++) {
+		if (event == box->event_list[i]) {
+			uncore_put_event_constraint(box, event);
+
+			while (++i < box->n_events)
+				box->event_list[i - 1] = box->event_list[i];
+
+			--box->n_events;
+			break;
+		}
+	}
+
+	event->hw.idx = -1;
+	event->hw.last_tag = ~0ULL;
+}
+
+static void uncore_pmu_event_read(struct perf_event *event)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+				struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct intel_uncore_box *fake_box;
+	int ret = -EINVAL, n;
+
+	fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
+	if (!fake_box)
+		return -ENOMEM;
+
+	fake_box->pmu = pmu;
+	/*
+	 * the event is not yet connected with its
+	 * siblings therefore we must first collect
+	 * existing siblings, then add the new event
+	 * before we can simulate the scheduling
+	 */
+	n = uncore_collect_events(fake_box, leader, true);
+	if (n < 0)
+		goto out;
+
+	fake_box->n_events = n;
+	n = uncore_collect_events(fake_box, event, false);
+	if (n < 0)
+		goto out;
+
+	fake_box->n_events = n;
+
+	ret = uncore_assign_events(fake_box, NULL, n);
+out:
+	kfree(fake_box);
+	return ret;
+}
+
+int uncore_pmu_event_init(struct perf_event *event)
+{
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	struct hw_perf_event *hwc = &event->hw;
+	int ret;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	pmu = uncore_event_to_pmu(event);
+	/* no device found for this pmu */
+	if (pmu->func_id < 0)
+		return -ENOENT;
+
+	/*
+	 * Uncore PMU does measure at all privilege level all the time.
+	 * So it doesn't make sense to specify any exclude bits.
+	 */
+	if (event->attr.exclude_user || event->attr.exclude_kernel ||
+			event->attr.exclude_hv || event->attr.exclude_idle)
+		return -EINVAL;
+
+	/* Sampling not supported yet */
+	if (hwc->sample_period)
+		return -EINVAL;
+
+	/*
+	 * Place all uncore events for a particular physical package
+	 * onto a single cpu
+	 */
+	if (event->cpu < 0)
+		return -EINVAL;
+	box = uncore_pmu_to_box(pmu, event->cpu);
+	if (!box || box->cpu < 0)
+		return -EINVAL;
+	event->cpu = box->cpu;
+
+	event->hw.idx = -1;
+	event->hw.last_tag = ~0ULL;
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+	if (event->attr.config == UNCORE_FIXED_EVENT) {
+		/* no fixed counter */
+		if (!pmu->type->fixed_ctl)
+			return -EINVAL;
+		/*
+		 * if there is only one fixed counter, only the first pmu
+		 * can access the fixed counter
+		 */
+		if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+			return -EINVAL;
+		hwc->config = ~0ULL;
+	} else {
+		hwc->config = event->attr.config & pmu->type->event_mask;
+		if (pmu->type->ops->hw_config) {
+			ret = pmu->type->ops->hw_config(box, event);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (event->group_leader != event)
+		ret = uncore_validate_group(pmu, event);
+	else
+		ret = 0;
+
+	return ret;
+}
+
+static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+	int ret;
+
+	pmu->pmu = (struct pmu) {
+		.attr_groups	= pmu->type->attr_groups,
+		.task_ctx_nr	= perf_invalid_context,
+		.event_init	= uncore_pmu_event_init,
+		.add		= uncore_pmu_event_add,
+		.del		= uncore_pmu_event_del,
+		.start		= uncore_pmu_event_start,
+		.stop		= uncore_pmu_event_stop,
+		.read		= uncore_pmu_event_read,
+	};
+
+	if (pmu->type->num_boxes == 1) {
+		if (strlen(pmu->type->name) > 0)
+			sprintf(pmu->name, "uncore_%s", pmu->type->name);
+		else
+			sprintf(pmu->name, "uncore");
+	} else {
+		sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+			pmu->pmu_idx);
+	}
+
+	ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+	return ret;
+}
+
+static void __init uncore_type_exit(struct intel_uncore_type *type)
+{
+	int i;
+
+	for (i = 0; i < type->num_boxes; i++)
+		free_percpu(type->pmus[i].box);
+	kfree(type->pmus);
+	type->pmus = NULL;
+	kfree(type->attr_groups[1]);
+	type->attr_groups[1] = NULL;
+}
+
+static void __init uncore_types_exit(struct intel_uncore_type **types)
+{
+	int i;
+	for (i = 0; types[i]; i++)
+		uncore_type_exit(types[i]);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type)
+{
+	struct intel_uncore_pmu *pmus;
+	struct attribute_group *events_group;
+	struct attribute **attrs;
+	int i, j;
+
+	pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
+	if (!pmus)
+		return -ENOMEM;
+
+	type->unconstrainted = (struct event_constraint)
+		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+				0, type->num_counters, 0);
+
+	for (i = 0; i < type->num_boxes; i++) {
+		pmus[i].func_id = -1;
+		pmus[i].pmu_idx = i;
+		pmus[i].type = type;
+		INIT_LIST_HEAD(&pmus[i].box_list);
+		pmus[i].box = alloc_percpu(struct intel_uncore_box *);
+		if (!pmus[i].box)
+			goto fail;
+	}
+
+	if (type->event_descs) {
+		i = 0;
+		while (type->event_descs[i].attr.attr.name)
+			i++;
+
+		events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
+					sizeof(*events_group), GFP_KERNEL);
+		if (!events_group)
+			goto fail;
+
+		attrs = (struct attribute **)(events_group + 1);
+		events_group->name = "events";
+		events_group->attrs = attrs;
+
+		for (j = 0; j < i; j++)
+			attrs[j] = &type->event_descs[j].attr.attr;
+
+		type->attr_groups[1] = events_group;
+	}
+
+	type->pmus = pmus;
+	return 0;
+fail:
+	uncore_type_exit(type);
+	return -ENOMEM;
+}
+
+static int __init uncore_types_init(struct intel_uncore_type **types)
+{
+	int i, ret;
+
+	for (i = 0; types[i]; i++) {
+		ret = uncore_type_init(types[i]);
+		if (ret)
+			goto fail;
+	}
+	return 0;
+fail:
+	while (--i >= 0)
+		uncore_type_exit(types[i]);
+	return ret;
+}
+
+static struct pci_driver *uncore_pci_driver;
+static bool pcidrv_registered;
+
+/*
+ * add a pci uncore device
+ */
+static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
+{
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	int i, phys_id;
+
+	phys_id = pcibus_to_physid[pdev->bus->number];
+	if (phys_id < 0)
+		return -ENODEV;
+
+	box = uncore_alloc_box(type, 0);
+	if (!box)
+		return -ENOMEM;
+
+	/*
+	 * for performance monitoring unit with multiple boxes,
+	 * each box has a different function id.
+	 */
+	for (i = 0; i < type->num_boxes; i++) {
+		pmu = &type->pmus[i];
+		if (pmu->func_id == pdev->devfn)
+			break;
+		if (pmu->func_id < 0) {
+			pmu->func_id = pdev->devfn;
+			break;
+		}
+		pmu = NULL;
+	}
+
+	if (!pmu) {
+		kfree(box);
+		return -EINVAL;
+	}
+
+	box->phys_id = phys_id;
+	box->pci_dev = pdev;
+	box->pmu = pmu;
+	uncore_box_init(box);
+	pci_set_drvdata(pdev, box);
+
+	raw_spin_lock(&uncore_box_lock);
+	list_add_tail(&box->list, &pmu->box_list);
+	raw_spin_unlock(&uncore_box_lock);
+
+	return 0;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+	struct intel_uncore_box *box = pci_get_drvdata(pdev);
+	struct intel_uncore_pmu *pmu = box->pmu;
+	int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+
+	if (WARN_ON_ONCE(phys_id != box->phys_id))
+		return;
+
+	raw_spin_lock(&uncore_box_lock);
+	list_del(&box->list);
+	raw_spin_unlock(&uncore_box_lock);
+
+	for_each_possible_cpu(cpu) {
+		if (*per_cpu_ptr(pmu->box, cpu) == box) {
+			*per_cpu_ptr(pmu->box, cpu) = NULL;
+			atomic_dec(&box->refcnt);
+		}
+	}
+
+	WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
+	kfree(box);
+}
+
+static int __devinit uncore_pci_probe(struct pci_dev *pdev,
+				const struct pci_device_id *id)
+{
+	struct intel_uncore_type *type;
+
+	type = (struct intel_uncore_type *)id->driver_data;
+
+	return uncore_pci_add(type, pdev);
+}
+
+static int __init uncore_pci_init(void)
+{
+	int ret;
+
+	switch (boot_cpu_data.x86_model) {
+	case 45: /* Sandy Bridge-EP */
+		pci_uncores = snbep_pci_uncores;
+		uncore_pci_driver = &snbep_uncore_pci_driver;
+		snbep_pci2phy_map_init();
+		break;
+	default:
+		return 0;
+	}
+
+	ret = uncore_types_init(pci_uncores);
+	if (ret)
+		return ret;
+
+	uncore_pci_driver->probe = uncore_pci_probe;
+	uncore_pci_driver->remove = uncore_pci_remove;
+
+	ret = pci_register_driver(uncore_pci_driver);
+	if (ret == 0)
+		pcidrv_registered = true;
+	else
+		uncore_types_exit(pci_uncores);
+
+	return ret;
+}
+
+static void __init uncore_pci_exit(void)
+{
+	if (pcidrv_registered) {
+		pcidrv_registered = false;
+		pci_unregister_driver(uncore_pci_driver);
+		uncore_types_exit(pci_uncores);
+	}
+}
+
+static void __cpuinit uncore_cpu_dying(int cpu)
+{
+	struct intel_uncore_type *type;
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	int i, j;
+
+	for (i = 0; msr_uncores[i]; i++) {
+		type = msr_uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			box = *per_cpu_ptr(pmu->box, cpu);
+			*per_cpu_ptr(pmu->box, cpu) = NULL;
+			if (box && atomic_dec_and_test(&box->refcnt))
+				kfree(box);
+		}
+	}
+}
+
+static int __cpuinit uncore_cpu_starting(int cpu)
+{
+	struct intel_uncore_type *type;
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box, *exist;
+	int i, j, k, phys_id;
+
+	phys_id = topology_physical_package_id(cpu);
+
+	for (i = 0; msr_uncores[i]; i++) {
+		type = msr_uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			box = *per_cpu_ptr(pmu->box, cpu);
+			/* called by uncore_cpu_init? */
+			if (box && box->phys_id >= 0) {
+				uncore_box_init(box);
+				continue;
+			}
+
+			for_each_online_cpu(k) {
+				exist = *per_cpu_ptr(pmu->box, k);
+				if (exist && exist->phys_id == phys_id) {
+					atomic_inc(&exist->refcnt);
+					*per_cpu_ptr(pmu->box, cpu) = exist;
+					kfree(box);
+					box = NULL;
+					break;
+				}
+			}
+
+			if (box) {
+				box->phys_id = phys_id;
+				uncore_box_init(box);
+			}
+		}
+	}
+	return 0;
+}
+
+static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
+{
+	struct intel_uncore_type *type;
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	int i, j;
+
+	for (i = 0; msr_uncores[i]; i++) {
+		type = msr_uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			if (pmu->func_id < 0)
+				pmu->func_id = j;
+
+			box = uncore_alloc_box(type, cpu);
+			if (!box)
+				return -ENOMEM;
+
+			box->pmu = pmu;
+			box->phys_id = phys_id;
+			*per_cpu_ptr(pmu->box, cpu) = box;
+		}
+	}
+	return 0;
+}
+
+static void __cpuinit
+uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
+{
+	struct intel_uncore_type *type;
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	int i, j;
+
+	for (i = 0; uncores[i]; i++) {
+		type = uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			if (old_cpu < 0)
+				box = uncore_pmu_to_box(pmu, new_cpu);
+			else
+				box = uncore_pmu_to_box(pmu, old_cpu);
+			if (!box)
+				continue;
+
+			if (old_cpu < 0) {
+				WARN_ON_ONCE(box->cpu != -1);
+				box->cpu = new_cpu;
+				continue;
+			}
+
+			WARN_ON_ONCE(box->cpu != old_cpu);
+			if (new_cpu >= 0) {
+				uncore_pmu_cancel_hrtimer(box);
+				perf_pmu_migrate_context(&pmu->pmu,
+						old_cpu, new_cpu);
+				box->cpu = new_cpu;
+			} else {
+				box->cpu = -1;
+			}
+		}
+	}
+}
+
+static void __cpuinit uncore_event_exit_cpu(int cpu)
+{
+	int i, phys_id, target;
+
+	/* if exiting cpu is used for collecting uncore events */
+	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+		return;
+
+	/* find a new cpu to collect uncore events */
+	phys_id = topology_physical_package_id(cpu);
+	target = -1;
+	for_each_online_cpu(i) {
+		if (i == cpu)
+			continue;
+		if (phys_id == topology_physical_package_id(i)) {
+			target = i;
+			break;
+		}
+	}
+
+	/* migrate uncore events to the new cpu */
+	if (target >= 0)
+		cpumask_set_cpu(target, &uncore_cpu_mask);
+
+	uncore_change_context(msr_uncores, cpu, target);
+	uncore_change_context(pci_uncores, cpu, target);
+}
+
+static void __cpuinit uncore_event_init_cpu(int cpu)
+{
+	int i, phys_id;
+
+	phys_id = topology_physical_package_id(cpu);
+	for_each_cpu(i, &uncore_cpu_mask) {
+		if (phys_id == topology_physical_package_id(i))
+			return;
+	}
+
+	cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+	uncore_change_context(msr_uncores, -1, cpu);
+	uncore_change_context(pci_uncores, -1, cpu);
+}
+
+static int
+ __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	/* allocate/free data structure for uncore box */
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		uncore_cpu_prepare(cpu, -1);
+		break;
+	case CPU_STARTING:
+		uncore_cpu_starting(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DYING:
+		uncore_cpu_dying(cpu);
+		break;
+	default:
+		break;
+	}
+
+	/* select the cpu that collects uncore events */
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_FAILED:
+	case CPU_STARTING:
+		uncore_event_init_cpu(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		uncore_event_exit_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_cpu_nb __cpuinitdata = {
+	.notifier_call	= uncore_cpu_notifier,
+	/*
+	 * to migrate uncore events, our notifier should be executed
+	 * before perf core's notifier.
+	 */
+	.priority	= CPU_PRI_PERF + 1,
+};
+
+static void __init uncore_cpu_setup(void *dummy)
+{
+	uncore_cpu_starting(smp_processor_id());
+}
+
+static int __init uncore_cpu_init(void)
+{
+	int ret, cpu, max_cores;
+
+	max_cores = boot_cpu_data.x86_max_cores;
+	switch (boot_cpu_data.x86_model) {
+	case 26: /* Nehalem */
+	case 30:
+	case 37: /* Westmere */
+	case 44:
+		msr_uncores = nhm_msr_uncores;
+		break;
+	case 42: /* Sandy Bridge */
+		if (snb_uncore_cbox.num_boxes > max_cores)
+			snb_uncore_cbox.num_boxes = max_cores;
+		msr_uncores = snb_msr_uncores;
+		break;
+	case 45: /* Sandy Birdge-EP */
+		if (snbep_uncore_cbox.num_boxes > max_cores)
+			snbep_uncore_cbox.num_boxes = max_cores;
+		msr_uncores = snbep_msr_uncores;
+		break;
+	case 46: /* Nehalem-EX */
+		uncore_nhmex = true;
+	case 47: /* Westmere-EX aka. Xeon E7 */
+		if (!uncore_nhmex)
+			nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+		if (nhmex_uncore_cbox.num_boxes > max_cores)
+			nhmex_uncore_cbox.num_boxes = max_cores;
+		msr_uncores = nhmex_msr_uncores;
+		break;
+	default:
+		return 0;
+	}
+
+	ret = uncore_types_init(msr_uncores);
+	if (ret)
+		return ret;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu) {
+		int i, phys_id = topology_physical_package_id(cpu);
+
+		for_each_cpu(i, &uncore_cpu_mask) {
+			if (phys_id == topology_physical_package_id(i)) {
+				phys_id = -1;
+				break;
+			}
+		}
+		if (phys_id < 0)
+			continue;
+
+		uncore_cpu_prepare(cpu, phys_id);
+		uncore_event_init_cpu(cpu);
+	}
+	on_each_cpu(uncore_cpu_setup, NULL, 1);
+
+	register_cpu_notifier(&uncore_cpu_nb);
+
+	put_online_cpus();
+
+	return 0;
+}
+
+static int __init uncore_pmus_register(void)
+{
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_type *type;
+	int i, j;
+
+	for (i = 0; msr_uncores[i]; i++) {
+		type = msr_uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			uncore_pmu_register(pmu);
+		}
+	}
+
+	for (i = 0; pci_uncores[i]; i++) {
+		type = pci_uncores[i];
+		for (j = 0; j < type->num_boxes; j++) {
+			pmu = &type->pmus[j];
+			uncore_pmu_register(pmu);
+		}
+	}
+
+	return 0;
+}
+
+static int __init intel_uncore_init(void)
+{
+	int ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return -ENODEV;
+
+	ret = uncore_pci_init();
+	if (ret)
+		goto fail;
+	ret = uncore_cpu_init();
+	if (ret) {
+		uncore_pci_exit();
+		goto fail;
+	}
+
+	uncore_pmus_register();
+	return 0;
+fail:
+	return ret;
+}
+device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 000000000000..5b81c1856aac
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,619 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+#include "perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN		32
+#define UNCORE_PMU_HRTIMER_INTERVAL	(60LL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT		0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC	8
+#define UNCORE_PMC_IDX_FIXED		UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_MAX		(UNCORE_PMC_IDX_FIXED + 1)
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK			0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET			(1 << 18)
+#define SNB_UNC_CTL_EN				(1 << 22)
+#define SNB_UNC_CTL_INVERT			(1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK			0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK			0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN		(1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
+						 SNB_UNC_CTL_UMASK_MASK | \
+						 SNB_UNC_CTL_EDGE_DET | \
+						 SNB_UNC_CTL_INVERT | \
+						 SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
+						 SNB_UNC_CTL_UMASK_MASK | \
+						 SNB_UNC_CTL_EDGE_DET | \
+						 SNB_UNC_CTL_INVERT | \
+						 NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
+#define SNB_UNC_FIXED_CTR_CTRL                  0x394
+#define SNB_UNC_FIXED_CTR                       0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
+#define SNB_UNC_CBO_0_PER_CTR0                  0x706
+#define SNB_UNC_CBO_MSR_OFFSET                  0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
+#define NHM_UNC_FIXED_CTR                       0x394
+#define NHM_UNC_FIXED_CTR_CTRL                  0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0                     0x3c0
+#define NHM_UNC_UNCORE_PMC0                     0x3b0
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL	(1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS	(1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ		(1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN	(1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
+					 SNBEP_PMON_BOX_CTL_RST_CTRS | \
+					 SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK	0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK	0x0000ff00
+#define SNBEP_PMON_CTL_RST		(1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET		(1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT	(1 << 21)	/* only for QPI */
+#define SNBEP_PMON_CTL_EN		(1 << 22)
+#define SNBEP_PMON_CTL_INVERT		(1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK	0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK	(SNBEP_PMON_CTL_EV_SEL_MASK | \
+					 SNBEP_PMON_CTL_UMASK_MASK | \
+					 SNBEP_PMON_CTL_EDGE_DET | \
+					 SNBEP_PMON_CTL_INVERT | \
+					 SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK		0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK		\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PMON_CTL_UMASK_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN		(1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK	(SNBEP_PMON_RAW_EVENT_MASK | \
+						 SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK	0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK	0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT	(1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET	(1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_RAW_EVENT_MASK | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL			0xf4
+#define SNBEP_PCI_PMON_CTL0			0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0			0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0	0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1	0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH	0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL		0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR		0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0		0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1		0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0		0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1		0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0			0xc16
+#define SNBEP_U_MSR_PMON_CTL0			0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL		0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR		0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0			0xd16
+#define SNBEP_C0_MSR_PMON_CTL0			0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL		0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER		0xd14
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK	0xfffffc1f
+#define SNBEP_CBO_MSR_OFFSET			0x20
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0			0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0			0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL		0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER		0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK	0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR		0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR		0x3fd
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK	0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK	0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0		(1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET		(1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN		(1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22		(1 << 22)
+#define NHMEX_PMON_CTL_INVERT		(1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK	0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK	(NHMEX_PMON_CTL_EV_SEL_MASK | \
+					 NHMEX_PMON_CTL_UMASK_MASK | \
+					 NHMEX_PMON_CTL_EDGE_DET | \
+					 NHMEX_PMON_CTL_INVERT | \
+					 NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL		0xc00
+#define NHMEX_U_MSR_PMON_CTR			0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL			0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN			(1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL	0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL		(1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL		(1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL		(1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK		\
+		(NHMEX_PMON_CTL_EV_SEL_MASK |	\
+		 NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL		0xd00
+#define NHMEX_C0_MSR_PMON_CTR0			0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0		0xd10
+#define NHMEX_C_MSR_OFFSET			0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL		0xc20
+#define NHMEX_B0_MSR_PMON_CTR0			0xc31
+#define NHMEX_B0_MSR_PMON_CTL0			0xc30
+#define NHMEX_B_MSR_OFFSET			0x40
+#define NHMEX_B0_MSR_MATCH			0xe45
+#define NHMEX_B0_MSR_MASK			0xe46
+#define NHMEX_B1_MSR_MATCH			0xe4d
+#define NHMEX_B1_MSR_MASK			0xe4e
+
+#define NHMEX_B_PMON_CTL_EN			(1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT		1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK		\
+		(0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT		6
+#define NHMEX_B_PMON_CTR_MASK		\
+		(0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK		\
+		(NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+		 NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL		0xc40
+#define NHMEX_S0_MSR_PMON_CTR0			0xc51
+#define NHMEX_S0_MSR_PMON_CTL0			0xc50
+#define NHMEX_S_MSR_OFFSET			0x80
+#define NHMEX_S0_MSR_MM_CFG			0xe48
+#define NHMEX_S0_MSR_MATCH			0xe49
+#define NHMEX_S0_MSR_MASK			0xe4a
+#define NHMEX_S1_MSR_MM_CFG			0xe58
+#define NHMEX_S1_MSR_MATCH			0xe59
+#define NHMEX_S1_MSR_MASK			0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN			(0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV		0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL			0xca0
+#define NHMEX_M0_MSR_PMU_DSP			0xca5
+#define NHMEX_M0_MSR_PMU_ISS			0xca6
+#define NHMEX_M0_MSR_PMU_MAP			0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR		0xca8
+#define NHMEX_M0_MSR_PMU_PGT			0xca9
+#define NHMEX_M0_MSR_PMU_PLD			0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC		0xcab
+#define NHMEX_M0_MSR_PMU_CTL0			0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0			0xcb1
+#define NHMEX_M_MSR_OFFSET			0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG			0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG			0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN			(1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK		0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK		0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT		34
+
+#define NHMEX_M_PMON_CTL_EN			(1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN			(1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT	2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK	\
+	(0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT	4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK	\
+	(0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE		(1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE		(1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT		9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK		\
+	(0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT	19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK	\
+	(0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK			\
+		(NHMEX_M_PMON_CTL_COUNT_MODE_MASK |	\
+		 NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |	\
+		 NHMEX_M_PMON_CTL_WRAP_MODE |		\
+		 NHMEX_M_PMON_CTL_FLAG_MODE |		\
+		 NHMEX_M_PMON_CTL_INC_SEL_MASK |	\
+		 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7 << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7 << (12 + 3 * (n)))
+
+/*
+ * use the 9~13 bits to select event If the 7th bit is not set,
+ * otherwise use the 19~21 bits to select event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+				NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+			   NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+				NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+		EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+				MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+		EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+				MBOX_SET_FLAG_SEL_MASK, \
+				(u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL			0xe00
+#define NHMEX_R_MSR_PMON_CTL0			0xe10
+#define NHMEX_R_MSR_PMON_CNT0			0xe11
+#define NHMEX_R_MSR_OFFSET			0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n)		\
+		((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)		(0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)		(0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)		\
+		(((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)	\
+		(0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)	\
+		(0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN			(1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT		1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK		\
+		(0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN			(1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK		NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL			0xc80
+#define NHMEX_W_MSR_PMON_CNT0			0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0		0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR		0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL		0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN		(1ULL << 31)
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+
+struct intel_uncore_type {
+	const char *name;
+	int num_counters;
+	int num_boxes;
+	int perf_ctr_bits;
+	int fixed_ctr_bits;
+	unsigned perf_ctr;
+	unsigned event_ctl;
+	unsigned event_mask;
+	unsigned fixed_ctr;
+	unsigned fixed_ctl;
+	unsigned box_ctl;
+	unsigned msr_offset;
+	unsigned num_shared_regs:8;
+	unsigned single_fixed:1;
+	unsigned pair_ctr_ctl:1;
+	unsigned *msr_offsets;
+	struct event_constraint unconstrainted;
+	struct event_constraint *constraints;
+	struct intel_uncore_pmu *pmus;
+	struct intel_uncore_ops *ops;
+	struct uncore_event_desc *event_descs;
+	const struct attribute_group *attr_groups[3];
+};
+
+#define format_group attr_groups[0]
+
+struct intel_uncore_ops {
+	void (*init_box)(struct intel_uncore_box *);
+	void (*disable_box)(struct intel_uncore_box *);
+	void (*enable_box)(struct intel_uncore_box *);
+	void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+	void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+	u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+	int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+	struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+						   struct perf_event *);
+	void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+	struct pmu pmu;
+	char name[UNCORE_PMU_NAME_LEN];
+	int pmu_idx;
+	int func_id;
+	struct intel_uncore_type *type;
+	struct intel_uncore_box ** __percpu box;
+	struct list_head box_list;
+};
+
+struct intel_uncore_extra_reg {
+	raw_spinlock_t lock;
+	u64 config, config1, config2;
+	atomic_t ref;
+};
+
+struct intel_uncore_box {
+	int phys_id;
+	int n_active;	/* number of active events */
+	int n_events;
+	int cpu;	/* cpu to collect events */
+	unsigned long flags;
+	atomic_t refcnt;
+	struct perf_event *events[UNCORE_PMC_IDX_MAX];
+	struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+	u64 tags[UNCORE_PMC_IDX_MAX];
+	struct pci_dev *pci_dev;
+	struct intel_uncore_pmu *pmu;
+	struct hrtimer hrtimer;
+	struct list_head list;
+	struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED	0
+
+struct uncore_event_desc {
+	struct kobj_attribute attr;
+	const char *config;
+};
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config)			\
+{								\
+	.attr	= __ATTR(_name, 0444, uncore_event_show, NULL),	\
+	.config	= _config,					\
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)			\
+static ssize_t __uncore_##_var##_show(struct kobject *kobj,		\
+				struct kobj_attribute *attr,		\
+				char *page)				\
+{									\
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
+	return sprintf(page, _format "\n");				\
+}									\
+static struct kobj_attribute format_attr_##_var =			\
+	__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+
+static ssize_t uncore_event_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	struct uncore_event_desc *event =
+		container_of(attr, struct uncore_event_desc, attr);
+	return sprintf(buf, "%s", event->config);
+}
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+	return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+	return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+	return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+	struct intel_uncore_pmu *pmu = box->pmu;
+	return pmu->type->msr_offsets ?
+		pmu->type->msr_offsets[pmu->pmu_idx] :
+		pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+	if (!box->pmu->type->box_ctl)
+		return 0;
+	return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+	if (!box->pmu->type->fixed_ctl)
+		return 0;
+	return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+	return box->pmu->type->event_ctl +
+		(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+		uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+	return box->pmu->type->perf_ctr +
+		(box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+		uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+	if (box->pci_dev)
+		return uncore_pci_fixed_ctl(box);
+	else
+		return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+	if (box->pci_dev)
+		return uncore_pci_fixed_ctr(box);
+	else
+		return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+	if (box->pci_dev)
+		return uncore_pci_event_ctl(box, idx);
+	else
+		return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+	if (box->pci_dev)
+		return uncore_pci_perf_ctr(box, idx);
+	else
+		return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+	return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+	return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+	return box->pmu->type->num_counters;
+}
+
+static inline void uncore_disable_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->type->ops->disable_box)
+		box->pmu->type->ops->disable_box(box);
+}
+
+static inline void uncore_enable_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->type->ops->enable_box)
+		box->pmu->type->ops->enable_box(box);
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+				struct perf_event *event)
+{
+	box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+				struct perf_event *event)
+{
+	box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+				struct perf_event *event)
+{
+	return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+	if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+		if (box->pmu->type->ops->init_box)
+			box->pmu->type->ops->init_box(box);
+	}
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+	return (box->phys_id < 0);
+}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 47124a73dd73..92c7e39a079f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)
 	 * So at moment let leave metrics turned on forever -- it's
 	 * ok for now but need to be revisited!
 	 *
-	 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
-	 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+	 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0);
+	 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
 	 */
 }
 
@@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
 	 * asserted again and again
 	 */
-	(void)checking_wrmsrl(hwc->config_base,
+	(void)wrmsrl_safe(hwc->config_base,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
 			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
@@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config)
 
 	bind = &p4_pebs_bind_map[idx];
 
-	(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE,	(u64)bind->metric_pebs);
-	(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT,	(u64)bind->metric_vert);
+	(void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE,	(u64)bind->metric_pebs);
+	(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,	(u64)bind->metric_vert);
 }
 
 static void p4_pmu_enable_event(struct perf_event *event)
@@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	 */
 	p4_pmu_enable_pebs(hwc->config);
 
-	(void)checking_wrmsrl(escr_addr, escr_conf);
-	(void)checking_wrmsrl(hwc->config_base,
+	(void)wrmsrl_safe(escr_addr, escr_conf);
+	(void)wrmsrl_safe(hwc->config_base,
 				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
@@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void)
 	unsigned int low, high;
 
 	/* If we get stripped -- indexing fails */
-	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
 
 	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
 	if (!(low & (1 << 7))) {
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 32bcfc7dd230..e4dd0f7a0453 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base, val);
+	(void)wrmsrl_safe(hwc->config_base, val);
 }
 
 static void p6_pmu_enable_event(struct perf_event *event)
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	if (cpuc->enabled)
 		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base, val);
+	(void)wrmsrl_safe(hwc->config_base, val);
 }
 
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-	struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-	unsigned long ratio, flags;
-
-	local_irq_save(flags);
-	get_aperfmperf(&val);
-	local_irq_restore(flags);
-
-	ratio = calc_aperfmperf_ratio(old, &val);
-	*old = val;
-
-	return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-	/*
-	 * do aperf/mperf on the cpu level because it includes things
-	 * like turbo mode, which are relevant to full cores.
-	 */
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		return scale_aperfmperf();
-
-	/*
-	 * maybe have something cpufreq here
-	 */
-
-	return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-	/*
-	 * aperf/mperf already includes the smt gain
-	 */
-	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-		return SCHED_LOAD_SCALE;
-
-	return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 571246d81edf..ae42418bc50f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
 
 void printk_address(unsigned long address, int reliable)
 {
-	printk(" [<%p>] %s%pB\n", (void *) address,
-			reliable ? "" : "? ", (void *) address);
+	pr_cont(" [<%p>] %s%pB\n",
+		(void *)address, reliable ? "" : "? ", (void *)address);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
 		return 1;
 
+	print_modules();
 	show_regs(regs);
 #ifdef CONFIG_X86_32
 	if (user_mode_vm(regs)) {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e0b1d783daab..1038a417ea53 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (kstack_end(stack))
 			break;
 		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %08lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %08lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs)
 {
 	int i;
 
-	print_modules();
 	__show_regs(regs, !user_mode_vm(regs));
 
-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
-		TASK_COMM_LEN, current->comm, task_pid_nr(current),
-		current_thread_info(), current, task_thread_info(current));
+	pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+		 TASK_COMM_LEN, current->comm, task_pid_nr(current),
+		 current_thread_info(), current, task_thread_info(current));
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
-		printk(KERN_EMERG "Stack:\n");
+		pr_emerg("Stack:\n");
 		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
-		printk(KERN_EMERG "Code: ");
+		pr_emerg("Code:");
 
 		ip = (u8 *)regs->ip - code_prologue;
 		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs)
 		for (i = 0; i < code_len; i++, ip++) {
 			if (ip < (u8 *)PAGE_OFFSET ||
 					probe_kernel_address(ip, c)) {
-				printk(KERN_CONT " Bad EIP value.");
+				pr_cont("  Bad EIP value.");
 				break;
 			}
 			if (ip == (u8 *)regs->ip)
-				printk(KERN_CONT "<%02x> ", c);
+				pr_cont(" <%02x>", c);
 			else
-				printk(KERN_CONT "%02x ", c);
+				pr_cont(" %02x", c);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 791b76122aa8..b653675d5288 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack >= irq_stack && stack <= irq_stack_end) {
 			if (stack == irq_stack_end) {
 				stack = (unsigned long *) (irq_stack_end[-1]);
-				printk(KERN_CONT " <EOI> ");
+				pr_cont(" <EOI> ");
 			}
 		} else {
 		if (((long) stack & (THREAD_SIZE-1)) == 0)
 			break;
 		}
 		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			printk(KERN_CONT "\n");
-		printk(KERN_CONT " %016lx", *stack++);
+			pr_cont("\n");
+		pr_cont(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
 	preempt_enable();
 
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
@@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs)
 
 	sp = regs->sp;
 	printk("CPU %d ", cpu);
-	print_modules();
 	__show_regs(regs, 1);
-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
-		cur->comm, cur->pid, task_thread_info(cur), cur);
+	printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n",
+	       cur->comm, cur->pid, task_thread_info(cur), cur);
 
 	/*
 	 * When in-kernel, we also print out the stack and code at the
@@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs)
 		for (i = 0; i < code_len; i++, ip++) {
 			if (ip < (u8 *)PAGE_OFFSET ||
 					probe_kernel_address(ip, c)) {
-				printk(KERN_CONT " Bad RIP value.");
+				pr_cont(" Bad RIP value.");
 				break;
 			}
 			if (ip == (u8 *)regs->ip)
-				printk(KERN_CONT "<%02x> ", c);
+				pr_cont("<%02x> ", c);
 			else
-				printk(KERN_CONT "%02x ", c);
+				pr_cont("%02x ", c);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 }
 
 int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 41857970517f..ed858e9e9a74 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -944,7 +944,7 @@ void __init e820_reserve_resources(void)
 	for (i = 0; i < e820_saved.nr_map; i++) {
 		struct e820entry *entry = &e820_saved.map[i];
 		firmware_map_add_early(entry->addr,
-			entry->addr + entry->size - 1,
+			entry->addr + entry->size,
 			e820_type_to_string(entry->type));
 	}
 }
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7d65133b51be..69babd8c834f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
 
-#ifdef CONFIG_SMP
-	ALIGN
-	INTR_FRAME
-.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if NUM_INVALIDATE_TLB_VECTORS > \idx
-ENTRY(invalidate_interrupt\idx)
-	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
-	jmp .Lcommon_invalidate_interrupt0
-	CFI_ADJUST_CFA_OFFSET -8
-END(invalidate_interrupt\idx)
-.endif
-.endr
-	CFI_ENDPROC
-apicinterrupt INVALIDATE_TLB_VECTOR_START, \
-	invalidate_interrupt0, smp_invalidate_interrupt
-#endif
-
 apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
@@ -1758,10 +1740,30 @@ end_repeat_nmi:
 	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
+
+	/*
+	 * Save off the CR2 register. If we take a page fault in the NMI then
+	 * it could corrupt the CR2 value. If the NMI preempts a page fault
+	 * handler before it was able to read the CR2 register, and then the
+	 * NMI itself takes a page fault, the page fault that was preempted
+	 * will read the information from the NMI page fault and not the
+	 * origin fault. Save it off and restore it if it changes.
+	 * Use the r12 callee-saved register.
+	 */
+	movq %cr2, %r12
+
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
+
+	/* Did the NMI take a page fault? Restore cr2 if it did */
+	movq %cr2, %rcx
+	cmpq %rcx, %r12
+	je 1f
+	movq %r12, %cr2
+1:
+	
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
 nmi_swapgs:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3dafc6003b7c..d44f7829968e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -270,7 +270,7 @@ void fixup_irqs(void)
 
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			break_affinity = 1;
-			affinity = cpu_all_mask;
+			affinity = cpu_online_mask;
 		}
 
 		chip = irq_data_get_irq_chip(data);
@@ -294,9 +294,9 @@ void fixup_irqs(void)
 		raw_spin_unlock(&desc->lock);
 
 		if (break_affinity && set_affinity)
-			printk("Broke affinity for irq %i\n", irq);
+			pr_notice("Broke affinity for irq %i\n", irq);
 		else if (!set_affinity)
-			printk("Cannot set affinity for irq %i\n", irq);
+			pr_notice("Cannot set affinity for irq %i\n", irq);
 	}
 
 	/*
@@ -328,6 +328,7 @@ void fixup_irqs(void)
 				chip->irq_retrigger(data);
 			raw_spin_unlock(&desc->lock);
 		}
+		__this_cpu_write(vector_irq[vector], -1);
 	}
 }
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 252981afd6c4..6e03b0d69138 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -171,79 +171,6 @@ static void __init smp_intr_init(void)
 	 */
 	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPIs for invalidation */
-#define ALLOC_INVTLB_VEC(NR) \
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
-		invalidate_interrupt##NR)
-
-	switch (NUM_INVALIDATE_TLB_VECTORS) {
-	default:
-		ALLOC_INVTLB_VEC(31);
-	case 31:
-		ALLOC_INVTLB_VEC(30);
-	case 30:
-		ALLOC_INVTLB_VEC(29);
-	case 29:
-		ALLOC_INVTLB_VEC(28);
-	case 28:
-		ALLOC_INVTLB_VEC(27);
-	case 27:
-		ALLOC_INVTLB_VEC(26);
-	case 26:
-		ALLOC_INVTLB_VEC(25);
-	case 25:
-		ALLOC_INVTLB_VEC(24);
-	case 24:
-		ALLOC_INVTLB_VEC(23);
-	case 23:
-		ALLOC_INVTLB_VEC(22);
-	case 22:
-		ALLOC_INVTLB_VEC(21);
-	case 21:
-		ALLOC_INVTLB_VEC(20);
-	case 20:
-		ALLOC_INVTLB_VEC(19);
-	case 19:
-		ALLOC_INVTLB_VEC(18);
-	case 18:
-		ALLOC_INVTLB_VEC(17);
-	case 17:
-		ALLOC_INVTLB_VEC(16);
-	case 16:
-		ALLOC_INVTLB_VEC(15);
-	case 15:
-		ALLOC_INVTLB_VEC(14);
-	case 14:
-		ALLOC_INVTLB_VEC(13);
-	case 13:
-		ALLOC_INVTLB_VEC(12);
-	case 12:
-		ALLOC_INVTLB_VEC(11);
-	case 11:
-		ALLOC_INVTLB_VEC(10);
-	case 10:
-		ALLOC_INVTLB_VEC(9);
-	case 9:
-		ALLOC_INVTLB_VEC(8);
-	case 8:
-		ALLOC_INVTLB_VEC(7);
-	case 7:
-		ALLOC_INVTLB_VEC(6);
-	case 6:
-		ALLOC_INVTLB_VEC(5);
-	case 5:
-		ALLOC_INVTLB_VEC(4);
-	case 4:
-		ALLOC_INVTLB_VEC(3);
-	case 3:
-		ALLOC_INVTLB_VEC(2);
-	case 2:
-		ALLOC_INVTLB_VEC(1);
-	case 1:
-		ALLOC_INVTLB_VEC(0);
-		break;
-	}
-
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 1d5d31ea686b..dc1404bf8e4b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -107,7 +107,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 {
 	struct setup_data_node *node;
 	struct setup_data *data;
-	int error = -ENOMEM;
+	int error;
 	struct dentry *d;
 	struct page *pg;
 	u64 pa_data;
@@ -121,8 +121,10 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 
 	while (pa_data) {
 		node = kmalloc(sizeof(*node), GFP_KERNEL);
-		if (!node)
+		if (!node) {
+			error = -ENOMEM;
 			goto err_dir;
+		}
 
 		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
 		if (PageHighMem(pg)) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e554e5ad2fe8..c1d61ee4b4f1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,6 +39,9 @@
 #include <asm/desc.h>
 #include <asm/tlbflush.h>
 #include <asm/idle.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
 
 static int kvmapf = 1;
 
@@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)
 		cpu, __pa(st));
 }
 
+static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+{
+	/**
+	 * This relies on __test_and_clear_bit to modify the memory
+	 * in a way that is atomic with respect to the local CPU.
+	 * The hypervisor only accesses this memory from the local CPU so
+	 * there's no need for lock or memory barriers.
+	 * An optimization barrier is implied in apic write.
+	 */
+	if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
+		return;
+	apic_write(APIC_EOI, APIC_EOI_ACK);
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)
 		       smp_processor_id());
 	}
 
+	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+		unsigned long pa;
+		/* Size alignment is implied but just to make it explicit. */
+		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+		__get_cpu_var(kvm_apic_eoi) = 0;
+		pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
+		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
+	}
+
 	if (has_steal_clock)
 		kvm_register_steal_time();
 }
 
-static void kvm_pv_disable_apf(void *unused)
+static void kvm_pv_disable_apf(void)
 {
 	if (!__get_cpu_var(apf_reason).enabled)
 		return;
@@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)
 	       smp_processor_id());
 }
 
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+	/*
+	 * We disable PV EOI before we load a new kernel by kexec,
+	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
+	 * New kernel can re-enable when it boots.
+	 */
+	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+	kvm_pv_disable_apf();
+}
+
 static int kvm_pv_reboot_notify(struct notifier_block *nb,
 				unsigned long code, void *unused)
 {
 	if (code == SYS_RESTART)
-		on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
 	return NOTIFY_DONE;
 }
 
@@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 static void kvm_guest_cpu_offline(void *dummy)
 {
 	kvm_disable_steal_time();
-	kvm_pv_disable_apf(NULL);
+	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+	kvm_pv_disable_apf();
 	apf_task_wake_all();
 }
 
@@ -424,6 +466,9 @@ void __init kvm_guest_init(void)
 		pv_time_ops.steal_clock = kvm_steal_clock;
 	}
 
+	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+		apic_set_eoi_write(kvm_guest_apic_eoi_write);
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
@@ -432,6 +477,19 @@ void __init kvm_guest_init(void)
 #endif
 }
 
+static bool __init kvm_detect(void)
+{
+	if (!kvm_para_available())
+		return false;
+	return true;
+}
+
+const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+	.name			= "KVM",
+	.detect			= kvm_detect,
+};
+EXPORT_SYMBOL_GPL(x86_hyper_kvm);
+
 static __init int activate_jump_labels(void)
 {
 	if (has_steal_clock) {
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8a2ce8fd41c0..82746f942cd8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
 				  unsigned int *current_size)
 {
 	struct microcode_header_amd *mc_hdr;
-	unsigned int actual_size;
+	unsigned int actual_size, patch_size;
 	u16 equiv_cpu_id;
 
 	/* size of the current patch we're staring at */
-	*current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+	patch_size = *(u32 *)(ucode_ptr + 4);
+	*current_size = patch_size + SECTION_HDR_SIZE;
 
 	equiv_cpu_id = find_equiv_id();
 	if (!equiv_cpu_id)
@@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
 	/*
 	 * now that the header looks sane, verify its size
 	 */
-	actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+	actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
 	if (!actual_size)
 		return 0;
 
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fbdfc6917180..4873e62db6a1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -87,6 +87,7 @@
 #include <asm/microcode.h>
 #include <asm/processor.h>
 #include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu)
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	int err = 0;
 
-	mutex_lock(&microcode_mutex);
 	if (uci->valid) {
 		enum ucode_state ustate;
 
@@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu)
 			if (ustate == UCODE_ERROR)
 				err = -EINVAL;
 	}
-	mutex_unlock(&microcode_mutex);
 
 	return err;
 }
@@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev,
 			    const char *buf, size_t size)
 {
 	unsigned long val;
-	int cpu = dev->id;
-	ssize_t ret = 0;
+	int cpu;
+	ssize_t ret = 0, tmp_ret;
 
 	ret = kstrtoul(buf, 0, &val);
 	if (ret)
 		return ret;
 
-	if (val == 1) {
-		get_online_cpus();
-		if (cpu_online(cpu))
-			ret = reload_for_cpu(cpu);
-		put_online_cpus();
+	if (val != 1)
+		return size;
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+	for_each_online_cpu(cpu) {
+		tmp_ret = reload_for_cpu(cpu);
+		if (tmp_ret != 0)
+			pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
+		/* save retval of the first encountered reload error */
+		if (!ret)
+			ret = tmp_ret;
 	}
+	if (!ret)
+		perf_check_microcode();
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
 
 	if (!ret)
 		ret = size;
@@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);
 static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
 
 static struct attribute *mc_default_attrs[] = {
-	&dev_attr_reload.attr,
 	&dev_attr_version.attr,
 	&dev_attr_processor_flags.attr,
 	NULL
@@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = {
 
 #ifdef MODULE
 /* Autoload on Intel and AMD systems */
-static const struct x86_cpu_id microcode_id[] = {
+static const struct x86_cpu_id __initconst microcode_id[] = {
 #ifdef CONFIG_MICROCODE_INTEL
 	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
 #endif
@@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = {
 MODULE_DEVICE_TABLE(x86cpu, microcode_id);
 #endif
 
+static struct attribute *cpu_root_microcode_attrs[] = {
+	&dev_attr_reload.attr,
+	NULL
+};
+
+static struct attribute_group cpu_root_microcode_group = {
+	.name  = "microcode",
+	.attrs = cpu_root_microcode_attrs,
+};
+
 static int __init microcode_init(void)
 {
 	struct cpuinfo_x86 *c = &cpu_data(0);
@@ -540,16 +560,25 @@ static int __init microcode_init(void)
 	mutex_lock(&microcode_mutex);
 
 	error = subsys_interface_register(&mc_cpu_interface);
-
+	if (!error)
+		perf_check_microcode();
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
 
 	if (error)
 		goto out_pdev;
 
+	error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+				   &cpu_root_microcode_group);
+
+	if (error) {
+		pr_err("Error creating microcode group!\n");
+		goto out_driver;
+	}
+
 	error = microcode_dev_init();
 	if (error)
-		goto out_driver;
+		goto out_ucode_group;
 
 	register_syscore_ops(&mc_syscore_ops);
 	register_hotcpu_notifier(&mc_cpu_notifier);
@@ -559,7 +588,11 @@ static int __init microcode_init(void)
 
 	return 0;
 
-out_driver:
+ out_ucode_group:
+	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+			   &cpu_root_microcode_group);
+
+ out_driver:
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
@@ -568,7 +601,7 @@ out_driver:
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
 
-out_pdev:
+ out_pdev:
 	platform_device_unregister(microcode_pdev);
 	return error;
 
@@ -584,6 +617,9 @@ static void __exit microcode_exit(void)
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
 	unregister_syscore_ops(&mc_syscore_ops);
 
+	sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+			   &cpu_root_microcode_group);
+
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f21fd94ac897..216a4d754b0c 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
@@ -30,9 +33,14 @@
 #include <asm/pgtable.h>
 
 #if 0
-#define DEBUGP printk
+#define DEBUGP(fmt, ...)				\
+	printk(KERN_DEBUG fmt, ##__VA_ARGS__)
 #else
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt, ...)				\
+do {							\
+	if (0)						\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+} while (0)
 #endif
 
 void *module_alloc(unsigned long size)
@@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
 	Elf32_Sym *sym;
 	uint32_t *location;
 
-	DEBUGP("Applying relocate section %u to %u\n", relsec,
-	       sechdrs[relsec].sh_info);
+	DEBUGP("Applying relocate section %u to %u\n",
+	       relsec, sechdrs[relsec].sh_info);
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
 		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
 			*location += sym->st_value;
 			break;
 		case R_386_PC32:
-			/* Add the value, subtract its postition */
+			/* Add the value, subtract its position */
 			*location += sym->st_value - (uint32_t)location;
 			break;
 		default:
-			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+			pr_err("%s: Unknown relocation: %u\n",
 			       me->name, ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 	void *loc;
 	u64 val;
 
-	DEBUGP("Applying relocate section %u to %u\n", relsec,
-	       sechdrs[relsec].sh_info);
+	DEBUGP("Applying relocate section %u to %u\n",
+	       relsec, sechdrs[relsec].sh_info);
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
 		loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 			+ ELF64_R_SYM(rel[i].r_info);
 
 		DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
-			(int)ELF64_R_TYPE(rel[i].r_info),
-			sym->st_value, rel[i].r_addend, (u64)loc);
+		       (int)ELF64_R_TYPE(rel[i].r_info),
+		       sym->st_value, rel[i].r_addend, (u64)loc);
 
 		val = sym->st_value + rel[i].r_addend;
 
@@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 #endif
 			break;
 		default:
-			printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
+			pr_err("%s: Unknown rela relocation: %llu\n",
 			       me->name, ELF64_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 	return 0;
 
 overflow:
-	printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
+	pr_err("overflow in relocation type %d val %Lx\n",
 	       (int)ELF64_R_TYPE(rel[i].r_info), val);
-	printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+	pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
 	       me->name);
 	return -ENOEXEC;
 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index a0b2f84457be..f84f5c57de35 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
 #ifdef CONFIG_X86_32
 /*
  * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C. Simply have 3 states
- * the NMI can be in.
+ * add a workaround to the iret problem in C (preventing nested
+ * NMIs if an NMI takes a trap). Simply have 3 states the NMI
+ * can be in:
  *
  *  1) not running
  *  2) executing
@@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
  * If an NMI hits a breakpoint that executes an iret, another
  * NMI can preempt it. We do not want to allow this new NMI
  * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the first NMI will perform
- * an cmpxchg on the state, and if it doesn't successfully
- * reset the state to "not running" it will restart the next
- * NMI.
+ * We set the state to "latched", and the exit of the first NMI will
+ * perform a dec_return, if the result is zero (NOT_RUNNING), then
+ * it will simply exit the NMI handler. If not, the dec_return
+ * would have set the state to NMI_EXECUTING (what we want it to
+ * be when we are running). In this case, we simply jump back
+ * to rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
  */
 enum nmi_states {
-	NMI_NOT_RUNNING,
+	NMI_NOT_RUNNING = 0,
 	NMI_EXECUTING,
 	NMI_LATCHED,
 };
 static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
 
 #define nmi_nesting_preprocess(regs)					\
 	do {								\
-		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
-			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {	\
+			this_cpu_write(nmi_state, NMI_LATCHED);		\
 			return;						\
 		}							\
-	nmi_restart:							\
-		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
-	} while (0)
+		this_cpu_write(nmi_state, NMI_EXECUTING);		\
+		this_cpu_write(nmi_cr2, read_cr2());			\
+	} while (0);							\
+	nmi_restart:
 
 #define nmi_nesting_postprocess()					\
 	do {								\
-		if (cmpxchg(&__get_cpu_var(nmi_state),			\
-		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))	\
+			write_cr2(this_cpu_read(nmi_cr2));		\
+		if (this_cpu_dec_return(nmi_state))			\
 			goto nmi_restart;				\
 	} while (0)
 #else /* x86_64 */
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 149b8d9c6ad4..6d9582ec0324 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
 static void __init init_nmi_testsuite(void)
 {
 	/* trap all the unknown NMIs we may generate */
-	register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+			__initdata);
 }
 
 static void __init cleanup_nmi_testsuite(void)
@@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask)
 {
 	unsigned long timeout;
 
-	if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback,
-				 NMI_FLAG_FIRST, "nmi_selftest")) {
+	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+				 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
 		nmi_fail = FAILURE;
 		return;
 	}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9ce885996fd7..17fff18a1031 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = {
 #endif
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
-	.rdmsr_regs = native_rdmsr_safe_regs,
 	.write_msr = native_write_msr_safe,
-	.wrmsr_regs = native_wrmsr_safe_regs,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 	.read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index b72838bae64a..299d49302e7d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
+#define pr_fmt(fmt) "Calgary: " fmt
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/types.h>
@@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
 					  npages, 0, boundary_size, 0);
 		if (offset == ~0UL) {
-			printk(KERN_WARNING "Calgary: IOMMU full.\n");
+			pr_warn("IOMMU full\n");
 			spin_unlock_irqrestore(&tbl->it_lock, flags);
 			if (panic_on_overflow)
 				panic("Calgary: fix the allocator.\n");
@@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	entry = iommu_range_alloc(dev, tbl, npages);
 
 	if (unlikely(entry == DMA_ERROR_CODE)) {
-		printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
-		       "iommu %p\n", npages, tbl);
+		pr_warn("failed to allocate %u pages in iommu %p\n",
+			npages, tbl);
 		return DMA_ERROR_CODE;
 	}
 
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
 		i++;
 	} while ((val & 0xff) != 0xff && i < 100);
 	if (i == 100)
-		printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
-		       "continuing anyway\n");
+		pr_warn("PCI bus not quiesced, continuing anyway\n");
 
 	/* invalidate TCE cache */
 	target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -604,8 +605,7 @@ begin:
 		i++;
 	} while ((val64 & 0xff) != 0xff && i < 100);
 	if (i == 100)
-		printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
-		       "continuing anyway\n");
+		pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
 
 	/* 3. poll Page Migration DEBUG for SoftStopFault */
 	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,8 +617,7 @@ begin:
 		if (++count < 100)
 			goto begin;
 		else {
-			printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
-			       "aborting TCE cache flush sequence!\n");
+			pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
 			return; /* pray for the best */
 		}
 	}
@@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
 	plssr = be32_to_cpu(readl(target));
 
 	/* If no error, the agent ID in the CSR is not valid */
-	printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
-	       "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
+	pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
+		 tbl->it_busno, csr, plssr);
 }
 
 static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
 	target = calgary_reg(bbar, phboff | 0x800);
 	mck = be32_to_cpu(readl(target));
 
-	printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
-	       tbl->it_busno);
+	pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
 
-	printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
-	       csr, plssr, csmr, mck);
+	pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+		 csr, plssr, csmr, mck);
 
 	/* dump rest of error regs */
-	printk(KERN_EMERG "Calgary: ");
+	pr_emerg("");
 	for (i = 0; i < ARRAY_SIZE(errregs); i++) {
 		/* err regs are at 0x810 - 0x870 */
 		erroff = (0x810 + (i * 0x10));
 		target = calgary_reg(bbar, phboff | erroff);
 		errregs[i] = be32_to_cpu(readl(target));
-		printk("0x%08x@0x%lx ", errregs[i], erroff);
+		pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
 	}
-	printk("\n");
+	pr_cont("\n");
 
 	/* root complex status */
 	target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c0f420f76cd3..de2b7ad70273 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;
  */
 int iommu_pass_through __read_mostly;
 
-/*
- * Group multi-function PCI devices into a single device-group for the
- * iommu_device_group interface.  This tells the iommu driver to pretend
- * it cannot distinguish between functions of a device, exposing only one
- * group for the device.  Useful for disallowing use of individual PCI
- * functions from userspace drivers.
- */
-int iommu_group_mf __read_mostly;
-
 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 
 /* Dummy device used for NULL arguments (normally ISA). */
@@ -194,8 +185,6 @@ static __init int iommu_setup(char *p)
 #endif
 		if (!strncmp(p, "pt", 2))
 			iommu_pass_through = 1;
-		if (!strncmp(p, "group_mf", 8))
-			iommu_group_mf = 1;
 
 		gart_parse_options(p);
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 735279e54e59..ef6a8456f719 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -145,16 +147,14 @@ void show_regs_common(void)
 	/* Board Name is optional */
 	board = dmi_get_system_info(DMI_BOARD_NAME);
 
-	printk(KERN_CONT "\n");
-	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-	printk(KERN_CONT " %s %s", vendor, product);
-	if (board)
-		printk(KERN_CONT "/%s", board);
-	printk(KERN_CONT "\n");
+	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
+	       current->pid, current->comm, print_tainted(),
+	       init_utsname()->release,
+	       (int)strcspn(init_utsname()->version, " "),
+	       init_utsname()->version,
+	       vendor, product,
+	       board ? "/" : "",
+	       board ? board : "");
 }
 
 void flush_thread(void)
@@ -645,7 +645,7 @@ static void amd_e400_idle(void)
 			amd_e400_c1e_detected = true;
 			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 				mark_tsc_unstable("TSC halt in AMD C1E");
-			printk(KERN_INFO "System has AMD C1E enabled\n");
+			pr_info("System has AMD C1E enabled\n");
 		}
 	}
 
@@ -659,8 +659,7 @@ static void amd_e400_idle(void)
 			 */
 			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
 					   &cpu);
-			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
-			       cpu);
+			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
@@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	if (pm_idle == poll_idle && smp_num_siblings > 1) {
-		printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
-			" performance may degrade.\n");
+		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
 	}
 #endif
 	if (pm_idle)
@@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		/*
 		 * One CPU supports mwait => All CPUs supports mwait
 		 */
-		printk(KERN_INFO "using mwait in idle threads.\n");
+		pr_info("using mwait in idle threads\n");
 		pm_idle = mwait_idle;
 	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
-		printk(KERN_INFO "using AMD E400 aware idle routine\n");
+		pr_info("using AMD E400 aware idle routine\n");
 		pm_idle = amd_e400_idle;
 	} else
 		pm_idle = default_idle;
@@ -715,7 +713,7 @@ static int __init idle_setup(char *str)
 		return -EINVAL;
 
 	if (!strcmp(str, "poll")) {
-		printk("using polling idle threads.\n");
+		pr_info("using polling idle threads\n");
 		pm_idle = poll_idle;
 		boot_option_idle_override = IDLE_POLL;
 	} else if (!strcmp(str, "mwait")) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 61cdf7fdf099..0a980c9d7cb8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
 		if (dead_task->mm->context.size) {
-			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
-					dead_task->comm,
-					dead_task->mm->context.ldt,
-					dead_task->mm->context.size);
+			pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+				dead_task->comm,
+				dead_task->mm->context.ldt,
+				dead_task->mm->context.size);
 			BUG();
 		}
 	}
@@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 			task->thread.gs = addr;
 			if (doit) {
 				load_gs_index(0);
-				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
 			}
 		}
 		put_cpu();
@@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 				/* set the selector to 0 to not confuse
 				   __switch_to */
 				loadsegment(fs, 0);
-				ret = checking_wrmsrl(MSR_FS_BASE, addr);
+				ret = wrmsrl_safe(MSR_FS_BASE, addr);
 			}
 		}
 		put_cpu();
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a15a632..1b27de563561 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
 
 #if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
 /* Set correct numa_node information for AMD NB functions */
-static void __init quirk_amd_nb_node(struct pci_dev *dev)
+static void __devinit quirk_amd_nb_node(struct pci_dev *dev)
 {
 	struct pci_dev *nb_ht;
 	unsigned int devfn;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 5de92f1abd76..52190a938b4a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/init.h>
@@ -20,14 +22,12 @@
 #include <asm/virtext.h>
 #include <asm/cpu.h>
 #include <asm/nmi.h>
+#include <asm/smp.h>
 
-#ifdef CONFIG_X86_32
-# include <linux/ctype.h>
-# include <linux/mc146818rtc.h>
-# include <asm/realmode.h>
-#else
-# include <asm/x86_init.h>
-#endif
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
 
 /*
  * Power off function, if any
@@ -49,7 +49,7 @@ int reboot_force;
  */
 static int reboot_default = 1;
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 static int reboot_cpu = -1;
 #endif
 
@@ -67,8 +67,8 @@ bool port_cf9_safe = false;
  * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
  * warm   Don't set the cold reboot flag
  * cold   Set the cold reboot flag
- * bios   Reboot by jumping through the BIOS (only for X86_32)
- * smp    Reboot by executing reset on BSP or other CPU (only for X86_32)
+ * bios   Reboot by jumping through the BIOS
+ * smp    Reboot by executing reset on BSP or other CPU
  * triple Force a triple fault (init)
  * kbd    Use the keyboard controller. cold reset (default)
  * acpi   Use the RESET_REG in the FADT
@@ -95,7 +95,6 @@ static int __init reboot_setup(char *str)
 			reboot_mode = 0;
 			break;
 
-#ifdef CONFIG_X86_32
 #ifdef CONFIG_SMP
 		case 's':
 			if (isdigit(*(str+1))) {
@@ -112,7 +111,6 @@ static int __init reboot_setup(char *str)
 #endif /* CONFIG_SMP */
 
 		case 'b':
-#endif
 		case 'a':
 		case 'k':
 		case 't':
@@ -138,7 +136,6 @@ static int __init reboot_setup(char *str)
 __setup("reboot=", reboot_setup);
 
 
-#ifdef CONFIG_X86_32
 /*
  * Reboot options and system auto-detection code provided by
  * Dell Inc. so their systems "just work". :-)
@@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
 {
 	if (reboot_type != BOOT_BIOS) {
 		reboot_type = BOOT_BIOS;
-		printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+			"BIOS", d->ident);
 	}
 	return 0;
 }
 
-void machine_real_restart(unsigned int type)
+void __noreturn machine_real_restart(unsigned int type)
 {
-	void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
-		real_mode_header->machine_real_restart_asm;
-
 	local_irq_disable();
 
 	/*
@@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type)
 	/*
 	 * Switch back to the initial page table.
 	 */
+#ifdef CONFIG_X86_32
 	load_cr3(initial_page_table);
-
-	/*
-	 * Write 0x1234 to absolute memory location 0x472.  The BIOS reads
-	 * this on booting to tell it to "Bypass memory test (also warm
-	 * boot)".  This seems like a fairly standard thing that gets set by
-	 * REBOOT.COM programs, and the previous reset routine did this
-	 * too. */
-	*((unsigned short *)0x472) = reboot_mode;
+#else
+	write_cr3(real_mode_header->trampoline_pgd);
+#endif
 
 	/* Jump to the identity-mapped low memory code */
-	restart_lowmem(type);
+#ifdef CONFIG_X86_32
+	asm volatile("jmpl *%0" : :
+		     "rm" (real_mode_header->machine_real_restart_asm),
+		     "a" (type));
+#else
+	asm volatile("ljmpl *%0" : :
+		     "m" (real_mode_header->machine_real_restart_asm),
+		     "D" (type));
+#endif
+	unreachable();
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(machine_real_restart);
 #endif
 
-#endif /* CONFIG_X86_32 */
-
 /*
  * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
  */
@@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
 {
 	if (reboot_type != BOOT_CF9) {
 		reboot_type = BOOT_CF9;
-		printk(KERN_INFO "%s series board detected. "
-		       "Selecting PCI-method for reboots.\n", d->ident);
+		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+			"PCI", d->ident);
 	}
 	return 0;
 }
@@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
 {
 	if (reboot_type != BOOT_KBD) {
 		reboot_type = BOOT_KBD;
-		printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
+		pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+			"KBD", d->ident);
 	}
 	return 0;
 }
 
 /*
- * This is a single dmi_table handling all reboot quirks.  Note that
- * REBOOT_BIOS is only available for 32bit
+ * This is a single dmi_table handling all reboot quirks.
  */
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
-#ifdef CONFIG_X86_32
 	{	/* Handle problems with rebooting on Dell E520's */
 		.callback = set_bios_reboot,
 		.ident = "Dell E520",
@@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
 		},
 	},
-#endif /* CONFIG_X86_32 */
 
 	{	/* Handle reboot issue on Acer Aspire one */
 		.callback = set_kbd_reboot,
@@ -584,13 +580,11 @@ static void native_machine_emergency_restart(void)
 			reboot_type = BOOT_KBD;
 			break;
 
-#ifdef CONFIG_X86_32
 		case BOOT_BIOS:
 			machine_real_restart(MRR_BIOS);
 
 			reboot_type = BOOT_KBD;
 			break;
-#endif
 
 		case BOOT_ACPI:
 			acpi_reboot();
@@ -632,12 +626,10 @@ void native_machine_shutdown(void)
 	/* The boot cpu is always logical cpu 0 */
 	int reboot_cpu_id = 0;
 
-#ifdef CONFIG_X86_32
 	/* See if there has been given a command line override */
 	if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
 		cpu_online(reboot_cpu))
 		reboot_cpu_id = reboot_cpu;
-#endif
 
 	/* Make certain the cpu I'm about to reboot on is online */
 	if (!cpu_online(reboot_cpu_id))
@@ -678,7 +670,7 @@ static void __machine_emergency_restart(int emergency)
 
 static void native_machine_restart(char *__unused)
 {
-	printk("machine restart\n");
+	pr_notice("machine restart\n");
 
 	if (!reboot_force)
 		machine_shutdown();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 16be6dc14db1..4f165479c453 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -961,9 +961,7 @@ void __init setup_arch(char **cmdline_p)
 	kvmclock_init();
 #endif
 
-	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
-	paging_init();
-	x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+	x86_init.paging.pagetable_init();
 
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
@@ -1031,8 +1029,6 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.timers.wallclock_init();
 
-	x86_platform.wallclock_init();
-
 	mcheck_init();
 
 	arch_init_ideal_nops();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5a98aa272184..5cdff0357746 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
 #include <asm/cpu.h>
 #include <asm/stackprotector.h>
 
-DEFINE_PER_CPU(int, cpu_number);
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
 EXPORT_PER_CPU_SYMBOL(cpu_number);
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 21af737053aa..b280908a376e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,6 +6,9 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 		       me->comm, me->pid, where, frame,
 		       regs->ip, regs->sp, regs->orig_ax);
 		print_vma_addr(" in ", regs->ip);
-		printk(KERN_CONT "\n");
+		pr_cont("\n");
 	}
 
 	force_sig(SIGSEGV, me);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7bd8a0823654..7c5a8c314c02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
-/*
+ /*
  *	x86 SMP booting functions
  *
  *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,6 +39,8 @@
  *	Glauber Costa		:	i386 and x86_64 integration
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/module.h>
@@ -104,17 +106,17 @@ int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
 
 /* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
 
 /* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 
 /* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
-DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void)
 	 * boards)
 	 */
 
-	pr_debug("CALLIN, before setup_local_APIC().\n");
+	pr_debug("CALLIN, before setup_local_APIC()\n");
 	if (apic->smp_callin_clear_local_apic)
 		apic->smp_callin_clear_local_apic();
 	setup_local_APIC();
@@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused)
 	check_tsc_sync_target();
 
 	/*
-	 * We need to hold call_lock, so there is no inconsistency
-	 * between the time smp_call_function() determines number of
-	 * IPI recipients, and the time when the determination is made
-	 * for which cpus receive the IPI. Holding this
-	 * lock helps us to not include this cpu in a currently in progress
-	 * smp_call_function().
-	 *
 	 * We need to hold vector_lock so there the set of online cpus
 	 * does not change while we are assigning vectors to cpus.  Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
 	 */
-	ipi_call_lock();
 	lock_vector_lock();
 	set_cpu_online(smp_processor_id(), true);
 	unlock_vector_lock();
-	ipi_call_unlock();
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	x86_platform.nmi_init();
 
@@ -432,17 +425,16 @@ static void impress_friends(void)
 	/*
 	 * Allow the user to impress friends.
 	 */
-	pr_debug("Before bogomips.\n");
+	pr_debug("Before bogomips\n");
 	for_each_possible_cpu(cpu)
 		if (cpumask_test_cpu(cpu, cpu_callout_mask))
 			bogosum += cpu_data(cpu).loops_per_jiffy;
-	printk(KERN_INFO
-		"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
 		num_online_cpus(),
 		bogosum/(500000/HZ),
 		(bogosum/(5000/HZ))%100);
 
-	pr_debug("Before bogocount - setting activated=1.\n");
+	pr_debug("Before bogocount - setting activated=1\n");
 }
 
 void __inquire_remote_apic(int apicid)
@@ -452,18 +444,17 @@ void __inquire_remote_apic(int apicid)
 	int timeout;
 	u32 status;
 
-	printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
+	pr_info("Inquiring remote APIC 0x%x...\n", apicid);
 
 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
-		printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
+		pr_info("... APIC 0x%x %s: ", apicid, names[i]);
 
 		/*
 		 * Wait for idle.
 		 */
 		status = safe_apic_wait_icr_idle();
 		if (status)
-			printk(KERN_CONT
-			       "a previous APIC delivery may have failed\n");
+			pr_cont("a previous APIC delivery may have failed\n");
 
 		apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 
@@ -476,10 +467,10 @@ void __inquire_remote_apic(int apicid)
 		switch (status) {
 		case APIC_ICR_RR_VALID:
 			status = apic_read(APIC_RRR);
-			printk(KERN_CONT "%08x\n", status);
+			pr_cont("%08x\n", status);
 			break;
 		default:
-			printk(KERN_CONT "failed\n");
+			pr_cont("failed\n");
 		}
 	}
 }
@@ -513,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 			apic_write(APIC_ESR, 0);
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
 	}
-	pr_debug("NMI sent.\n");
+	pr_debug("NMI sent\n");
 
 	if (send_status)
-		printk(KERN_ERR "APIC never delivered???\n");
+		pr_err("APIC never delivered???\n");
 	if (accept_status)
-		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+		pr_err("APIC delivery error (%lx)\n", accept_status);
 
 	return (send_status | accept_status);
 }
@@ -540,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		apic_read(APIC_ESR);
 	}
 
-	pr_debug("Asserting INIT.\n");
+	pr_debug("Asserting INIT\n");
 
 	/*
 	 * Turn INIT on target chip
@@ -556,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 
 	mdelay(10);
 
-	pr_debug("Deasserting INIT.\n");
+	pr_debug("Deasserting INIT\n");
 
 	/* Target chip */
 	/* Send IPI */
@@ -589,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	/*
 	 * Run STARTUP IPI loop.
 	 */
-	pr_debug("#startup loops: %d.\n", num_starts);
+	pr_debug("#startup loops: %d\n", num_starts);
 
 	for (j = 1; j <= num_starts; j++) {
-		pr_debug("Sending STARTUP #%d.\n", j);
+		pr_debug("Sending STARTUP #%d\n", j);
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
-		pr_debug("After apic_write.\n");
+		pr_debug("After apic_write\n");
 
 		/*
 		 * STARTUP IPI
@@ -613,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		 */
 		udelay(300);
 
-		pr_debug("Startup point 1.\n");
+		pr_debug("Startup point 1\n");
 
 		pr_debug("Waiting for send to finish...\n");
 		send_status = safe_apic_wait_icr_idle();
@@ -628,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		if (send_status || accept_status)
 			break;
 	}
-	pr_debug("After Startup.\n");
+	pr_debug("After Startup\n");
 
 	if (send_status)
-		printk(KERN_ERR "APIC never delivered???\n");
+		pr_err("APIC never delivered???\n");
 	if (accept_status)
-		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+		pr_err("APIC delivery error (%lx)\n", accept_status);
 
 	return (send_status | accept_status);
 }
@@ -647,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
 	if (system_state == SYSTEM_BOOTING) {
 		if (node != current_node) {
 			if (current_node > (-1))
-				pr_cont(" Ok.\n");
+				pr_cont(" OK\n");
 			current_node = node;
 			pr_info("Booting Node %3d, Processors ", node);
 		}
-		pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
+		pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
 		return;
 	} else
 		pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@ -731,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 		/*
 		 * allow APs to start initializing.
 		 */
-		pr_debug("Before Callout %d.\n", cpu);
+		pr_debug("Before Callout %d\n", cpu);
 		cpumask_set_cpu(cpu, cpu_callout_mask);
-		pr_debug("After Callout %d.\n", cpu);
+		pr_debug("After Callout %d\n", cpu);
 
 		/*
 		 * Wait 5s total for a response
@@ -761,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 				pr_err("CPU%d: Stuck ??\n", cpu);
 			else
 				/* trampoline code not run */
-				pr_err("CPU%d: Not responding.\n", cpu);
+				pr_err("CPU%d: Not responding\n", cpu);
 			if (apic->inquire_remote_apic)
 				apic->inquire_remote_apic(apicid);
 		}
@@ -806,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
 	    !physid_isset(apicid, phys_cpu_present_map) ||
 	    !apic->apic_id_valid(apicid)) {
-		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
+		pr_err("%s: bad cpu %d\n", __func__, cpu);
 		return -EINVAL;
 	}
 
@@ -887,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
 		unsigned int cpu;
 		unsigned nr;
 
-		printk(KERN_WARNING
-		       "More than 8 CPUs detected - skipping them.\n"
-		       "Use CONFIG_X86_BIGSMP.\n");
+		pr_warn("More than 8 CPUs detected - skipping them\n"
+			"Use CONFIG_X86_BIGSMP\n");
 
 		nr = 0;
 		for_each_present_cpu(cpu) {
@@ -910,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 #endif
 
 	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
-		printk(KERN_WARNING
-			"weird, boot CPU (#%d) not listed by the BIOS.\n",
+		pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
 			hard_smp_processor_id());
 
 		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -923,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 */
 	if (!smp_found_config && !acpi_lapic) {
 		preempt_enable();
-		printk(KERN_NOTICE "SMP motherboard not detected.\n");
+		pr_notice("SMP motherboard not detected\n");
 		disable_smp();
 		if (APIC_init_uniprocessor())
-			printk(KERN_NOTICE "Local APIC not detected."
-					   " Using dummy APIC emulation.\n");
+			pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
 		return -1;
 	}
 
@@ -936,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 * CPU too, but we do it for the sake of robustness anyway.
 	 */
 	if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
-		printk(KERN_NOTICE
-			"weird, boot CPU (#%d) not listed by the BIOS.\n",
-			boot_cpu_physical_apicid);
+		pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
+			  boot_cpu_physical_apicid);
 		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
 	}
 	preempt_enable();
@@ -951,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 		if (!disable_apic) {
 			pr_err("BIOS bug, local APIC #%d not detected!...\n",
 				boot_cpu_physical_apicid);
-			pr_err("... forcing use of dummy APIC emulation."
-				"(tell your hw vendor)\n");
+			pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
 		}
 		smpboot_clear_io_apic();
 		disable_ioapic_support();
@@ -965,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 * If SMP should be disabled, then really disable it!
 	 */
 	if (!max_cpus) {
-		printk(KERN_INFO "SMP mode deactivated.\n");
+		pr_info("SMP mode deactivated\n");
 		smpboot_clear_io_apic();
 
 		connect_bsp_APIC();
@@ -1017,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
 
 	if (smp_sanity_check(max_cpus) < 0) {
-		printk(KERN_INFO "SMP disabled\n");
+		pr_info("SMP disabled\n");
 		disable_smp();
 		goto out;
 	}
@@ -1055,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	 * Set up local APIC timer on boot CPU.
 	 */
 
-	printk(KERN_INFO "CPU%d: ", 0);
+	pr_info("CPU%d: ", 0);
 	print_cpu_info(&cpu_data(0));
 	x86_init.timers.setup_percpu_clockev();
 
@@ -1105,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void)
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
 {
-	pr_debug("Boot done.\n");
+	pr_debug("Boot done\n");
 
 	nmi_selftest();
 	impress_friends();
@@ -1166,8 +1152,7 @@ __init void prefill_possible_map(void)
 
 	/* nr_cpu_ids could be reduced via nr_cpus= */
 	if (possible > nr_cpu_ids) {
-		printk(KERN_WARNING
-			"%d Processors exceeds NR_CPUS limit of %d\n",
+		pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
 			possible, nr_cpu_ids);
 		possible = nr_cpu_ids;
 	}
@@ -1176,13 +1161,12 @@ __init void prefill_possible_map(void)
 	if (!setup_max_cpus)
 #endif
 	if (possible > i) {
-		printk(KERN_WARNING
-			"%d Processors exceeds max_cpus limit of %u\n",
+		pr_warn("%d Processors exceeds max_cpus limit of %u\n",
 			possible, setup_max_cpus);
 		possible = i;
 	}
 
-	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+	pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max_t(int, possible - num_processors, 0));
 
 	for (i = 0; i < possible; i++)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05b31d92f69c..b481341c9369 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,6 +9,9 @@
 /*
  * Handle hardware traps and faults.
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/interrupt.h>
 #include <linux/kallsyms.h>
 #include <linux/spinlock.h>
@@ -143,12 +146,11 @@ trap_signal:
 #ifdef CONFIG_X86_64
 	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
 	    printk_ratelimit()) {
-		printk(KERN_INFO
-		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
-		       tsk->comm, tsk->pid, str,
-		       regs->ip, regs->sp, error_code);
+		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+			tsk->comm, tsk->pid, str,
+			regs->ip, regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		pr_cont("\n");
 	}
 #endif
 
@@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
 
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 			printk_ratelimit()) {
-		printk(KERN_INFO
-			"%s[%d] general protection ip:%lx sp:%lx error:%lx",
+		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
 			tsk->comm, task_pid_nr(tsk),
 			regs->ip, regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		pr_cont("\n");
 	}
 
 	force_sig(SIGSEGV, tsk);
@@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 #if 0
 	/* No need to warn about this any longer. */
-	printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
 #endif
 }
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index fc0a147e3727..cfa5d4f7ca56 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
 #ifdef CONFIG_X86_TSC
 int __init notsc_setup(char *str)
 {
-	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-			"cannot disable TSC completely.\n");
+	pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
 	tsc_disabled = 1;
 	return 1;
 }
@@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void)
 			goto success;
 		}
 	}
-	printk("Fast TSC calibration failed\n");
+	pr_err("Fast TSC calibration failed\n");
 	return 0;
 
 success:
@@ -392,7 +393,7 @@ success:
 	 */
 	delta *= PIT_TICK_RATE;
 	do_div(delta, i*256*1000);
-	printk("Fast TSC calibration using PIT\n");
+	pr_info("Fast TSC calibration using PIT\n");
 	return delta;
 }
 
@@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void)
 		 * use the reference value, as it is more precise.
 		 */
 		if (delta >= 90 && delta <= 110) {
-			printk(KERN_INFO
-			       "TSC: PIT calibration matches %s. %d loops\n",
-			       hpet ? "HPET" : "PMTIMER", i + 1);
+			pr_info("PIT calibration matches %s. %d loops\n",
+				hpet ? "HPET" : "PMTIMER", i + 1);
 			return tsc_ref_min;
 		}
 
@@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void)
 	 */
 	if (tsc_pit_min == ULONG_MAX) {
 		/* PIT gave no useful value */
-		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+		pr_warn("Unable to calibrate against PIT\n");
 
 		/* We don't have an alternative source, disable TSC */
 		if (!hpet && !ref1 && !ref2) {
-			printk("TSC: No reference (HPET/PMTIMER) available\n");
+			pr_notice("No reference (HPET/PMTIMER) available\n");
 			return 0;
 		}
 
 		/* The alternative source failed as well, disable TSC */
 		if (tsc_ref_min == ULONG_MAX) {
-			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
-			       "failed.\n");
+			pr_warn("HPET/PMTIMER calibration failed\n");
 			return 0;
 		}
 
 		/* Use the alternative source */
-		printk(KERN_INFO "TSC: using %s reference calibration\n",
-		       hpet ? "HPET" : "PMTIMER");
+		pr_info("using %s reference calibration\n",
+			hpet ? "HPET" : "PMTIMER");
 
 		return tsc_ref_min;
 	}
 
 	/* We don't have an alternative source, use the PIT calibration value */
 	if (!hpet && !ref1 && !ref2) {
-		printk(KERN_INFO "TSC: Using PIT calibration value\n");
+		pr_info("Using PIT calibration value\n");
 		return tsc_pit_min;
 	}
 
 	/* The alternative source failed, use the PIT calibration value */
 	if (tsc_ref_min == ULONG_MAX) {
-		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
-		       "Using PIT calibration\n");
+		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
 		return tsc_pit_min;
 	}
 
@@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void)
 	 * the PIT value as we know that there are PMTIMERs around
 	 * running at double speed. At least we let the user know:
 	 */
-	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
-	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-	printk(KERN_INFO "TSC: Using PIT calibration value\n");
+	pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+	pr_info("Using PIT calibration value\n");
 	return tsc_pit_min;
 }
 
@@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason)
 		tsc_unstable = 1;
 		sched_clock_stable = 0;
 		disable_sched_clock_irqtime();
-		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
+		pr_info("Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
 			clocksource_mark_unstable(&clocksource_tsc);
@@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 		goto out;
 
 	tsc_khz = freq;
-	printk(KERN_INFO "Refined TSC clocksource calibration: "
-		"%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
-					(unsigned long)tsc_khz % 1000);
+	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+		(unsigned long)tsc_khz / 1000,
+		(unsigned long)tsc_khz % 1000);
 
 out:
 	clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -970,9 +968,9 @@ void __init tsc_init(void)
 		return;
 	}
 
-	printk("Detected %lu.%03lu MHz processor.\n",
-			(unsigned long)cpu_khz / 1000,
-			(unsigned long)cpu_khz % 1000);
+	pr_info("Detected %lu.%03lu MHz processor\n",
+		(unsigned long)cpu_khz / 1000,
+		(unsigned long)cpu_khz % 1000);
 
 	/*
 	 * Secondary CPUs do not run through tsc_init(), so set up
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index dc4e910a7d96..36fd42091fa7 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
  * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
  * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
  * Return 0 on success or a -ve number on error.
  */
-int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
 {
 	int ret;
 	struct insn insn;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 255f58ae71e8..54abcc0baf23 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,6 +28,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
@@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	local_irq_enable();
 
 	if (!current->thread.vm86_info) {
-		printk("no vm86_info: BAD\n");
+		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
 	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
 	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
 	if (tmp) {
-		printk("vm86: could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86_info\n");
 		do_exit(SIGSEGV);
 	}
 
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 8eeb55a551b4..992f890283e9 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -16,6 +16,7 @@
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
 #include <linux/smp.h>
+#include <linux/irq.h>
 
 #include <asm/apic.h>
 #include <asm/pci-direct.h>
@@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void)
 	ctl = readl(address + 4);
 	printk(KERN_INFO "vSMP CTL: capabilities:0x%08x  control:0x%08x\n",
 	       cap, ctl);
+
+	/* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+	if (cap & ctl & BIT(8)) {
+		ctl &= ~BIT(8);
+#ifdef CONFIG_PROC_FS
+		/* Don't let users change irq affinity via procfs */
+		no_irq_affinity = 1;
+#endif
+	}
+#endif
+
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */
 		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void)
 		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);
 		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
-
 		ctl &= ~(1 << 4);
-		writel(ctl, address + 4);
-		ctl = readl(address + 4);
-		printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
 	}
+	writel(ctl, address + 4);
+	ctl = readl(address + 4);
+	pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
 
 	early_iounmap(address, 8);
 }
@@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void)
 #endif
 }
 
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
+/*
+ * In vSMP, all cpus should be capable of handling interrupts, regardless of
+ * the APIC used.
+ */
+static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
+					  const struct cpumask *mask)
+{
+	cpumask_setall(retmask);
+}
+
+static void vsmp_apic_post_init(void)
+{
+	/* need to update phys_pkg_id */
+	apic->phys_pkg_id = apicid_phys_pkg_id;
+	apic->vector_allocation_domain = fill_vector_allocation_domain;
+}
+
 void __init vsmp_init(void)
 {
 	detect_vsmp_box();
 	if (!is_vsmp_box())
 		return;
 
+	x86_platform.apic_post_init = vsmp_apic_post_init;
+
 	vsmp_cap_cpus();
 
 	set_vsmp_pv_ops();
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 5db36caf4289..8d141b309046 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,6 +18,8 @@
  *  use the vDSO.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
-	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
-	struct task_struct *tsk;
-
-	if (!show_unhandled_signals || !__ratelimit(&rs))
+	if (!show_unhandled_signals)
 		return;
 
-	tsk = current;
-
-	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
-	       level, tsk->comm, task_pid_nr(tsk),
-	       message, regs->ip, regs->cs,
-	       regs->sp, regs->ax, regs->si, regs->di);
+	pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+			      level, current->comm, task_pid_nr(current),
+			      message, regs->ip, regs->cs,
+			      regs->sp, regs->ax, regs->si, regs->di);
 }
 
 static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 9796c2f3d074..6020f6f5927c 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic_string);
 EXPORT_SYMBOL(copy_user_generic_unrolled);
+EXPORT_SYMBOL(copy_user_enhanced_fast_string);
 EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 35c5e543f550..7a3d075a814a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -26,10 +26,8 @@
 
 void __cpuinit x86_init_noop(void) { }
 void __init x86_init_uint_noop(unsigned int unused) { }
-void __init x86_init_pgd_noop(pgd_t *unused) { }
 int __init iommu_init_noop(void) { return 0; }
 void iommu_shutdown_noop(void) { }
-void wallclock_init_noop(void) { }
 
 /*
  * The platform setup functions are preset with the default functions
@@ -69,8 +67,7 @@ struct x86_init_ops x86_init __initdata = {
 	},
 
 	.paging = {
-		.pagetable_setup_start	= native_pagetable_setup_start,
-		.pagetable_setup_done	= native_pagetable_setup_done,
+		.pagetable_init		= native_pagetable_init,
 	},
 
 	.timers = {
@@ -101,7 +98,6 @@ static int default_i8042_detect(void) { return 1; };
 
 struct x86_platform_ops x86_platform = {
 	.calibrate_tsc			= native_calibrate_tsc,
-	.wallclock_init			= wallclock_init_noop,
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
 	.iommu_shutdown			= iommu_shutdown_noop,
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index bd18149b2b0f..3d3e20709119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,6 +3,9 @@
  *
  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/bootmem.h>
 #include <linux/compat.h>
 #include <asm/i387.h>
@@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf)
 	BUG_ON(sig_xstate_size < xstate_size);
 
 	if ((unsigned long)buf % 64)
-		printk("save_i387_xstate: bad fpstate %p\n", buf);
+		pr_err("%s: bad fpstate %p\n", __func__, buf);
 
 	if (!used_math())
 		return 0;
@@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void)
 	pcntxt_mask = eax + ((u64)edx << 32);
 
 	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
-		printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+		pr_err("FP/SSE not shown under xsave features 0x%llx\n",
 		       pcntxt_mask);
 		BUG();
 	}
@@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void)
 
 	setup_xstate_init();
 
-	printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
-	       "cntxt size 0x%x\n",
-	       pcntxt_mask, xstate_size);
+	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
+		pcntxt_mask, xstate_size);
 }
 
 /*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7df1c6d839fb..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	unsigned f_lm = 0;
 #endif
 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
 
 	/* cpuid 1.edx */
 	const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
-		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
 		F(F16C) | F(RDRAND);
@@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-		F(BMI2) | F(ERMS) | F(RTM);
+		F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
+			     (1 << KVM_FEATURE_PV_EOI) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
 
 		if (sched_info_on())
@@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
 	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
 }
 
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
-	u32 function, index;
+	u32 function = *eax, index = *ecx;
 	struct kvm_cpuid_entry2 *best;
 
-	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
 	best = kvm_find_cpuid_entry(vcpu, function, index);
 
 	if (!best)
 		best = check_cpuid_limit(vcpu, function, index);
 
 	if (best) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
-		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
-		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
-		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
-	}
+		*eax = best->eax;
+		*ebx = best->ebx;
+		*ecx = best->ecx;
+		*edx = best->edx;
+	} else
+		*eax = *ebx = *ecx = *edx = 0;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 function, eax, ebx, ecx, edx;
+
+	function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	trace_kvm_cpuid(function,
-			kvm_register_read(vcpu, VCPU_REGS_RAX),
-			kvm_register_read(vcpu, VCPU_REGS_RBX),
-			kvm_register_read(vcpu, VCPU_REGS_RCX),
-			kvm_register_read(vcpu, VCPU_REGS_RDX));
+	trace_kvm_cpuid(function, eax, ebx, ecx, edx);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 			      struct kvm_cpuid2 *cpuid,
 			      struct kvm_cpuid_entry2 __user *entries);
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_OSVW));
 }
 
+static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_PCID));
+}
+
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f95d242ee9f7..a3b57a27be88 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->intercept(ctxt, &info, stage);
 }
 
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+	*dest = (*dest & ~mask) | (src & mask);
+}
+
 static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
 	return (1UL << (ctxt->ad_bytes << 3)) - 1;
 }
 
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+	u16 sel;
+	struct desc_struct ss;
+
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		return ~0UL;
+	ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+	return ~0U >> ((ss.d ^ 1) * 16);  /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+	return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
 /* Access/update address held in a register, based on addressing mode. */
 static inline unsigned long
 address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -454,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 	return address_mask(ctxt, reg);
 }
 
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+	assign_masked(reg, *reg + inc, mask);
+}
+
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
+	ulong mask;
+
 	if (ctxt->ad_bytes == sizeof(unsigned long))
-		*reg += inc;
+		mask = ~0UL;
 	else
-		*reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
+		mask = ad_mask(ctxt);
+	masked_increment(reg, mask, inc);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+	masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -958,6 +992,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	op->orig_val = op->val;
 }
 
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+	if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+		ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			struct operand *op)
 {
@@ -1061,15 +1101,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 
 			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
 				modrm_ea += insn_fetch(s32, ctxt);
-			else
+			else {
 				modrm_ea += ctxt->regs[base_reg];
+				adjust_modrm_seg(ctxt, base_reg);
+			}
 			if (index_reg != 4)
 				modrm_ea += ctxt->regs[index_reg] << scale;
 		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
 				ctxt->rip_relative = 1;
-		} else
-			modrm_ea += ctxt->regs[ctxt->modrm_rm];
+		} else {
+			base_reg = ctxt->modrm_rm;
+			modrm_ea += ctxt->regs[base_reg];
+			adjust_modrm_seg(ctxt, base_reg);
+		}
 		switch (ctxt->modrm_mod) {
 		case 0:
 			if (ctxt->modrm_rm == 5)
@@ -1264,7 +1309,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 
 /* allowed just for 8 bytes segments */
 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   u16 selector, struct desc_struct *desc)
+				   u16 selector, struct desc_struct *desc,
+				   ulong *desc_addr_p)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
@@ -1275,7 +1321,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
-	addr = dt.address + index * 8;
+	*desc_addr_p = addr = dt.address + index * 8;
 	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
 				   &ctxt->exception);
 }
@@ -1302,11 +1348,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
-	struct desc_struct seg_desc;
+	struct desc_struct seg_desc, old_desc;
 	u8 dpl, rpl, cpl;
 	unsigned err_vec = GP_VECTOR;
 	u32 err_code = 0;
 	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+	ulong desc_addr;
 	int ret;
 
 	memset(&seg_desc, 0, sizeof seg_desc);
@@ -1324,8 +1371,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		goto load;
 	}
 
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+	rpl = selector & 3;
+	cpl = ctxt->ops->cpl(ctxt);
+
+	/* NULL selector is not valid for TR, CS and SS (except for long mode) */
+	if ((seg == VCPU_SREG_CS
+	     || (seg == VCPU_SREG_SS
+		 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
+	     || seg == VCPU_SREG_TR)
 	    && null_selector)
 		goto exception;
 
@@ -1336,7 +1389,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (null_selector) /* for NULL selector skip all following checks */
 		goto load;
 
-	ret = read_segment_descriptor(ctxt, selector, &seg_desc);
+	ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -1352,9 +1405,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		goto exception;
 	}
 
-	rpl = selector & 3;
 	dpl = seg_desc.dpl;
-	cpl = ctxt->ops->cpl(ctxt);
 
 	switch (seg) {
 	case VCPU_SREG_SS:
@@ -1384,6 +1435,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	case VCPU_SREG_TR:
 		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
 			goto exception;
+		old_desc = seg_desc;
+		seg_desc.type |= 2; /* busy */
+		ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+						  sizeof(seg_desc), &ctxt->exception);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
 		break;
 	case VCPU_SREG_LDTR:
 		if (seg_desc.s || seg_desc.type != 2)
@@ -1474,17 +1531,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_push(struct x86_emulate_ctxt *ctxt)
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
 {
 	struct segmented_address addr;
 
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
-	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+	rsp_increment(ctxt, -bytes);
+	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 
+	return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
 	/* Disable writeback. */
 	ctxt->dst.type = OP_NONE;
-	return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
+	return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1493,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
+	rsp_increment(ctxt, len);
 	return rc;
 }
 
@@ -1556,6 +1618,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
 	return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+	int rc;
+	unsigned frame_size = ctxt->src.val;
+	unsigned nesting_level = ctxt->src2.val & 31;
+
+	if (nesting_level)
+		return X86EMUL_UNHANDLEABLE;
+
+	rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+	assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+		      stack_mask(ctxt));
+	assign_masked(&ctxt->regs[VCPU_REGS_RSP],
+		      ctxt->regs[VCPU_REGS_RSP] - frame_size,
+		      stack_mask(ctxt));
+	return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+	assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+		      stack_mask(ctxt));
+	return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+}
+
 static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
 {
 	int seg = ctxt->src2.val;
@@ -1612,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
-							ctxt->op_bytes);
+			rsp_increment(ctxt, ctxt->op_bytes);
 			--reg;
 		}
 
@@ -1993,8 +2081,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
 	u32 eax, ebx, ecx, edx;
 
 	eax = ecx = 0;
-	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
-		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
 		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
 		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
 }
@@ -2013,32 +2101,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 
 	eax = 0x00000000;
 	ecx = 0x00000000;
-	if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) {
-		/*
-		 * Intel ("GenuineIntel")
-		 * remark: Intel CPUs only support "syscall" in 64bit
-		 * longmode. Also an 64bit guest with a
-		 * 32bit compat-app running will #UD !! While this
-		 * behaviour can be fixed (by emulating) into AMD
-		 * response - CPUs of AMD can't behave like Intel.
-		 */
-		if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
-		    ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
-		    edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
-			return false;
+	ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	/*
+	 * Intel ("GenuineIntel")
+	 * remark: Intel CPUs only support "syscall" in 64bit
+	 * longmode. Also an 64bit guest with a
+	 * 32bit compat-app running will #UD !! While this
+	 * behaviour can be fixed (by emulating) into AMD
+	 * response - CPUs of AMD can't behave like Intel.
+	 */
+	if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+	    ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+	    edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+		return false;
 
-		/* AMD ("AuthenticAMD") */
-		if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
-		    ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
-		    edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
-			return true;
-
-		/* AMD ("AMDisbetter!") */
-		if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
-		    ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
-		    edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
-			return true;
-	}
+	/* AMD ("AuthenticAMD") */
+	if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+	    ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+	    edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+		return true;
+
+	/* AMD ("AMDisbetter!") */
+	if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+	    ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+	    edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+		return true;
 
 	/* default: (not Intel, not AMD), apply Intel's stricter rules... */
 	return false;
@@ -2547,13 +2634,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ulong old_tss_base =
 		ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
 	u32 desc_limit;
+	ulong desc_addr;
 
 	/* FIXME: old_tss_base == ~0 ? */
 
-	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2749,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 	rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
+	rsp_increment(ctxt, ctxt->src.val);
 	return X86EMUL_CONTINUE;
 }
 
@@ -2948,6 +3036,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+	u16 sel = ctxt->src.val;
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+	u16 sel = ctxt->src.val;
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
@@ -2989,11 +3095,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+				  void (*get)(struct x86_emulate_ctxt *ctxt,
+					      struct desc_ptr *ptr))
+{
+	struct desc_ptr desc_ptr;
+
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		ctxt->op_bytes = 8;
+	get(ctxt, &desc_ptr);
+	if (ctxt->op_bytes == 2) {
+		ctxt->op_bytes = 4;
+		desc_ptr.address &= 0x00ffffff;
+	}
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return segmented_write(ctxt, ctxt->dst.addr.mem,
+			       &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+	return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+	return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
 static int em_lgdt(struct x86_emulate_ctxt *ctxt)
 {
 	struct desc_ptr desc_ptr;
 	int rc;
 
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		ctxt->op_bytes = 8;
 	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
 			     ctxt->op_bytes);
@@ -3021,6 +3158,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
 	struct desc_ptr desc_ptr;
 	int rc;
 
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		ctxt->op_bytes = 8;
 	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
 			     ctxt->op_bytes);
@@ -3143,6 +3282,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = ctxt->regs[VCPU_REGS_RAX];
+	ecx = ctxt->regs[VCPU_REGS_RCX];
+	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	ctxt->regs[VCPU_REGS_RAX] = eax;
+	ctxt->regs[VCPU_REGS_RBX] = ebx;
+	ctxt->regs[VCPU_REGS_RCX] = ecx;
+	ctxt->regs[VCPU_REGS_RDX] = edx;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
+	ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+	switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+	case 8:
+		asm("bswap %0" : "+r"(ctxt->dst.val));
+		break;
+#endif
+	default:
+		asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+		break;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3424,14 +3599,14 @@ static struct opcode group5[] = {
 static struct opcode group6[] = {
 	DI(Prot,	sldt),
 	DI(Prot,	str),
-	DI(Prot | Priv,	lldt),
-	DI(Prot | Priv,	ltr),
+	II(Prot | Priv | SrcMem16, em_lldt, lldt),
+	II(Prot | Priv | SrcMem16, em_ltr, ltr),
 	N, N, N, N,
 };
 
 static struct group_dual group7 = { {
-	DI(Mov | DstMem | Priv,			sgdt),
-	DI(Mov | DstMem | Priv,			sidt),
+	II(Mov | DstMem | Priv,			em_sgdt, sgdt),
+	II(Mov | DstMem | Priv,			em_sidt, sidt),
 	II(SrcMem | Priv,			em_lgdt, lgdt),
 	II(SrcMem | Priv,			em_lidt, lidt),
 	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
@@ -3538,7 +3713,7 @@ static struct opcode opcode_table[256] = {
 	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
 	I(SrcImmFAddr | No64, em_call_far), N,
 	II(ImplicitOps | Stack, em_pushf, pushf),
-	II(ImplicitOps | Stack, em_popf, popf), N, N,
+	II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
 	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3561,7 +3736,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
-	N, N, N, I(ImplicitOps | Stack, em_ret_far),
+	I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
+	N, I(ImplicitOps | Stack, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
 	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
@@ -3635,7 +3811,7 @@ static struct opcode twobyte_table[256] = {
 	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
 	/* 0xA0 - 0xA7 */
 	I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
-	DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
+	II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
@@ -3658,11 +3834,12 @@ static struct opcode twobyte_table[256] = {
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
 	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
-	/* 0xC0 - 0xCF */
+	/* 0xC0 - 0xC7 */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
-	N, N, N, N, N, N, N, N,
+	/* 0xC8 - 0xCF */
+	X8(I(DstReg, em_bswap)),
 	/* 0xD0 - 0xDF */
 	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xEF */
@@ -4426,12 +4603,12 @@ twobyte_insn:
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		ctxt->dst.bytes = ctxt->op_bytes;
-		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+		ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
 						       : (u16) ctxt->src.val;
 		break;
 	case 0xbe ... 0xbf:	/* movsx */
 		ctxt->dst.bytes = ctxt->op_bytes;
-		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+		ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
 							(s16) ctxt->src.val;
 		break;
 	case 0xc0 ... 0xc1:	/* xadd */
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2be..e498b18f010c 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
 	pic_unlock(s);
 }
 
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
 {
-	struct kvm_pic *s = opaque;
 	int ret = -1;
 
 	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
-		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+		int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+						     irq_source_id, level);
+		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
 		pic_update_irq(s);
 		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
 				      s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 	return ret;
 }
 
+void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
+{
+	int i;
+
+	pic_lock(s);
+	for (i = 0; i < PIC_NUM_PINS; i++)
+		__clear_bit(irq_source_id, &s->irq_states[i]);
+	pic_unlock(s);
+}
+
 /*
  * acknowledge interrupt 'irq'
  */
@@ -305,6 +316,11 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
+			u8 edge_irr = s->irr & ~s->elcr;
+			int i;
+			bool found;
+			struct kvm_vcpu *vcpu;
+
 			s->init4 = val & 1;
 			s->last_irr = 0;
 			s->irr &= s->elcr;
@@ -322,6 +338,18 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			if (val & 0x08)
 				pr_pic_unimpl(
 					"level sensitive irq not supported");
+
+			kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+				if (kvm_apic_accept_pic_intr(vcpu)) {
+					found = true;
+					break;
+				}
+
+
+			if (found)
+				for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+					if (edge_irr & (1 << irq))
+						pic_clear_isr(s, irq);
 		} else if (val & 0x08) {
 			if (val & 0x04)
 				s->poll = 1;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93c15743f1ee..ce878788a39f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+static inline int __apic_test_and_set_vector(int vec, void *bitmap)
+{
+	return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
+{
+	return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
 static inline int apic_hw_enabled(struct kvm_lapic *apic)
 {
 	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap)
 		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
 }
 
+static u8 count_vectors(void *bitmap)
+{
+	u32 *word = bitmap;
+	int word_offset;
+	u8 count = 0;
+	for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
+		count += hweight32(word[word_offset << 2]);
+	return count;
+}
+
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
 	apic->irr_pending = true;
@@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 		apic->irr_pending = true;
 }
 
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+	if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+		++apic->isr_count;
+	BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+	/*
+	 * ISR (in service register) bit is set when injecting an interrupt.
+	 * The highest vector is injected. Thus the latest bit set matches
+	 * the highest bit in ISR.
+	 */
+	apic->highest_isr_cache = vec;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+	if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+		--apic->isr_count;
+	BUG_ON(apic->isr_count < 0);
+	apic->highest_isr_cache = -1;
+}
+
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 			irq->level, irq->trig_mode);
 }
 
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+				      sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+				      sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+	u8 val;
+	if (pv_eoi_get_user(vcpu, &val) < 0)
+		apic_debug("Can't read EOI MSR value: 0x%llx\n",
+			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+	return val & 0x1;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
+		apic_debug("Can't set EOI MSR value: 0x%llx\n",
+			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+		return;
+	}
+	__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
+		apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+			   (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+		return;
+	}
+	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 {
 	int result;
+	if (!apic->isr_count)
+		return -1;
+	if (likely(apic->highest_isr_cache != -1))
+		return apic->highest_isr_cache;
 
 	result = find_highest_vector(apic->regs + APIC_ISR);
 	ASSERT(result == -1 || result >= 16);
@@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
-static void apic_set_eoi(struct kvm_lapic *apic)
+static int apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
+
+	trace_kvm_eoi(apic, vector);
+
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
 	 */
 	if (vector == -1)
-		return;
+		return vector;
 
-	apic_clear_vector(vector, apic->regs + APIC_ISR);
+	apic_clear_isr(vector, apic);
 	apic_update_ppr(apic);
 
 	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
@@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
 	}
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+	return vector;
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	apic->irr_pending = false;
+	apic->isr_count = 0;
+	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
 	if (kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+	vcpu->arch.pv_eoi.msr_val = 0;
 	apic_update_ppr(apic);
 
 	vcpu->arch.apic_arb_prio = 0;
@@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 	if (vector == -1)
 		return -1;
 
-	apic_set_vector(vector, apic->regs + APIC_ISR);
+	apic_set_isr(vector, apic);
 	apic_update_ppr(apic);
 	apic_clear_irr(vector, apic);
 	return vector;
@@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
+	apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+	apic->highest_isr_cache = -1;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
@@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guests's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+					struct kvm_lapic *apic)
+{
+	bool pending;
+	int vector;
+	/*
+	 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+	 * and KVM_PV_EOI_ENABLED in guest memory as follows:
+	 *
+	 * KVM_APIC_PV_EOI_PENDING is unset:
+	 * 	-> host disabled PV EOI.
+	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+	 * 	-> host enabled PV EOI, guest did not execute EOI yet.
+	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+	 * 	-> host enabled PV EOI, guest executed EOI.
+	 */
+	BUG_ON(!pv_eoi_enabled(vcpu));
+	pending = pv_eoi_get_pending(vcpu);
+	/*
+	 * Clear pending bit in any case: it will be set again on vmentry.
+	 * While this might not be ideal from performance point of view,
+	 * this makes sure pv eoi is only enabled when we know it's safe.
+	 */
+	pv_eoi_clr_pending(vcpu);
+	if (pending)
+		return;
+	vector = apic_set_eoi(apic);
+	trace_kvm_pv_eoi(apic, vector);
+}
+
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 {
 	u32 data;
 	void *vapic;
 
+	if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+		apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
 	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
@@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
 	apic_set_tpr(vcpu->arch.apic, data & 0xff);
 }
 
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+					struct kvm_lapic *apic)
+{
+	if (!pv_eoi_enabled(vcpu) ||
+	    /* IRR set or many bits in ISR: could be nested. */
+	    apic->irr_pending ||
+	    /* Cache not set: could be safe but we don't bother. */
+	    apic->highest_isr_cache == -1 ||
+	    /* Need EOI to update ioapic. */
+	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+		/*
+		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+		 * so we need not do anything here.
+		 */
+		return;
+	}
+
+	pv_eoi_set_pending(apic->vcpu);
+}
+
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
 {
 	u32 data, tpr;
 	int max_irr, max_isr;
-	struct kvm_lapic *apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	void *vapic;
 
+	apic_sync_pv_eoi_to_guest(vcpu, apic);
+
 	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
 		return;
 
-	apic = vcpu->arch.apic;
 	tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
 	max_irr = apic_find_highest_irr(apic);
 	if (max_irr < 0)
@@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
 
 	return 0;
 }
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+{
+	u64 addr = data & ~KVM_MSR_ENABLED;
+	if (!IS_ALIGNED(addr, 4))
+		return 1;
+
+	vcpu->arch.pv_eoi.msr_val = data;
+	if (!pv_eoi_enabled(vcpu))
+		return 0;
+	return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+					 addr);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d09..4af5405ae1e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
 	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	bool irr_pending;
+	/* Number of bits set in ISR. */
+	s16 isr_count;
+	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+	int highest_isr_cache;
+	/**
+	 * APIC register page.  The layout matches the register layout seen by
+	 * the guest 1:1, because it is accessed by the vmx microcode.
+	 * Note: Only one register, the TPR, is used by the microcode.
+	 */
 	void *regs;
 	gpa_t vapic_addr;
 	struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
 }
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57e168e27b5b..7fbd0d273ea8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
 
 #define PTE_PREFETCH_NUM		8
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
 #define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif
 
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always atomicly update spte if it can be updated
+	 * out of mmu-lock, it can ensure dirty bit is not lost,
+	 * also, it can help us to get a stable is_writable_pte()
+	 * to ensure tlb flush is not missed.
+	 */
+	if (spte_is_locklessly_modifiable(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;
 
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 
 /* Rules for using mmu_spte_update:
  * Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
  */
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-	u64 mask, old_spte = *sptep;
+	u64 old_spte = *sptep;
+	bool ret = false;
 
 	WARN_ON(!is_rmap_spte(new_spte));
 
-	if (!is_shadow_present_pte(old_spte))
-		return mmu_spte_set(sptep, new_spte);
-
-	new_spte |= old_spte & shadow_dirty_mask;
-
-	mask = shadow_accessed_mask;
-	if (is_writable_pte(old_spte))
-		mask |= shadow_dirty_mask;
+	if (!is_shadow_present_pte(old_spte)) {
+		mmu_spte_set(sptep, new_spte);
+		return ret;
+	}
 
-	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	/*
+	 * For the spte updated out of mmu-lock is safe, since
+	 * we always atomicly update it, see the comments in
+	 * spte_has_volatile_bits().
+	 */
+	if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+		ret = true;
+
 	if (!shadow_accessed_mask)
-		return;
+		return ret;
 
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+	return ret;
 }
 
 /*
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 				mmu_page_header_cache);
 }
 
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-				    size_t size)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 {
 	void *p;
 
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
-				      sizeof(struct pte_list_desc));
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+	if (is_large_pte(*sptep)) {
+		WARN_ON(page_header(__pa(sptep))->role.level ==
+			PT_PAGE_TABLE_LEVEL);
+		drop_spte(kvm, sptep);
+		--kvm->stat.lpages;
+		return true;
+	}
+
+	return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	if (__drop_large_spte(vcpu->kvm, sptep))
+		kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte writ-protection is caused by protecting shadow page table.
+ * @flush indicates whether tlb need be flushed.
+ *
+ * Note: write protection is difference between drity logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ *   shadow page.
+ *
+ * Return true if the spte is dropped.
+ */
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+{
+	u64 spte = *sptep;
+
+	if (!is_writable_pte(spte) &&
+	      !(pt_protect && spte_is_locklessly_modifiable(spte)))
+		return false;
+
+	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+	if (__drop_large_spte(kvm, sptep)) {
+		*flush |= true;
+		return true;
+	}
+
+	if (pt_protect)
+		spte &= ~SPTE_MMU_WRITEABLE;
+	spte = spte & ~PT_WRITABLE_MASK;
+
+	*flush |= mmu_spte_update(sptep, spte);
+	return false;
+}
+
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+				 int level, bool pt_protect)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int write_protected = 0;
+	bool flush = false;
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-		if (!is_writable_pte(*sptep)) {
-			sptep = rmap_get_next(&iter);
-			continue;
-		}
-
-		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-			sptep = rmap_get_next(&iter);
-		} else {
-			BUG_ON(!is_large_pte(*sptep));
-			drop_spte(kvm, sptep);
-			--kvm->stat.lpages;
+		if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
 			sptep = rmap_get_first(*rmapp, &iter);
+			continue;
 		}
 
-		write_protected = 1;
+		sptep = rmap_get_next(&iter);
 	}
 
-	return write_protected;
+	return flush;
 }
 
 /**
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 
 	while (mask) {
 		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
-		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
 	}
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	int i;
-	int write_protected = 0;
+	bool write_protected = false;
 
 	slot = gfn_to_memslot(kvm, gfn);
 
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
 	}
 
 	return write_protected;
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
 	u64 *sptep;
-	struct rmap_iterator iter;
+	struct rmap_iterator uninitialized_var(iter);
 	int young = 0;
 
 	/*
-	 * Emulate the accessed bit for EPT, by checking if this page has
+	 * In case of absence of EPT Access and Dirty Bits supports,
+	 * emulate the accessed bit for EPT, by checking if this page has
 	 * an EPT mapping, and clearing it if it does. On the next access,
 	 * a new EPT mapping will be established.
 	 * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		BUG_ON(!is_shadow_present_pte(*sptep));
 
-		if (*sptep & PT_ACCESSED_MASK) {
+		if (*sptep & shadow_accessed_mask) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
+			clear_bit((ffs(shadow_accessed_mask) - 1),
+				 (unsigned long *)sptep);
 		}
 	}
 
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		BUG_ON(!is_shadow_present_pte(*sptep));
 
-		if (*sptep & PT_ACCESSED_MASK) {
+		if (*sptep & shadow_accessed_mask) {
 			young = 1;
 			break;
 		}
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
-	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
-					sizeof *sp);
-	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	if (!direct)
-		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-						  PAGE_SIZE);
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
-		int protected = 0;
+		bool protected = false;
 
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	mmu_spte_set(sptep, spte);
 }
 
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
-	if (is_large_pte(*sptep)) {
-		drop_spte(vcpu->kvm, sptep);
-		--vcpu->kvm->stat.lpages;
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	}
-}
-
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				   unsigned direct_access)
 {
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
-	u64 spte, entry = *sptep;
+	u64 spte;
 	int ret = 0;
 
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_x_mask;
 	else
 		spte |= shadow_nx_mask;
+
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;
+
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto done;
 		}
 
-		spte |= PT_WRITABLE_MASK;
+		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
 		if (!vcpu->arch.mmu.direct_map
 		    && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writable_pte(spte))
-				spte &= ~PT_WRITABLE_MASK;
+			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 		}
 	}
 
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	mmu_spte_update(sptep, spte);
-	/*
-	 * If we overwrite a writable spte with a read-only one we
-	 * should flush remote TLBs. Otherwise rmap_write_protect
-	 * will find a read-only spte, even though the writable spte
-	 * might be cached on a CPU's TLB.
-	 */
-	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
 done:
 	return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
+	mmu_free_roots(vcpu);
 }
 
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
 	return ret;
 }
 
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+	/*
+	 * #PF can be fast only if the shadow page table is present and it
+	 * is caused by write-protect, that means we just need change the
+	 * W bit of the spte which can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+			    u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by TLB lazily flushed.
+	 *
+	 * Need not check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	 if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only the spte write-protected
+	 * by dirty-log can be fast fixed.
+	 */
+	if (!spte_is_locklessly_modifiable(spte))
+		goto exit;
+
+	/*
+	 * Currently, fast page fault only works for direct mapping since
+	 * the gfn is not stable for indirect shadow page.
+	 * See Documentation/virtual/kvm/locking.txt to get more detail.
+	 */
+	ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+			      spte, ret);
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, v, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, gpa, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
+	bool flush = false;
 
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			      !is_last_spte(pt[i], sp->role.level))
 				continue;
 
-			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i]);
-				--kvm->stat.lpages;
-				continue;
-			}
-
-			/* avoid RMW */
-			if (is_writable_pte(pt[i]))
-				mmu_spte_update(&pt[i],
-						pt[i] & ~PT_WRITABLE_MASK);
+			spte_write_protect(kvm, &pt[i], &flush, false);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
-	struct kvm *kvm_freed = NULL;
 	int nr_to_scan = sc->nr_to_scan;
 
 	if (nr_to_scan == 0)
@@ -3957,22 +4112,35 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		int idx;
 		LIST_HEAD(invalid_list);
 
+		/*
+		 * Never scan more than sc->nr_to_scan VM instances.
+		 * Will not hit this condition practically since we do not try
+		 * to shrink more than one VM and it is very unlikely to see
+		 * !n_used_mmu_pages so many times.
+		 */
+		if (!nr_to_scan--)
+			break;
+		/*
+		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+		 * here. We may skip a VM instance errorneosly, but we do not
+		 * want to shrink a VM that only started to populate its MMU
+		 * anyway.
+		 */
+		if (!kvm->arch.n_used_mmu_pages)
+			continue;
+
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		if (!kvm_freed && nr_to_scan > 0 &&
-		    kvm->arch.n_used_mmu_pages > 0) {
-			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-							    &invalid_list);
-			kvm_freed = kvm;
-		}
-		nr_to_scan--;
 
+		kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
+
+		list_move_tail(&kvm->vm_list, &vm_list);
+		break;
 	}
-	if (kvm_freed)
-		list_move_tail(&kvm_freed->vm_list, &vm_list);
 
 	raw_spin_unlock(&kvm_lock);
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
  */
 TRACE_EVENT(
 	kvm_mmu_pagetable_walk,
-	TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
-	TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+	TP_PROTO(u64 addr, u32 pferr),
+	TP_ARGS(addr, pferr),
 
 	TP_STRUCT__entry(
 		__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
 
 	TP_fast_assign(
 		__entry->addr = addr;
-		__entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
-		                 | (!!fetch_fault << 4);
+		__entry->pferr = pferr;
 	),
 
 	TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
 	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
 		  __entry->access)
 );
+
+#define __spte_satisfied(__spte)				\
+	(__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+	fast_page_fault,
+	TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+		 u64 *sptep, u64 old_spte, bool retry),
+	TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(gva_t, gva)
+		__field(u32, error_code)
+		__field(u64 *, sptep)
+		__field(u64, old_spte)
+		__field(u64, new_spte)
+		__field(bool, retry)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->gva = gva;
+		__entry->error_code = error_code;
+		__entry->sptep = sptep;
+		__entry->old_spte = old_spte;
+		__entry->new_spte = *sptep;
+		__entry->retry = retry;
+	),
+
+	TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+		  " new %llx spurious %d fixed %d", __entry->vcpu_id,
+		  __entry->gva, __print_flags(__entry->error_code, "|",
+		  kvm_mmu_trace_pferr_flags), __entry->sptep,
+		  __entry->old_spte, __entry->new_spte,
+		  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+	)
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f970937ef1..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	const int fetch_fault = access & PFERR_FETCH_MASK;
 	u16 errcode = 0;
 
-	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
-				     fetch_fault);
+	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	eperm = false;
 	walker->level = mmu->root_level;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2e88438ffd83..9b7ec1150ab0 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
 
 static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
 {
-	if (idx < X86_PMC_IDX_FIXED)
+	if (idx < INTEL_PMC_IDX_FIXED)
 		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
 	else
-		return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
+		return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
 }
 
 void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
@@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
 	if (pmc_is_gp(pmc))
 		reprogram_gp_counter(pmc, pmc->eventsel);
 	else {
-		int fidx = idx - X86_PMC_IDX_FIXED;
+		int fidx = idx - INTEL_PMC_IDX_FIXED;
 		reprogram_fixed_counter(pmc,
 				fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
 	}
@@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 		return;
 
 	pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
-			X86_PMC_MAX_GENERIC);
+			INTEL_PMC_MAX_GENERIC);
 	pmu->counter_bitmask[KVM_PMC_GP] =
 		((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
 	bitmap_len = (entry->eax >> 24) & 0xff;
@@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
 		pmu->nr_arch_fixed_counters = 0;
 	} else {
 		pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
-				X86_PMC_MAX_FIXED);
+				INTEL_PMC_MAX_FIXED);
 		pmu->counter_bitmask[KVM_PMC_FIXED] =
 			((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
 	}
 
 	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
-		(((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED);
+		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
 	pmu->global_ctrl_mask = ~pmu->global_ctrl;
 }
 
@@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
 	struct kvm_pmu *pmu = &vcpu->arch.pmu;
 
 	memset(pmu, 0, sizeof(*pmu));
-	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
 		pmu->gp_counters[i].type = KVM_PMC_GP;
 		pmu->gp_counters[i].vcpu = vcpu;
 		pmu->gp_counters[i].idx = i;
 	}
-	for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
+	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
 		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
 		pmu->fixed_counters[i].vcpu = vcpu;
-		pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
+		pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
 	}
 	init_irq_work(&pmu->irq_work, trigger_pmi);
 	kvm_pmu_cpuid_update(vcpu);
@@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 	int i;
 
 	irq_work_sync(&pmu->irq_work);
-	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
 		struct kvm_pmc *pmc = &pmu->gp_counters[i];
 		stop_counter(pmc);
 		pmc->counter = pmc->eventsel = 0;
 	}
 
-	for (i = 0; i < X86_PMC_MAX_FIXED; i++)
+	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
 		stop_counter(&pmu->fixed_counters[i]);
 
 	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f75af406b268..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
-			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
-					__func__, data);
+			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+				    __func__, data);
 			break;
 		}
 		if (data & DEBUGCTL_RESERVED_BITS)
@@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 	case MSR_VM_CR:
 		return svm_set_vm_cr(vcpu, data);
 	case MSR_VM_IGNNE:
-		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
 		break;
 	default:
 		return kvm_set_msr_common(vcpu, ecx, data);
@@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
 	return false;
 }
 
+static bool svm_invpcid_supported(void)
+{
+	return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
 	return true;
@@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cpuid_update = svm_cpuid_update,
 
 	.rdtscp_supported = svm_rdtscp_supported,
+	.invpcid_supported = svm_invpcid_supported,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d2641f14c..a71faf727ff3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
 		  __entry->coalesced ? " (coalesced)" : "")
 );
 
+TRACE_EVENT(kvm_eoi,
+	    TP_PROTO(struct kvm_lapic *apic, int vector),
+	    TP_ARGS(apic, vector),
+
+	TP_STRUCT__entry(
+		__field(	__u32,		apicid		)
+		__field(	int,		vector		)
+	),
+
+	TP_fast_assign(
+		__entry->apicid		= apic->vcpu->vcpu_id;
+		__entry->vector		= vector;
+	),
+
+	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+	    TP_PROTO(struct kvm_lapic *apic, int vector),
+	    TP_ARGS(apic, vector),
+
+	TP_STRUCT__entry(
+		__field(	__u32,		apicid		)
+		__field(	int,		vector		)
+	),
+
+	TP_fast_assign(
+		__entry->apicid		= apic->vcpu->vcpu_id;
+		__entry->vector		= vector;
+	),
+
+	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
 /*
  * Tracepoint for nested VMRUN
  */
@@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,
 		  __entry->rip, __entry->slb)
 );
 
-#define __print_insn(insn, ilen) ({		                 \
-	int i;							 \
-	const char *ret = p->buffer + p->len;			 \
-								 \
-	for (i = 0; i < ilen; ++i)				 \
-		trace_seq_printf(p, " %02x", insn[i]);		 \
-	trace_seq_printf(p, "%c", 0);				 \
-	ret;							 \
-	})
-
 #define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
 #define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
 #define KVM_EMUL_INSN_F_CS_D   (1 << 2)
@@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,
 
 	TP_printk("%x:%llx:%s (%s)%s",
 		  __entry->csbase, __entry->rip,
-		  __print_insn(__entry->insn, __entry->len),
+		  __print_hex(__entry->insn, __entry->len),
 		  __print_symbolic(__entry->flags,
 				   kvm_trace_symbol_emul_flags),
 		  __entry->failed ? " failed" : ""
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32eb58866292..c00f03de1b79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
 			enable_unrestricted_guest, bool, S_IRUGO);
 
-static bool __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 static bool __read_mostly vmm_exclusive = 1;
@@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
 	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
 }
 
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+	return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
 static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
 		SECONDARY_EXEC_RDTSCP;
 }
 
+static inline bool cpu_has_vmx_invpcid(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
 static inline bool cpu_has_virtual_nmis(void)
 {
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1470,13 +1488,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 		loadsegment(ds, vmx->host_state.ds_sel);
 		loadsegment(es, vmx->host_state.es_sel);
 	}
-#else
-	/*
-	 * The sysexit path does not restore ds/es, so we must set them to
-	 * a reasonable value ourselves.
-	 */
-	loadsegment(ds, __USER_DS);
-	loadsegment(es, __USER_DS);
 #endif
 	reload_tss();
 #ifdef CONFIG_X86_64
@@ -1739,6 +1750,11 @@ static bool vmx_rdtscp_supported(void)
 	return cpu_has_vmx_rdtscp();
 }
 
+static bool vmx_invpcid_supported(void)
+{
+	return cpu_has_vmx_invpcid() && enable_ept;
+}
+
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -2458,7 +2474,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_ENABLE_EPT |
 			SECONDARY_EXEC_UNRESTRICTED_GUEST |
 			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
-			SECONDARY_EXEC_RDTSCP;
+			SECONDARY_EXEC_RDTSCP |
+			SECONDARY_EXEC_ENABLE_INVPCID;
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
@@ -2645,8 +2662,12 @@ static __init int hardware_setup(void)
 	    !cpu_has_vmx_ept_4levels()) {
 		enable_ept = 0;
 		enable_unrestricted_guest = 0;
+		enable_ept_ad_bits = 0;
 	}
 
+	if (!cpu_has_vmx_ept_ad_bits())
+		enable_ept_ad_bits = 0;
+
 	if (!cpu_has_vmx_unrestricted_guest())
 		enable_unrestricted_guest = 0;
 
@@ -2770,6 +2791,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_segment var;
 
 	if (enable_unrestricted_guest)
 		return;
@@ -2813,20 +2835,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	if (emulate_invalid_guest_state)
 		goto continue_rmode;
 
-	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
-	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
-	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+	vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
+
+	vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
 
-	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
-	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
-		vmcs_writel(GUEST_CS_BASE, 0xf0000);
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+	vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
 
-	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
-	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
-	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
-	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+	vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
+
+	vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
+
+	vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
+	vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
 
 continue_rmode:
 	kvm_mmu_reset_context(vcpu);
@@ -3027,6 +3052,8 @@ static u64 construct_eptp(unsigned long root_hpa)
 	/* TODO write the value reading from MSR */
 	eptp = VMX_EPT_DEFAULT_MT |
 		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+	if (enable_ept_ad_bits)
+		eptp |= VMX_EPT_AD_ENABLE_BIT;
 	eptp |= (root_hpa & PAGE_MASK);
 
 	return eptp;
@@ -3153,11 +3180,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
 
 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
+	 * fail; use the cache instead.
+	 */
+	if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
+		return vmx->cpl;
+	}
+
 	if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
 		__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-		to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+		vmx->cpl = __vmx_get_cpl(vcpu);
 	}
-	return to_vmx(vcpu)->cpl;
+
+	return vmx->cpl;
 }
 
 
@@ -3165,7 +3203,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
 	u32 ar;
 
-	if (var->unusable)
+	if (var->unusable || !var->present)
 		ar = 1 << 16;
 	else {
 		ar = var->type & 15;
@@ -3177,8 +3215,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
 		ar |= (var->db & 1) << 14;
 		ar |= (var->g & 1) << 15;
 	}
-	if (ar == 0) /* a 0 value means unusable */
-		ar = AR_UNUSABLE_MASK;
 
 	return ar;
 }
@@ -3229,6 +3265,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 
 	vmcs_write32(sf->ar_bytes, ar);
 	__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+	/*
+	 * Fix segments for real mode guest in hosts that don't have
+	 * "unrestricted_mode" or it was disabled.
+	 * This is done to allow migration of the guests from hosts with
+	 * unrestricted guest like Westmere to older host that don't have
+	 * unrestricted guest like Nehelem.
+	 */
+	if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+		switch (seg) {
+		case VCPU_SREG_CS:
+			vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+			vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+			if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+				vmcs_writel(GUEST_CS_BASE, 0xf0000);
+			vmcs_write16(GUEST_CS_SELECTOR,
+				     vmcs_readl(GUEST_CS_BASE) >> 4);
+			break;
+		case VCPU_SREG_ES:
+			fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+			break;
+		case VCPU_SREG_DS:
+			fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+			break;
+		case VCPU_SREG_GS:
+			fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+			break;
+		case VCPU_SREG_FS:
+			fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+			break;
+		case VCPU_SREG_SS:
+			vmcs_write16(GUEST_SS_SELECTOR,
+				     vmcs_readl(GUEST_SS_BASE) >> 4);
+			vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+			vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+			break;
+		}
+	}
 }
 
 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3731,6 +3805,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 		enable_unrestricted_guest = 0;
+		/* Enable INVPCID for non-ept guests may cause performance regression. */
+		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
 	}
 	if (!enable_unrestricted_guest)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4489,7 +4565,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		break;
 	}
 	vcpu->run->exit_reason = 0;
-	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
 	       (int)(exit_qualification >> 4) & 3, cr);
 	return 0;
 }
@@ -4769,6 +4845,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
 	gpa_t gpa;
+	u32 error_code;
 	int gla_validity;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4793,7 +4870,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+	/* It is a write fault? */
+	error_code = exit_qualification & (1U << 1);
+	/* ept page table is present? */
+	error_code |= (exit_qualification >> 3) & 0x1;
+
+	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4908,15 +4991,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	int ret = 1;
 	u32 cpu_exec_ctrl;
 	bool intr_window_requested;
+	unsigned count = 130;
 
 	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
-	while (!guest_state_valid(vcpu)) {
-		if (intr_window_requested
-		    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+	while (!guest_state_valid(vcpu) && count-- != 0) {
+		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
 			return handle_interrupt_window(&vmx->vcpu);
 
+		if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+			return 1;
+
 		err = emulate_instruction(vcpu, 0);
 
 		if (err == EMULATE_DO_MMIO) {
@@ -4924,8 +5010,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			goto out;
 		}
 
-		if (err != EMULATE_DONE)
+		if (err != EMULATE_DONE) {
+			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+			vcpu->run->internal.ndata = 0;
 			return 0;
+		}
 
 		if (signal_pending(current))
 			goto out;
@@ -4933,7 +5023,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			schedule();
 	}
 
-	vmx->emulation_required = 0;
+	vmx->emulation_required = !guest_state_valid(vcpu);
 out:
 	return ret;
 }
@@ -6273,6 +6363,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
+#ifndef CONFIG_X86_64
+	/*
+	 * The sysexit path does not restore ds/es, so we must set them to
+	 * a reasonable value ourselves.
+	 *
+	 * We can't defer this to vmx_load_host_state() since that function
+	 * may be executed in interrupt context, which saves and restore segments
+	 * around it, nullifying its effect.
+	 */
+	loadsegment(ds, __USER_DS);
+	loadsegment(es, __USER_DS);
+#endif
+
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
 				  | (1 << VCPU_EXREG_RFLAGS)
 				  | (1 << VCPU_EXREG_CPL)
@@ -6467,6 +6570,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 			}
 		}
 	}
+
+	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	/* Exposing INVPCID only when PCID is exposed */
+	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+	if (vmx_invpcid_supported() &&
+	    best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+	    guest_cpuid_has_pcid(vcpu)) {
+		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     exec_control);
+	} else {
+		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     exec_control);
+		if (best)
+			best->ecx &= ~bit(X86_FEATURE_INVPCID);
+	}
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7201,6 +7321,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cpuid_update = vmx_cpuid_update,
 
 	.rdtscp_supported = vmx_rdtscp_supported,
+	.invpcid_supported = vmx_invpcid_supported,
 
 	.set_supported_cpuid = vmx_set_supported_cpuid,
 
@@ -7230,23 +7351,21 @@ static int __init vmx_init(void)
 	if (!vmx_io_bitmap_a)
 		return -ENOMEM;
 
+	r = -ENOMEM;
+
 	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_io_bitmap_b) {
-		r = -ENOMEM;
+	if (!vmx_io_bitmap_b)
 		goto out;
-	}
 
 	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_legacy) {
-		r = -ENOMEM;
+	if (!vmx_msr_bitmap_legacy)
 		goto out1;
-	}
+
 
 	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap_longmode) {
-		r = -ENOMEM;
+	if (!vmx_msr_bitmap_longmode)
 		goto out2;
-	}
+
 
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7275,8 +7394,10 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
 	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-				VMX_EPT_EXECUTABLE_MASK);
+		kvm_mmu_set_mask_ptes(0ull,
+			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+			0ull, VMX_EPT_EXECUTABLE_MASK);
 		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index be6d54929fa7..148ed666e311 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 			return 1;
 	}
 
+	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+		return 1;
+
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 				   kvm_read_cr3(vcpu)))
 		return 1;
 
+	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+		if (!guest_cpuid_has_pcid(vcpu))
+			return 1;
+
+		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+			return 1;
+	}
+
 	if (kvm_x86_ops->set_cr4(vcpu, cr4))
 		return 1;
 
-	if ((cr4 ^ old_cr4) & pdptr_bits)
+	if (((cr4 ^ old_cr4) & pdptr_bits) ||
+	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 		kvm_mmu_reset_context(vcpu);
 
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS)
-			return 1;
+		if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
+				return 1;
+		} else
+			if (cr3 & CR3_L_MODE_RESERVED_BITS)
+				return 1;
 	} else {
 		if (is_pae(vcpu)) {
 			if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -789,12 +806,13 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	9
+#define KVM_SAVE_MSRS_BEGIN	10
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+	MSR_KVM_PV_EOI_EN,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -907,6 +925,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 	 */
 	getboottime(&boot);
 
+	if (kvm->arch.kvmclock_offset) {
+		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
+		boot = timespec_sub(boot, ts);
+	}
 	wc.sec = boot.tv_sec;
 	wc.nsec = boot.tv_nsec;
 	wc.version = version;
@@ -1437,8 +1459,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	}
 	default:
-		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-			  "data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+			    "data 0x%llx\n", msr, data);
 		return 1;
 	}
 	return 0;
@@ -1470,8 +1492,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
 	default:
-		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-			  "data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+			    "data 0x%llx\n", msr, data);
 		return 1;
 	}
 
@@ -1551,15 +1573,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
-			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
-				data);
+			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+				    data);
 			return 1;
 		}
 		break;
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		if (data != 0) {
-			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
-				"0x%llx\n", data);
+			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+				    "0x%llx\n", data);
 			return 1;
 		}
 		break;
@@ -1574,8 +1596,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			   thus reserved and should throw a #GP */
 			return 1;
 		}
-		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-			__func__, data);
+		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+			    __func__, data);
 		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1675,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
 		break;
+	case MSR_KVM_PV_EOI_EN:
+		if (kvm_lapic_enable_pv_eoi(vcpu, data))
+			return 1;
+		break;
 
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1697,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_EVNTSEL2:
 	case MSR_K7_EVNTSEL3:
 		if (data != 0)
-			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-				"0x%x data 0x%llx\n", msr, data);
+			vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+				    "0x%x data 0x%llx\n", msr, data);
 		break;
 	/* at least RHEL 4 unconditionally writes to the perfctr registers,
 	 * so we ignore writes to make it happy.
@@ -1681,8 +1707,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_PERFCTR1:
 	case MSR_K7_PERFCTR2:
 	case MSR_K7_PERFCTR3:
-		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-			"0x%x data 0x%llx\n", msr, data);
+		vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+			    "0x%x data 0x%llx\n", msr, data);
 		break;
 	case MSR_P6_PERFCTR0:
 	case MSR_P6_PERFCTR1:
@@ -1693,8 +1719,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			return kvm_pmu_set_msr(vcpu, msr, data);
 
 		if (pr || data != 0)
-			pr_unimpl(vcpu, "disabled perfctr wrmsr: "
-				"0x%x data 0x%llx\n", msr, data);
+			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+				    "0x%x data 0x%llx\n", msr, data);
 		break;
 	case MSR_K7_CLK_CTL:
 		/*
@@ -1720,7 +1746,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		/* Drop writes to this legacy MSR -- see rdmsr
 		 * counterpart for further detail.
 		 */
-		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
+		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
 	case MSR_AMD64_OSVW_ID_LENGTH:
 		if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1764,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr, data);
 		if (!ignore_msrs) {
-			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
-				msr, data);
+			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+				    msr, data);
 			return 1;
 		} else {
-			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
-				msr, data);
+			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+				    msr, data);
 			break;
 		}
 	}
@@ -1846,7 +1872,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = kvm->arch.hv_hypercall;
 		break;
 	default:
-		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 	}
 
@@ -1877,7 +1903,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.hv_vapic;
 		break;
 	default:
-		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 	}
 	*pdata = data;
@@ -1974,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_STEAL_TIME:
 		data = vcpu->arch.st.msr_val;
 		break;
+	case MSR_KVM_PV_EOI_EN:
+		data = vcpu->arch.pv_eoi.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2030,10 +2059,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
 		if (!ignore_msrs) {
-			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 			return 1;
 		} else {
-			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
 			data = 0;
 		}
 		break;
@@ -4116,7 +4145,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
 		value = kvm_get_cr8(vcpu);
 		break;
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 		return 0;
 	}
 
@@ -4145,7 +4174,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 		res = kvm_set_cr8(vcpu, val);
 		break;
 	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 		res = -1;
 	}
 
@@ -4297,26 +4326,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
 	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 }
 
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 			       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
-	struct kvm_cpuid_entry2 *cpuid = NULL;
-
-	if (eax && ecx)
-		cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
-					    *eax, *ecx);
-
-	if (cpuid) {
-		*eax = cpuid->eax;
-		*ecx = cpuid->ecx;
-		if (ebx)
-			*ebx = cpuid->ebx;
-		if (edx)
-			*edx = cpuid->edx;
-		return true;
-	}
-
-	return false;
+	kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
 }
 
 static struct x86_emulate_ops emulate_ops = {
@@ -5296,8 +5309,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	r = kvm_mmu_reload(vcpu);
 	if (unlikely(r)) {
-		kvm_x86_ops->cancel_injection(vcpu);
-		goto out;
+		goto cancel_injection;
 	}
 
 	preempt_disable();
@@ -5322,9 +5334,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
-		kvm_x86_ops->cancel_injection(vcpu);
 		r = 1;
-		goto out;
+		goto cancel_injection;
 	}
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5388,9 +5399,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(vcpu->arch.tsc_always_catchup))
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
-	kvm_lapic_sync_from_vapic(vcpu);
+	if (vcpu->arch.apic_attention)
+		kvm_lapic_sync_from_vapic(vcpu);
 
 	r = kvm_x86_ops->handle_exit(vcpu);
+	return r;
+
+cancel_injection:
+	kvm_x86_ops->cancel_injection(vcpu);
+	if (unlikely(vcpu->arch.apic_attention))
+		kvm_lapic_sync_from_vapic(vcpu);
 out:
 	return r;
 }
@@ -6304,7 +6322,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
 		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
-			vfree(free->arch.lpage_info[i]);
+			kvm_kvfree(free->arch.lpage_info[i]);
 			free->arch.lpage_info[i] = NULL;
 		}
 	}
@@ -6323,7 +6341,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 				      slot->base_gfn, level) + 1;
 
 		slot->arch.lpage_info[i] =
-			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+			kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
 		if (!slot->arch.lpage_info[i])
 			goto out_free;
 
@@ -6350,7 +6368,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 
 out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		vfree(slot->arch.lpage_info[i]);
+		kvm_kvfree(slot->arch.lpage_info[i]);
 		slot->arch.lpage_info[i] = NULL;
 	}
 	return -ENOMEM;
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
index a311cc59b65d..8d6ef78b5d01 100644
--- a/arch/x86/lib/msr-reg-export.c
+++ b/arch/x86/lib/msr-reg-export.c
@@ -1,5 +1,5 @@
 #include <linux/module.h>
 #include <asm/msr.h>
 
-EXPORT_SYMBOL(native_rdmsr_safe_regs);
-EXPORT_SYMBOL(native_wrmsr_safe_regs);
+EXPORT_SYMBOL(rdmsr_safe_regs);
+EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 69fa10623f21..f6d13eefad10 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -6,13 +6,13 @@
 
 #ifdef CONFIG_X86_64
 /*
- * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
+ * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
  *
  * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
  *
  */
 .macro op_safe_regs op
-ENTRY(native_\op\()_safe_regs)
+ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
 	pushq_cfi %rbx
 	pushq_cfi %rbp
@@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)
 
 	_ASM_EXTABLE(1b, 3b)
 	CFI_ENDPROC
-ENDPROC(native_\op\()_safe_regs)
+ENDPROC(\op\()_safe_regs)
 .endm
 
 #else /* X86_32 */
 
 .macro op_safe_regs op
-ENTRY(native_\op\()_safe_regs)
+ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
 	pushl_cfi %ebx
 	pushl_cfi %ebp
@@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)
 
 	_ASM_EXTABLE(1b, 3b)
 	CFI_ENDPROC
-ENDPROC(native_\op\()_safe_regs)
+ENDPROC(\op\()_safe_regs)
 .endm
 
 #endif
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f6679a7fb8ca..b91e48512425 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 }
 
 /*
- * search for a shareable pmd page for hugetlb.
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+static pte_t *
+huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
 	struct vm_area_struct *vma = find_vma(mm, addr);
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	struct vm_area_struct *svma;
 	unsigned long saddr;
 	pte_t *spte = NULL;
+	pte_t *pte;
 
 	if (!vma_shareable(vma, addr))
-		return;
+		return (pte_t *)pmd_alloc(mm, pud, addr);
 
 	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
+	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	mutex_unlock(&mapping->i_mmap_mutex);
+	return pte;
 }
 
 /*
@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 		} else {
 			BUG_ON(sz != PMD_SIZE);
 			if (pud_none(*pud))
-				huge_pmd_share(mm, addr, pud);
-			pte = (pte_t *) pmd_alloc(mm, pud, addr);
+				pte = huge_pmd_share(mm, addr, pud);
+			else
+				pte = (pte_t *)pmd_alloc(mm, pud, addr);
 		}
 	}
 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bc4e9d84157f..e0e6990723e9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -385,7 +385,7 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
+void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
 	/*
 	 * end could be not aligned, and We can not align that,
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 575d86f85ce4..4f04db150027 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)
 }
 #endif /* CONFIG_HIGHMEM */
 
-void __init native_pagetable_setup_start(pgd_t *base)
+void __init native_pagetable_init(void)
 {
 	unsigned long pfn, va;
-	pgd_t *pgd;
+	pgd_t *pgd, *base = swapper_pg_dir;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -475,10 +475,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
 		pte_clear(NULL, va, pte);
 	}
 	paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
-}
-
-void __init native_pagetable_setup_done(pgd_t *base)
-{
+	paging_init();
 }
 
 /*
@@ -493,7 +490,7 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * If we're booting paravirtualized under a hypervisor, then there are
  * more options: we may already be running PAE, and the pagetable may
  * or may not be based in swapper_pg_dir.  In any case,
- * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * paravirt_pagetable_init() will set up swapper_pg_dir
  * appropriately for the rest of the initialization to work.
  *
  * In general, pagetable_init() assumes that the pagetable may already
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 4599c3e8bcb6..4ddf497ca65b 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;}
 #endif
 
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
-void __init
+int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
 	u64 start, end;
 	int node, pxm;
 
 	if (srat_disabled())
-		return;
+		return -1;
 	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
 		bad_srat();
-		return;
+		return -1;
 	}
 	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
-		return;
+		return -1;
 
 	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
-		return;
+		return -1;
 	start = ma->base_address;
 	end = start + ma->length;
 	pxm = ma->proximity_domain;
@@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
 		bad_srat();
-		return;
+		return -1;
 	}
 
 	if (numa_add_memblk(node, start, end) < 0) {
 		bad_srat();
-		return;
+		return -1;
 	}
 
 	node_set(node, numa_nodes_parsed);
@@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
 	       node, pxm,
 	       (unsigned long long) start, (unsigned long long) end - 1);
+	return 0;
 }
 
 void __init acpi_numa_arch_fixup(void) {}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..613cd83e8c0c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
+#include <linux/debugfs.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
 			= { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
  *
  *	More scalable flush, from Andi Kleen
  *
- *	To avoid global state use 8 different call vectors.
- *	Each CPU uses a specific vector to trigger flushes on other
- *	CPUs. Depending on the received vector the target CPUs look into
- *	the right array slot for the flush data.
- *
- *	With more than 8 CPUs they are hashed to the 8 available
- *	vectors. The limited global vector space forces us to this right now.
- *	In future when interrupts are split into per CPU domains this could be
- *	fixed, at the cost of triggering multiple IPIs in some cases.
+ *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
-union smp_flush_state {
-	struct {
-		struct mm_struct *flush_mm;
-		unsigned long flush_va;
-		raw_spinlock_t tlbstate_lock;
-		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
-	};
-	char pad[INTERNODE_CACHE_BYTES];
-} ____cacheline_internodealigned_in_smp;
-
-/* State is put into the per CPU data section, but padded
-   to a full cache line because other CPUs can access it and we don't
-   want false sharing in the per cpu data segment. */
-static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
-
-static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+struct flush_tlb_info {
+	struct mm_struct *flush_mm;
+	unsigned long flush_start;
+	unsigned long flush_end;
+};
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
@@ -72,28 +54,25 @@ void leave_mm(int cpu)
 EXPORT_SYMBOL_GPL(leave_mm);
 
 /*
- *
  * The flush IPI assumes that a thread switch happens in this order:
  * [cpu0: the cpu that switches]
  * 1) switch_mm() either 1a) or 1b)
  * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *	Stop ipi delivery for the old mm. This is not synchronized with
- *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *	for the wrong mm, and in the worst case we perform a superfluous
- *	tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *	was in lazy tlb mode.
- * 1a3) update cpu active_mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
+ *	if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
  *	Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
  *	Now the other cpus will send tlb flush ipis.
  * 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong
+ *	mm, and in the worst case we perform a superfluous tlb flush.
  * 1b) thread switch without mm change
- *	cpu active_mm is correct, cpu0 already handles
- *	flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
+ *	cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
  * 1b2) test_and_set the cpu bit in cpu_vm_mask.
  *	Atomically set the bit [other cpus will start sending flush ipis],
  *	and test the bit.
@@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);
  *   runs in kernel space, the cpu could load tlb entries for user space
  *   pages.
  *
- * The good news is that cpu mmu_state is local to each cpu, no
+ * The good news is that cpu_tlbstate is local to each cpu, no
  * write/read ordering problems.
  */
 
 /*
- * TLB flush IPI:
- *
+ * TLB flush funcation:
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
  * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-/*
- * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
- * but still used for documentation purpose but the usage is slightly
- * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
- * entry calls in with the first parameter in %eax.  Maybe define
- * intrlinkage?
  */
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void smp_invalidate_interrupt(struct pt_regs *regs)
+static void flush_tlb_func(void *info)
 {
-	unsigned int cpu;
-	unsigned int sender;
-	union smp_flush_state *f;
-
-	cpu = smp_processor_id();
-	/*
-	 * orig_rax contains the negated interrupt vector.
-	 * Use that to determine where the sender put the data.
-	 */
-	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
-	f = &flush_state[sender];
-
-	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
-		goto out;
-		/*
-		 * This was a BUG() but until someone can quote me the
-		 * line from the intel manual that guarantees an IPI to
-		 * multiple CPUs is retried _only_ on the erroring CPUs
-		 * its staying as a return
-		 *
-		 * BUG();
-		 */
-
-	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
-		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-			if (f->flush_va == TLB_FLUSH_ALL)
-				local_flush_tlb();
-			else
-				__flush_tlb_one(f->flush_va);
-		} else
-			leave_mm(cpu);
-	}
-out:
-	ack_APIC_irq();
-	smp_mb__before_clear_bit();
-	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
-	smp_mb__after_clear_bit();
-	inc_irq_stat(irq_tlb_count);
-}
+	struct flush_tlb_info *f = info;
 
-static void flush_tlb_others_ipi(const struct cpumask *cpumask,
-				 struct mm_struct *mm, unsigned long va)
-{
-	unsigned int sender;
-	union smp_flush_state *f;
-
-	/* Caller has disabled preemption */
-	sender = this_cpu_read(tlb_vector_offset);
-	f = &flush_state[sender];
-
-	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
-		raw_spin_lock(&f->tlbstate_lock);
-
-	f->flush_mm = mm;
-	f->flush_va = va;
-	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
-		/*
-		 * We have to send the IPI only to
-		 * CPUs affected.
-		 */
-		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
-			      INVALIDATE_TLB_VECTOR_START + sender);
-
-		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
-			cpu_relax();
-	}
+	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+		return;
+
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+		if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg)
+			local_flush_tlb();
+		else if (!f->flush_end)
+			__flush_tlb_single(f->flush_start);
+		else {
+			unsigned long addr;
+			addr = f->flush_start;
+			while (addr < f->flush_end) {
+				__flush_tlb_single(addr);
+				addr += PAGE_SIZE;
+			}
+		}
+	} else
+		leave_mm(smp_processor_id());
 
-	f->flush_mm = NULL;
-	f->flush_va = 0;
-	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
-		raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
+	struct flush_tlb_info info;
+	info.flush_mm = mm;
+	info.flush_start = start;
+	info.flush_end = end;
+
 	if (is_uv_system()) {
 		unsigned int cpu;
 
 		cpu = smp_processor_id();
-		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
 		if (cpumask)
-			flush_tlb_others_ipi(cpumask, mm, va);
+			smp_call_function_many(cpumask, flush_tlb_func,
+								&info, 1);
 		return;
 	}
-	flush_tlb_others_ipi(cpumask, mm, va);
+	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
 }
 
-static void __cpuinit calculate_tlb_offset(void)
-{
-	int cpu, node, nr_node_vecs, idx = 0;
-	/*
-	 * we are changing tlb_vector_offset for each CPU in runtime, but this
-	 * will not cause inconsistency, as the write is atomic under X86. we
-	 * might see more lock contentions in a short time, but after all CPU's
-	 * tlb_vector_offset are changed, everything should go normal
-	 *
-	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
-	 * waste some vectors.
-	 **/
-	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
-		nr_node_vecs = 1;
-	else
-		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
-
-	for_each_online_node(node) {
-		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
-			nr_node_vecs;
-		int cpu_offset = 0;
-		for_each_cpu(cpu, cpumask_of_node(node)) {
-			per_cpu(tlb_vector_offset, cpu) = node_offset +
-				cpu_offset;
-			cpu_offset++;
-			cpu_offset = cpu_offset % nr_node_vecs;
-		}
-		idx++;
-	}
-}
-
-static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
-		unsigned long action, void *hcpu)
-{
-	switch (action & 0xf) {
-	case CPU_ONLINE:
-	case CPU_DEAD:
-		calculate_tlb_offset();
-	}
-	return NOTIFY_OK;
-}
-
-static int __cpuinit init_smp_flush(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
-		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
-
-	calculate_tlb_offset();
-	hotcpu_notifier(tlb_cpuhp_notify, 0);
-	return 0;
-}
-core_initcall(init_smp_flush);
-
 void flush_tlb_current_task(void)
 {
 	struct mm_struct *mm = current->mm;
@@ -282,27 +149,91 @@ void flush_tlb_current_task(void)
 
 	local_flush_tlb();
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-void flush_tlb_mm(struct mm_struct *mm)
+/*
+ * It can find out the THP large page, or
+ * HUGETLB page in tlb_flush when THP disabled
+ */
+static inline unsigned long has_large_page(struct mm_struct *mm,
+				 unsigned long start, unsigned long end)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long addr = ALIGN(start, HPAGE_SIZE);
+	for (; addr < end; addr += HPAGE_SIZE) {
+		pgd = pgd_offset(mm, addr);
+		if (likely(!pgd_none(*pgd))) {
+			pud = pud_offset(pgd, addr);
+			if (likely(!pud_none(*pud))) {
+				pmd = pmd_offset(pud, addr);
+				if (likely(!pmd_none(*pmd)))
+					if (pmd_large(*pmd))
+						return addr;
+			}
+		}
+	}
+	return 0;
+}
+
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned long vmflag)
 {
+	unsigned long addr;
+	unsigned act_entries, tlb_entries = 0;
+
 	preempt_disable();
+	if (current->active_mm != mm)
+		goto flush_all;
 
-	if (current->active_mm == mm) {
-		if (current->mm)
+	if (!current->mm) {
+		leave_mm(smp_processor_id());
+		goto flush_all;
+	}
+
+	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
+					|| vmflag == VM_HUGETLB) {
+		local_flush_tlb();
+		goto flush_all;
+	}
+
+	/* In modern CPU, last level tlb used for both data/ins */
+	if (vmflag & VM_EXEC)
+		tlb_entries = tlb_lli_4k[ENTRIES];
+	else
+		tlb_entries = tlb_lld_4k[ENTRIES];
+	/* Assume all of TLB entries was occupied by this task */
+	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+
+	/* tlb_flushall_shift is on balance point, details in commit log */
+	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+		local_flush_tlb();
+	else {
+		if (has_large_page(mm, start, end)) {
 			local_flush_tlb();
-		else
-			leave_mm(smp_processor_id());
+			goto flush_all;
+		}
+		/* flush range by one by one 'invlpg' */
+		for (addr = start; addr < end;	addr += PAGE_SIZE)
+			__flush_tlb_single(addr);
+
+		if (cpumask_any_but(mm_cpumask(mm),
+				smp_processor_id()) < nr_cpu_ids)
+			flush_tlb_others(mm_cpumask(mm), mm, start, end);
+		preempt_enable();
+		return;
 	}
-	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
 
+flush_all:
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 {
 	struct mm_struct *mm = vma->vm_mm;
 
@@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 
 	if (current->active_mm == mm) {
 		if (current->mm)
-			__flush_tlb_one(va);
+			__flush_tlb_one(start);
 		else
 			leave_mm(smp_processor_id());
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, va);
+		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
 
 	preempt_enable();
 }
@@ -332,3 +263,83 @@ void flush_tlb_all(void)
 {
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
+
+static void do_kernel_range_flush(void *info)
+{
+	struct flush_tlb_info *f = info;
+	unsigned long addr;
+
+	/* flush range by one by one 'invlpg' */
+	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+		__flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned act_entries;
+	struct flush_tlb_info info;
+
+	/* In modern CPU, last level tlb used for both data/ins */
+	act_entries = tlb_lld_4k[ENTRIES];
+
+	/* Balance as user space task's flush, a bit conservative */
+	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
+		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else {
+		info.flush_start = start;
+		info.flush_end = end;
+		on_each_cpu(do_kernel_range_flush, &info, 1);
+	}
+}
+
+#ifdef CONFIG_DEBUG_TLBFLUSH
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+			     size_t count, loff_t *ppos)
+{
+	char buf[32];
+	unsigned int len;
+
+	len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	char buf[32];
+	ssize_t len;
+	s8 shift;
+
+	len = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, user_buf, len))
+		return -EFAULT;
+
+	buf[len] = '\0';
+	if (kstrtos8(buf, 0, &shift))
+		return -EINVAL;
+
+	if (shift > 64)
+		return -EINVAL;
+
+	tlb_flushall_shift = shift;
+	return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+	.read = tlbflush_read_file,
+	.write = tlbflush_write_file,
+	.llseek = default_llseek,
+};
+
+static int __cpuinit create_tlb_flushall_shift(void)
+{
+	if (cpu_has_invlpg) {
+		debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+			arch_debugfs_dir, NULL, &fops_tlbflush);
+	}
+	return 0;
+}
+late_initcall(create_tlb_flushall_shift);
+#endif
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0597f95b6da6..33643a8bcbbb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp)
 				else
 					EMIT1_off32(0x0d, K);	/* or imm32,%eax */
 				break;
+			case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */
+				seen |= SEEN_XREG;
+				EMIT2(0x31, 0xd8);		/* xor %ebx,%eax */
+				break;
 			case BPF_S_ALU_LSH_X: /* A <<= X; */
 				seen |= SEEN_XREG;
 				EMIT4(0x89, 0xd9, 0xd3, 0xe0);	/* mov %ebx,%ecx; shl %cl,%eax */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 303f08637826..b2b94438ff05 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
 			goto fail;
 		}
 		/* both registers must be reserved */
-		if (num_counters == AMD64_NUM_COUNTERS_F15H) {
+		if (num_counters == AMD64_NUM_COUNTERS_CORE) {
 			msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
 			msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
 		} else {
@@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops)
 	ops->create_files = setup_ibs_files;
 
 	if (boot_cpu_data.x86 == 0x15) {
-		num_counters = AMD64_NUM_COUNTERS_F15H;
+		num_counters = AMD64_NUM_COUNTERS_CORE;
 	} else {
 		num_counters = AMD64_NUM_COUNTERS;
 	}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index fc09c2754e08..505acdd6d600 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,8 +12,13 @@ struct pci_root_info {
 	char name[16];
 	unsigned int res_num;
 	struct resource *res;
-	int busnum;
 	struct pci_sysdata sd;
+#ifdef	CONFIG_PCI_MMCONFIG
+	bool mcfg_added;
+	u16 segment;
+	u8 start_bus;
+	u8 end_bus;
+#endif
 };
 
 static bool pci_use_crs = true;
@@ -120,6 +125,81 @@ void __init pci_acpi_crs_quirks(void)
 	       pci_use_crs ? "nocrs" : "use_crs");
 }
 
+#ifdef	CONFIG_PCI_MMCONFIG
+static int __devinit check_segment(u16 seg, struct device *dev, char *estr)
+{
+	if (seg) {
+		dev_err(dev,
+			"%s can't access PCI configuration "
+			"space under this host bridge.\n",
+			estr);
+		return -EIO;
+	}
+
+	/*
+	 * Failure in adding MMCFG information is not fatal,
+	 * just can't access extended configuration space of
+	 * devices under this host bridge.
+	 */
+	dev_warn(dev,
+		 "%s can't access extended PCI configuration "
+		 "space under this bridge.\n",
+		 estr);
+
+	return 0;
+}
+
+static int __devinit setup_mcfg_map(struct pci_root_info *info,
+				    u16 seg, u8 start, u8 end,
+				    phys_addr_t addr)
+{
+	int result;
+	struct device *dev = &info->bridge->dev;
+
+	info->start_bus = start;
+	info->end_bus = end;
+	info->mcfg_added = false;
+
+	/* return success if MMCFG is not in use */
+	if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
+		return 0;
+
+	if (!(pci_probe & PCI_PROBE_MMCONF))
+		return check_segment(seg, dev, "MMCONFIG is disabled,");
+
+	result = pci_mmconfig_insert(dev, seg, start, end, addr);
+	if (result == 0) {
+		/* enable MMCFG if it hasn't been enabled yet */
+		if (raw_pci_ext_ops == NULL)
+			raw_pci_ext_ops = &pci_mmcfg;
+		info->mcfg_added = true;
+	} else if (result != -EEXIST)
+		return check_segment(seg, dev,
+			 "fail to add MMCONFIG information,");
+
+	return 0;
+}
+
+static void teardown_mcfg_map(struct pci_root_info *info)
+{
+	if (info->mcfg_added) {
+		pci_mmconfig_delete(info->segment, info->start_bus,
+				    info->end_bus);
+		info->mcfg_added = false;
+	}
+}
+#else
+static int __devinit setup_mcfg_map(struct pci_root_info *info,
+				    u16 seg, u8 start, u8 end,
+				    phys_addr_t addr)
+{
+	return 0;
+}
+static void teardown_mcfg_map(struct pci_root_info *info)
+{
+}
+#endif
+
 static acpi_status
 resource_to_addr(struct acpi_resource *resource,
 			struct acpi_resource_address64 *addr)
@@ -234,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	}
 
 	info->res_num++;
-	if (addr.translation_offset)
-		dev_info(&info->bridge->dev, "host bridge window %pR "
-			 "(PCI address [%#llx-%#llx])\n",
-			 res, res->start - addr.translation_offset,
-			 res->end - addr.translation_offset);
-	else
-		dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
 
 	return AE_OK;
 }
@@ -332,8 +405,11 @@ static void __release_pci_root_info(struct pci_root_info *info)
 
 	free_pci_root_info_res(info);
 
+	teardown_mcfg_map(info);
+
 	kfree(info);
 }
+
 static void release_pci_root_info(struct pci_host_bridge *bridge)
 {
 	struct pci_root_info *info = bridge->release_data;
@@ -347,7 +423,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
 {
 	size_t size;
 
+	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
 	info->bridge = device;
+
 	info->res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
 				info);
@@ -360,8 +438,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,
 	if (!info->res)
 		return;
 
-	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
-
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
 				info);
 }
@@ -373,7 +449,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 	int domain = root->segment;
 	int busnum = root->secondary.start;
 	LIST_HEAD(resources);
-	struct pci_bus *bus;
+	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
 	int node;
 #ifdef CONFIG_ACPI_NUMA
@@ -426,6 +502,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 	} else {
 		probe_pci_root_info(info, device, busnum, domain);
 
+		/* insert busn res at first */
+		pci_add_resource(&resources,  &root->secondary);
 		/*
 		 * _CRS with no apertures is normal, so only fall back to
 		 * defaults or native bridge info if we're ignoring _CRS.
@@ -437,10 +515,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 			x86_pci_root_bus_resources(busnum, &resources);
 		}
 
-		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
-					  &resources);
+		if (!setup_mcfg_map(info, domain, (u8)root->secondary.start,
+				    (u8)root->secondary.end, root->mcfg_addr))
+			bus = pci_create_root_bus(NULL, busnum, &pci_root_ops,
+						  sd, &resources);
+
 		if (bus) {
-			bus->subordinate = pci_scan_child_bus(bus);
+			pci_scan_child_bus(bus);
 			pci_set_host_bridge_release(
 				to_pci_host_bridge(bus->bridge),
 				release_pci_root_info, info);
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 5aed49bff058..e9e6ed5cdf94 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void)
 		link = (reg >> 8) & 0x03;
 
 		info = alloc_pci_root_info(min_bus, max_bus, node, link);
-		sprintf(info->name, "PCI Bus #%02x", min_bus);
 	}
 
 	/* get the default node and link for left over res */
@@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void)
 		int busnum;
 		struct pci_root_res *root_res;
 
-		busnum = info->bus_min;
-		printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
-		       info->bus_min, info->bus_max, info->node, info->link);
+		busnum = info->busn.start;
+		printk(KERN_DEBUG "bus: %pR on node %x link %x\n",
+		       &info->busn, info->node, info->link);
 		list_for_each_entry(root_res, &info->resources, list)
 			printk(KERN_DEBUG "bus: %02x %pR\n",
 				       busnum, &root_res->res);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 306579f7d0fd..d37e2fec97e5 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)
 		return NULL;
 
 	list_for_each_entry(info, &pci_root_infos, list)
-		if (info->bus_min == bus)
+		if (info->busn.start == bus)
 			return info;
 
 	return NULL;
@@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 {
 	struct pci_root_info *info = x86_find_pci_root_info(bus);
 	struct pci_root_res *root_res;
+	struct pci_host_bridge_window *window;
+	bool found = false;
 
 	if (!info)
 		goto default_resources;
@@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 	printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
 	       bus);
 
+	/* already added by acpi ? */
+	list_for_each_entry(window, resources, list)
+		if (window->res->flags & IORESOURCE_BUS) {
+			found = true;
+			break;
+		}
+
+	if (!found)
+		pci_add_resource(resources, &info->busn);
+
 	list_for_each_entry(root_res, &info->resources, list) {
 		struct resource *res;
 		struct resource *root;
@@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
 	if (!info)
 		return info;
 
+	sprintf(info->name, "PCI Bus #%02x", bus_min);
+
 	INIT_LIST_HEAD(&info->resources);
-	info->bus_min = bus_min;
-	info->bus_max = bus_max;
+	info->busn.name  = info->name;
+	info->busn.start = bus_min;
+	info->busn.end   = bus_max;
+	info->busn.flags = IORESOURCE_BUS;
 	info->node = node;
 	info->link = link;
 
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 226a466b2b2b..ff8f65b04574 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -13,8 +13,7 @@ struct pci_root_info {
 	struct list_head list;
 	char name[12];
 	struct list_head resources;
-	int bus_min;
-	int bus_max;
+	struct resource busn;
 	int node;
 	int link;
 };
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 0ad990a20d4a..720e973fc34a 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -494,7 +494,7 @@ int __init pcibios_init(void)
 	return 0;
 }
 
-char * __devinit  pcibios_setup(char *str)
+char * __init pcibios_setup(char *str)
 {
 	if (!strcmp(str, "off")) {
 		pci_probe = 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 301e325992f6..937bcece7006 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -17,6 +17,8 @@
 #include <linux/bitmap.h>
 #include <linux/dmi.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 #include <asm/acpi.h>
@@ -24,7 +26,9 @@
 #define PREFIX "PCI: "
 
 /* Indicate if the mmcfg resources have been placed into the resource table. */
-static int __initdata pci_mmcfg_resources_inserted;
+static bool pci_mmcfg_running_state;
+static bool pci_mmcfg_arch_init_failed;
+static DEFINE_MUTEX(pci_mmcfg_lock);
 
 LIST_HEAD(pci_mmcfg_list);
 
@@ -45,24 +49,25 @@ static __init void free_all_mmcfg(void)
 		pci_mmconfig_remove(cfg);
 }
 
-static __init void list_add_sorted(struct pci_mmcfg_region *new)
+static __devinit void list_add_sorted(struct pci_mmcfg_region *new)
 {
 	struct pci_mmcfg_region *cfg;
 
 	/* keep list sorted by segment and starting bus number */
-	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
 		if (cfg->segment > new->segment ||
 		    (cfg->segment == new->segment &&
 		     cfg->start_bus >= new->start_bus)) {
-			list_add_tail(&new->list, &cfg->list);
+			list_add_tail_rcu(&new->list, &cfg->list);
 			return;
 		}
 	}
-	list_add_tail(&new->list, &pci_mmcfg_list);
+	list_add_tail_rcu(&new->list, &pci_mmcfg_list);
 }
 
-static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
-							int end, u64 addr)
+static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment,
+							     int start,
+							     int end, u64 addr)
 {
 	struct pci_mmcfg_region *new;
 	struct resource *res;
@@ -79,8 +84,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
 	new->start_bus = start;
 	new->end_bus = end;
 
-	list_add_sorted(new);
-
 	res = &new->res;
 	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
 	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
@@ -89,9 +92,25 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
 		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
 	res->name = new->name;
 
-	printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
-	       "%pR (base %#lx)\n", segment, start, end, &new->res,
-	       (unsigned long) addr);
+	return new;
+}
+
+static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
+							int end, u64 addr)
+{
+	struct pci_mmcfg_region *new;
+
+	new = pci_mmconfig_alloc(segment, start, end, addr);
+	if (new) {
+		mutex_lock(&pci_mmcfg_lock);
+		list_add_sorted(new);
+		mutex_unlock(&pci_mmcfg_lock);
+
+		pr_info(PREFIX
+		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
+		       "(base %#lx)\n",
+		       segment, start, end, &new->res, (unsigned long)addr);
+	}
 
 	return new;
 }
@@ -100,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
 {
 	struct pci_mmcfg_region *cfg;
 
-	list_for_each_entry(cfg, &pci_mmcfg_list, list)
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
 		if (cfg->segment == segment &&
 		    cfg->start_bus <= bus && bus <= cfg->end_bus)
 			return cfg;
@@ -343,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
 			name = pci_mmcfg_probes[i].probe();
 
 		if (name)
-			printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
-			       name);
+			pr_info(PREFIX "%s with MMCONFIG support\n", name);
 	}
 
 	/* some end_bus_number is crazy, fix it */
@@ -353,19 +371,8 @@ static int __init pci_mmcfg_check_hostbridge(void)
 	return !list_empty(&pci_mmcfg_list);
 }
 
-static void __init pci_mmcfg_insert_resources(void)
-{
-	struct pci_mmcfg_region *cfg;
-
-	list_for_each_entry(cfg, &pci_mmcfg_list, list)
-		insert_resource(&iomem_resource, &cfg->res);
-
-	/* Mark that the resources have been inserted. */
-	pci_mmcfg_resources_inserted = 1;
-}
-
-static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
-					      void *data)
+static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res,
+						 void *data)
 {
 	struct resource *mcfg_res = data;
 	struct acpi_resource_address64 address;
@@ -401,8 +408,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
 	return AE_OK;
 }
 
-static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
-		void *context, void **rv)
+static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl,
+						  void *context, void **rv)
 {
 	struct resource *mcfg_res = context;
 
@@ -415,7 +422,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
 	return AE_OK;
 }
 
-static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
+static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used)
 {
 	struct resource mcfg_res;
 
@@ -434,13 +441,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
 
 typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
 
-static int __init is_mmconf_reserved(check_reserved_t is_reserved,
-				    struct pci_mmcfg_region *cfg, int with_e820)
+static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
+				    struct pci_mmcfg_region *cfg,
+				    struct device *dev, int with_e820)
 {
 	u64 addr = cfg->res.start;
 	u64 size = resource_size(&cfg->res);
 	u64 old_size = size;
-	int valid = 0, num_buses;
+	int num_buses;
+	char *method = with_e820 ? "E820" : "ACPI motherboard resources";
 
 	while (!is_reserved(addr, addr + size, E820_RESERVED)) {
 		size >>= 1;
@@ -448,30 +457,76 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
 			break;
 	}
 
-	if (size >= (16UL<<20) || size == old_size) {
-		printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
-		       &cfg->res,
-		       with_e820 ? "E820" : "ACPI motherboard resources");
-		valid = 1;
-
-		if (old_size != size) {
-			/* update end_bus */
-			cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
-			num_buses = cfg->end_bus - cfg->start_bus + 1;
-			cfg->res.end = cfg->res.start +
-			    PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
-			snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
-				 "PCI MMCONFIG %04x [bus %02x-%02x]",
-				 cfg->segment, cfg->start_bus, cfg->end_bus);
-			printk(KERN_INFO PREFIX
-			       "MMCONFIG for %04x [bus%02x-%02x] "
-			       "at %pR (base %#lx) (size reduced!)\n",
-			       cfg->segment, cfg->start_bus, cfg->end_bus,
-			       &cfg->res, (unsigned long) cfg->address);
-		}
+	if (size < (16UL<<20) && size != old_size)
+		return 0;
+
+	if (dev)
+		dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
+			 &cfg->res, method);
+	else
+		pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
+		       &cfg->res, method);
+
+	if (old_size != size) {
+		/* update end_bus */
+		cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
+		num_buses = cfg->end_bus - cfg->start_bus + 1;
+		cfg->res.end = cfg->res.start +
+		    PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
+		snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+			 "PCI MMCONFIG %04x [bus %02x-%02x]",
+			 cfg->segment, cfg->start_bus, cfg->end_bus);
+
+		if (dev)
+			dev_info(dev,
+				"MMCONFIG "
+				"at %pR (base %#lx) (size reduced!)\n",
+				&cfg->res, (unsigned long) cfg->address);
+		else
+			pr_info(PREFIX
+				"MMCONFIG for %04x [bus%02x-%02x] "
+				"at %pR (base %#lx) (size reduced!)\n",
+				cfg->segment, cfg->start_bus, cfg->end_bus,
+				&cfg->res, (unsigned long) cfg->address);
 	}
 
-	return valid;
+	return 1;
+}
+
+static int __ref pci_mmcfg_check_reserved(struct device *dev,
+		  struct pci_mmcfg_region *cfg, int early)
+{
+	if (!early && !acpi_disabled) {
+		if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
+			return 1;
+
+		if (dev)
+			dev_info(dev, FW_INFO
+				 "MMCONFIG at %pR not reserved in "
+				 "ACPI motherboard resources\n",
+				 &cfg->res);
+		else
+			pr_info(FW_INFO PREFIX
+			       "MMCONFIG at %pR not reserved in "
+			       "ACPI motherboard resources\n",
+			       &cfg->res);
+	}
+
+	/*
+	 * e820_all_mapped() is marked as __init.
+	 * All entries from ACPI MCFG table have been checked at boot time.
+	 * For MCFG information constructed from hotpluggable host bridge's
+	 * _CBA method, just assume it's reserved.
+	 */
+	if (pci_mmcfg_running_state)
+		return 1;
+
+	/* Don't try to do this check unless configuration
+	   type 1 is available. how about type 2 ?*/
+	if (raw_pci_ops)
+		return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1);
+
+	return 0;
 }
 
 static void __init pci_mmcfg_reject_broken(int early)
@@ -479,38 +534,14 @@ static void __init pci_mmcfg_reject_broken(int early)
 	struct pci_mmcfg_region *cfg;
 
 	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
-		int valid = 0;
-
-		if (!early && !acpi_disabled) {
-			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
-
-			if (valid)
-				continue;
-			else
-				printk(KERN_ERR FW_BUG PREFIX
-				       "MMCONFIG at %pR not reserved in "
-				       "ACPI motherboard resources\n",
-				       &cfg->res);
+		if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
+			pr_info(PREFIX "not using MMCONFIG\n");
+			free_all_mmcfg();
+			return;
 		}
-
-		/* Don't try to do this check unless configuration
-		   type 1 is available. how about type 2 ?*/
-		if (raw_pci_ops)
-			valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
-
-		if (!valid)
-			goto reject;
 	}
-
-	return;
-
-reject:
-	printk(KERN_INFO PREFIX "not using MMCONFIG\n");
-	free_all_mmcfg();
 }
 
-static int __initdata known_bridge;
-
 static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
 					struct acpi_mcfg_allocation *cfg)
 {
@@ -529,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
 			return 0;
 	}
 
-	printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
+	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
 	       "is above 4GB, ignored\n", cfg->pci_segment,
 	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);
 	return -EINVAL;
@@ -556,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
 		i -= sizeof(struct acpi_mcfg_allocation);
 	};
 	if (entries == 0) {
-		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
+		pr_err(PREFIX "MMCONFIG has no entries\n");
 		return -ENODEV;
 	}
 
@@ -570,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
 
 		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
 				   cfg->end_bus_number, cfg->address) == NULL) {
-			printk(KERN_WARNING PREFIX
-			       "no memory for MCFG entries\n");
+			pr_warn(PREFIX "no memory for MCFG entries\n");
 			free_all_mmcfg();
 			return -ENOMEM;
 		}
@@ -582,28 +612,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
 
 static void __init __pci_mmcfg_init(int early)
 {
-	/* MMCONFIG disabled */
-	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
-		return;
-
-	/* MMCONFIG already enabled */
-	if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF))
-		return;
-
-	/* for late to exit */
-	if (known_bridge)
-		return;
-
-	if (early) {
-		if (pci_mmcfg_check_hostbridge())
-			known_bridge = 1;
-	}
-
-	if (!known_bridge)
-		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
-
 	pci_mmcfg_reject_broken(early);
-
 	if (list_empty(&pci_mmcfg_list))
 		return;
 
@@ -620,33 +629,48 @@ static void __init __pci_mmcfg_init(int early)
 	if (pci_mmcfg_arch_init())
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
 	else {
-		/*
-		 * Signal not to attempt to insert mmcfg resources because
-		 * the architecture mmcfg setup could not initialize.
-		 */
-		pci_mmcfg_resources_inserted = 1;
+		free_all_mmcfg();
+		pci_mmcfg_arch_init_failed = true;
 	}
 }
 
+static int __initdata known_bridge;
+
 void __init pci_mmcfg_early_init(void)
 {
-	__pci_mmcfg_init(1);
+	if (pci_probe & PCI_PROBE_MMCONF) {
+		if (pci_mmcfg_check_hostbridge())
+			known_bridge = 1;
+		else
+			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+		__pci_mmcfg_init(1);
+	}
 }
 
 void __init pci_mmcfg_late_init(void)
 {
-	__pci_mmcfg_init(0);
+	/* MMCONFIG disabled */
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+		return;
+
+	if (known_bridge)
+		return;
+
+	/* MMCONFIG hasn't been enabled yet, try again */
+	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
+		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+		__pci_mmcfg_init(0);
+	}
 }
 
 static int __init pci_mmcfg_late_insert_resources(void)
 {
-	/*
-	 * If resources are already inserted or we are not using MMCONFIG,
-	 * don't insert the resources.
-	 */
-	if ((pci_mmcfg_resources_inserted == 1) ||
-	    (pci_probe & PCI_PROBE_MMCONF) == 0 ||
-	    list_empty(&pci_mmcfg_list))
+	struct pci_mmcfg_region *cfg;
+
+	pci_mmcfg_running_state = true;
+
+	/* If we are not using MMCONFIG, don't insert the resources. */
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
 		return 1;
 
 	/*
@@ -654,7 +678,9 @@ static int __init pci_mmcfg_late_insert_resources(void)
 	 * marked so it won't cause request errors when __request_region is
 	 * called.
 	 */
-	pci_mmcfg_insert_resources();
+	list_for_each_entry(cfg, &pci_mmcfg_list, list)
+		if (!cfg->res.parent)
+			insert_resource(&iomem_resource, &cfg->res);
 
 	return 0;
 }
@@ -665,3 +691,101 @@ static int __init pci_mmcfg_late_insert_resources(void)
  * with other system resources.
  */
 late_initcall(pci_mmcfg_late_insert_resources);
+
+/* Add MMCFG information for host bridges */
+int __devinit pci_mmconfig_insert(struct device *dev,
+				  u16 seg, u8 start, u8 end,
+				  phys_addr_t addr)
+{
+	int rc;
+	struct resource *tmp = NULL;
+	struct pci_mmcfg_region *cfg;
+
+	if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
+		return -ENODEV;
+
+	if (start > end)
+		return -EINVAL;
+
+	mutex_lock(&pci_mmcfg_lock);
+	cfg = pci_mmconfig_lookup(seg, start);
+	if (cfg) {
+		if (cfg->end_bus < end)
+			dev_info(dev, FW_INFO
+				 "MMCONFIG for "
+				 "domain %04x [bus %02x-%02x] "
+				 "only partially covers this bridge\n",
+				  cfg->segment, cfg->start_bus, cfg->end_bus);
+		mutex_unlock(&pci_mmcfg_lock);
+		return -EEXIST;
+	}
+
+	if (!addr) {
+		mutex_unlock(&pci_mmcfg_lock);
+		return -EINVAL;
+	}
+
+	rc = -EBUSY;
+	cfg = pci_mmconfig_alloc(seg, start, end, addr);
+	if (cfg == NULL) {
+		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
+		rc = -ENOMEM;
+	} else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
+		dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
+			 &cfg->res);
+	} else {
+		/* Insert resource if it's not in boot stage */
+		if (pci_mmcfg_running_state)
+			tmp = insert_resource_conflict(&iomem_resource,
+						       &cfg->res);
+
+		if (tmp) {
+			dev_warn(dev,
+				 "MMCONFIG %pR conflicts with "
+				 "%s %pR\n",
+				 &cfg->res, tmp->name, tmp);
+		} else if (pci_mmcfg_arch_map(cfg)) {
+			dev_warn(dev, "fail to map MMCONFIG %pR.\n",
+				 &cfg->res);
+		} else {
+			list_add_sorted(cfg);
+			dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
+				 &cfg->res, (unsigned long)addr);
+			cfg = NULL;
+			rc = 0;
+		}
+	}
+
+	if (cfg) {
+		if (cfg->res.parent)
+			release_resource(&cfg->res);
+		kfree(cfg);
+	}
+
+	mutex_unlock(&pci_mmcfg_lock);
+
+	return rc;
+}
+
+/* Delete MMCFG information for host bridges */
+int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
+{
+	struct pci_mmcfg_region *cfg;
+
+	mutex_lock(&pci_mmcfg_lock);
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+		if (cfg->segment == seg && cfg->start_bus == start &&
+		    cfg->end_bus == end) {
+			list_del_rcu(&cfg->list);
+			synchronize_rcu();
+			pci_mmcfg_arch_unmap(cfg);
+			if (cfg->res.parent)
+				release_resource(&cfg->res);
+			mutex_unlock(&pci_mmcfg_lock);
+			kfree(cfg);
+			return 0;
+		}
+	mutex_unlock(&pci_mmcfg_lock);
+
+	return -ENOENT;
+}
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 5372e86834c0..db63ac23e3d9 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -11,6 +11,7 @@
 
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/rcupdate.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 #include <acpi/acpi.h>
@@ -60,9 +61,12 @@ err:		*value = -1;
 		return -EINVAL;
 	}
 
+	rcu_read_lock();
 	base = get_base_addr(seg, bus, devfn);
-	if (!base)
+	if (!base) {
+		rcu_read_unlock();
 		goto err;
+	}
 
 	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
@@ -80,6 +84,7 @@ err:		*value = -1;
 		break;
 	}
 	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
+	rcu_read_lock();
 	base = get_base_addr(seg, bus, devfn);
-	if (!base)
+	if (!base) {
+		rcu_read_unlock();
 		return -EINVAL;
+	}
 
 	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
@@ -113,11 +121,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+	rcu_read_unlock();
 
 	return 0;
 }
 
-static const struct pci_raw_ops pci_mmcfg = {
+const struct pci_raw_ops pci_mmcfg = {
 	.read =		pci_mmcfg_read,
 	.write =	pci_mmcfg_write,
 };
@@ -132,3 +141,18 @@ int __init pci_mmcfg_arch_init(void)
 void __init pci_mmcfg_arch_free(void)
 {
 }
+
+int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
+{
+	return 0;
+}
+
+void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
+{
+	unsigned long flags;
+
+	/* Invalidate the cached mmcfg map entry. */
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
+	mmcfg_last_accessed_device = 0;
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 915a493502cb..d4ebd07c306d 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <linux/rcupdate.h>
 #include <asm/e820.h>
 #include <asm/pci_x86.h>
 
@@ -34,9 +35,12 @@ err:		*value = -1;
 		return -EINVAL;
 	}
 
+	rcu_read_lock();
 	addr = pci_dev_base(seg, bus, devfn);
-	if (!addr)
+	if (!addr) {
+		rcu_read_unlock();
 		goto err;
+	}
 
 	switch (len) {
 	case 1:
@@ -49,6 +53,7 @@ err:		*value = -1;
 		*value = mmio_config_readl(addr + reg);
 		break;
 	}
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
 		return -EINVAL;
 
+	rcu_read_lock();
 	addr = pci_dev_base(seg, bus, devfn);
-	if (!addr)
+	if (!addr) {
+		rcu_read_unlock();
 		return -EINVAL;
+	}
 
 	switch (len) {
 	case 1:
@@ -77,16 +85,17 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		mmio_config_writel(addr + reg, value);
 		break;
 	}
+	rcu_read_unlock();
 
 	return 0;
 }
 
-static const struct pci_raw_ops pci_mmcfg = {
+const struct pci_raw_ops pci_mmcfg = {
 	.read =		pci_mmcfg_read,
 	.write =	pci_mmcfg_write,
 };
 
-static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
+static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg)
 {
 	void __iomem *addr;
 	u64 start, size;
@@ -105,16 +114,14 @@ int __init pci_mmcfg_arch_init(void)
 {
 	struct pci_mmcfg_region *cfg;
 
-	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
-		cfg->virt = mcfg_ioremap(cfg);
-		if (!cfg->virt) {
-			printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
-			       &cfg->res);
+	list_for_each_entry(cfg, &pci_mmcfg_list, list)
+		if (pci_mmcfg_arch_map(cfg)) {
 			pci_mmcfg_arch_free();
 			return 0;
 		}
-	}
+
 	raw_pci_ext_ops = &pci_mmcfg;
+
 	return 1;
 }
 
@@ -122,10 +129,25 @@ void __init pci_mmcfg_arch_free(void)
 {
 	struct pci_mmcfg_region *cfg;
 
-	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
-		if (cfg->virt) {
-			iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
-			cfg->virt = NULL;
-		}
+	list_for_each_entry(cfg, &pci_mmcfg_list, list)
+		pci_mmcfg_arch_unmap(cfg);
+}
+
+int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
+{
+	cfg->virt = mcfg_ioremap(cfg);
+	if (!cfg->virt) {
+		pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
+{
+	if (cfg && cfg->virt) {
+		iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
+		cfg->virt = NULL;
 	}
 }
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 140942f66b31..e14a2ff708b5 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
 
 static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)
 {
-	pci_set_power_state(dev, PCI_D3cold);
+	pci_set_power_state(dev, PCI_D3hot);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index 0ce8616c88ae..d75582d1aa55 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -18,6 +18,7 @@
 #include <linux/pm.h>
 #include <linux/mfd/core.h>
 #include <linux/suspend.h>
+#include <linux/olpc-ec.h>
 
 #include <asm/io.h>
 #include <asm/olpc.h>
@@ -51,16 +52,11 @@ EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear);
 static int xo1_power_state_enter(suspend_state_t pm_state)
 {
 	unsigned long saved_sci_mask;
-	int r;
 
 	/* Only STR is supported */
 	if (pm_state != PM_SUSPEND_MEM)
 		return -EINVAL;
 
-	r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
-	if (r)
-		return r;
-
 	/*
 	 * Save SCI mask (this gets lost since PM1_EN is used as a mask for
 	 * wakeup events, which is not necessarily the same event set)
@@ -76,16 +72,6 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 	/* Restore SCI mask (using dword access to CS5536_PM1_EN) */
 	outl(saved_sci_mask, acpi_base + CS5536_PM1_STS);
 
-	/* Tell the EC to stop inhibiting SCIs */
-	olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
-
-	/*
-	 * Tell the wireless module to restart USB communication.
-	 * Must be done twice.
-	 */
-	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
-	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
-
 	return 0;
 }
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 04b8c73659c5..63d4aa40956e 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -23,6 +23,7 @@
 #include <linux/power_supply.h>
 #include <linux/suspend.h>
 #include <linux/workqueue.h>
+#include <linux/olpc-ec.h>
 
 #include <asm/io.h>
 #include <asm/msr.h>
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 23e5b9d7977b..2fdca25905ae 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/power_supply.h>
+#include <linux/olpc-ec.h>
 
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
@@ -203,7 +204,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
 	return 0;
 }
 
-static int xo15_sci_resume(struct acpi_device *device)
+static int xo15_sci_resume(struct device *dev)
 {
 	/* Enable all EC events */
 	olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -215,6 +216,8 @@ static int xo15_sci_resume(struct acpi_device *device)
 	return 0;
 }
 
+static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume);
+
 static const struct acpi_device_id xo15_sci_device_ids[] = {
 	{"XO15EC", 0},
 	{"", 0},
@@ -227,8 +230,8 @@ static struct acpi_driver xo15_sci_drv = {
 	.ops = {
 		.add = xo15_sci_add,
 		.remove = xo15_sci_remove,
-		.resume = xo15_sci_resume,
 	},
+	.drv.pm = &xo15_sci_pm,
 };
 
 static int __init xo15_sci_init(void)
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index a4bee53c2e54..27376081ddec 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -14,14 +14,13 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/spinlock.h>
 #include <linux/io.h>
 #include <linux/string.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/syscore_ops.h>
-#include <linux/debugfs.h>
 #include <linux/mutex.h>
+#include <linux/olpc-ec.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -31,17 +30,6 @@
 struct olpc_platform_t olpc_platform_info;
 EXPORT_SYMBOL_GPL(olpc_platform_info);
 
-static DEFINE_SPINLOCK(ec_lock);
-
-/* debugfs interface to EC commands */
-#define EC_MAX_CMD_ARGS (5 + 1)	/* cmd byte + 5 args */
-#define EC_MAX_CMD_REPLY (8)
-
-static struct dentry *ec_debugfs_dir;
-static DEFINE_MUTEX(ec_debugfs_cmd_lock);
-static unsigned char ec_debugfs_resp[EC_MAX_CMD_REPLY];
-static unsigned int ec_debugfs_resp_bytes;
-
 /* EC event mask to be applied during suspend (defining wakeup sources). */
 static u16 ec_wakeup_mask;
 
@@ -125,16 +113,13 @@ static int __wait_on_obf(unsigned int line, unsigned int port, int desired)
  * <http://wiki.laptop.org/go/Ec_specification>.  Unfortunately, while
  * OpenFirmware's source is available, the EC's is not.
  */
-int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
-		unsigned char *outbuf,  size_t outlen)
+static int olpc_xo1_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
+		size_t outlen, void *arg)
 {
-	unsigned long flags;
 	int ret = -EIO;
 	int i;
 	int restarts = 0;
 
-	spin_lock_irqsave(&ec_lock, flags);
-
 	/* Clear OBF */
 	for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++)
 		inb(0x68);
@@ -198,10 +183,8 @@ restart:
 
 	ret = 0;
 err:
-	spin_unlock_irqrestore(&ec_lock, flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
 void olpc_ec_wakeup_set(u16 value)
 {
@@ -280,96 +263,6 @@ int olpc_ec_sci_query(u16 *sci_value)
 }
 EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
 
-static ssize_t ec_debugfs_cmd_write(struct file *file, const char __user *buf,
-				    size_t size, loff_t *ppos)
-{
-	int i, m;
-	unsigned char ec_cmd[EC_MAX_CMD_ARGS];
-	unsigned int ec_cmd_int[EC_MAX_CMD_ARGS];
-	char cmdbuf[64];
-	int ec_cmd_bytes;
-
-	mutex_lock(&ec_debugfs_cmd_lock);
-
-	size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size);
-
-	m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0],
-		   &ec_debugfs_resp_bytes,
-		   &ec_cmd_int[1], &ec_cmd_int[2], &ec_cmd_int[3],
-		   &ec_cmd_int[4], &ec_cmd_int[5]);
-	if (m < 2 || ec_debugfs_resp_bytes > EC_MAX_CMD_REPLY) {
-		/* reset to prevent overflow on read */
-		ec_debugfs_resp_bytes = 0;
-
-		printk(KERN_DEBUG "olpc-ec: bad ec cmd:  "
-		       "cmd:response-count [arg1 [arg2 ...]]\n");
-		size = -EINVAL;
-		goto out;
-	}
-
-	/* convert scanf'd ints to char */
-	ec_cmd_bytes = m - 2;
-	for (i = 0; i <= ec_cmd_bytes; i++)
-		ec_cmd[i] = ec_cmd_int[i];
-
-	printk(KERN_DEBUG "olpc-ec: debugfs cmd 0x%02x with %d args "
-	       "%02x %02x %02x %02x %02x, want %d returns\n",
-	       ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], ec_cmd[3],
-	       ec_cmd[4], ec_cmd[5], ec_debugfs_resp_bytes);
-
-	olpc_ec_cmd(ec_cmd[0], (ec_cmd_bytes == 0) ? NULL : &ec_cmd[1],
-		    ec_cmd_bytes, ec_debugfs_resp, ec_debugfs_resp_bytes);
-
-	printk(KERN_DEBUG "olpc-ec: response "
-	       "%02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n",
-	       ec_debugfs_resp[0], ec_debugfs_resp[1], ec_debugfs_resp[2],
-	       ec_debugfs_resp[3], ec_debugfs_resp[4], ec_debugfs_resp[5],
-	       ec_debugfs_resp[6], ec_debugfs_resp[7], ec_debugfs_resp_bytes);
-
-out:
-	mutex_unlock(&ec_debugfs_cmd_lock);
-	return size;
-}
-
-static ssize_t ec_debugfs_cmd_read(struct file *file, char __user *buf,
-				   size_t size, loff_t *ppos)
-{
-	unsigned int i, r;
-	char *rp;
-	char respbuf[64];
-
-	mutex_lock(&ec_debugfs_cmd_lock);
-	rp = respbuf;
-	rp += sprintf(rp, "%02x", ec_debugfs_resp[0]);
-	for (i = 1; i < ec_debugfs_resp_bytes; i++)
-		rp += sprintf(rp, ", %02x", ec_debugfs_resp[i]);
-	mutex_unlock(&ec_debugfs_cmd_lock);
-	rp += sprintf(rp, "\n");
-
-	r = rp - respbuf;
-	return simple_read_from_buffer(buf, size, ppos, respbuf, r);
-}
-
-static const struct file_operations ec_debugfs_genops = {
-	.write	 = ec_debugfs_cmd_write,
-	.read	 = ec_debugfs_cmd_read,
-};
-
-static void setup_debugfs(void)
-{
-	ec_debugfs_dir = debugfs_create_dir("olpc-ec", 0);
-	if (ec_debugfs_dir == ERR_PTR(-ENODEV))
-		return;
-
-	debugfs_create_file("cmd", 0600, ec_debugfs_dir, NULL,
-			    &ec_debugfs_genops);
-}
-
-static int olpc_ec_suspend(void)
-{
-	return olpc_ec_mask_write(ec_wakeup_mask);
-}
-
 static bool __init check_ofw_architecture(struct device_node *root)
 {
 	const char *olpc_arch;
@@ -424,8 +317,59 @@ static int __init add_xo1_platform_devices(void)
 	return 0;
 }
 
-static struct syscore_ops olpc_syscore_ops = {
-	.suspend = olpc_ec_suspend,
+static int olpc_xo1_ec_probe(struct platform_device *pdev)
+{
+	/* get the EC revision */
+	olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
+			(unsigned char *) &olpc_platform_info.ecver, 1);
+
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (olpc_platform_info.ecver >= 0x5f)
+		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
+
+	pr_info("OLPC board revision %s%X (EC=%x)\n",
+			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
+			olpc_platform_info.boardrev >> 4,
+			olpc_platform_info.ecver);
+
+	return 0;
+}
+static int olpc_xo1_ec_suspend(struct platform_device *pdev)
+{
+	olpc_ec_mask_write(ec_wakeup_mask);
+
+	/*
+	 * Squelch SCIs while suspended.  This is a fix for
+	 * <http://dev.laptop.org/ticket/1835>.
+	 */
+	return olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
+}
+
+static int olpc_xo1_ec_resume(struct platform_device *pdev)
+{
+	/* Tell the EC to stop inhibiting SCIs */
+	olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
+
+	/*
+	 * Tell the wireless module to restart USB communication.
+	 * Must be done twice.
+	 */
+	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+
+	return 0;
+}
+
+static struct olpc_ec_driver ec_xo1_driver = {
+	.probe = olpc_xo1_ec_probe,
+	.suspend = olpc_xo1_ec_suspend,
+	.resume = olpc_xo1_ec_resume,
+	.ec_cmd = olpc_xo1_ec_cmd,
+};
+
+static struct olpc_ec_driver ec_xo1_5_driver = {
+	.probe = olpc_xo1_ec_probe,
+	.ec_cmd = olpc_xo1_ec_cmd,
 };
 
 static int __init olpc_init(void)
@@ -435,16 +379,17 @@ static int __init olpc_init(void)
 	if (!olpc_ofw_present() || !platform_detect())
 		return 0;
 
-	spin_lock_init(&ec_lock);
+	/* register the XO-1 and 1.5-specific EC handler */
+	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0))	/* XO-1 */
+		olpc_ec_driver_register(&ec_xo1_driver, NULL);
+	else
+		olpc_ec_driver_register(&ec_xo1_5_driver, NULL);
+	platform_device_register_simple("olpc-ec", -1, NULL, 0);
 
 	/* assume B1 and above models always have a DCON */
 	if (olpc_board_at_least(olpc_board(0xb1)))
 		olpc_platform_info.flags |= OLPC_F_DCON;
 
-	/* get the EC revision */
-	olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
-			(unsigned char *) &olpc_platform_info.ecver, 1);
-
 #ifdef CONFIG_PCI_OLPC
 	/* If the VSA exists let it emulate PCI, if not emulate in kernel.
 	 * XO-1 only. */
@@ -452,14 +397,6 @@ static int __init olpc_init(void)
 			!cs5535_has_vsa2())
 		x86_init.pci.arch_init = pci_olpc_init;
 #endif
-	/* EC version 0x5f adds support for wide SCI mask */
-	if (olpc_platform_info.ecver >= 0x5f)
-		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
-
-	printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
-			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
-			olpc_platform_info.boardrev >> 4,
-			olpc_platform_info.ecver);
 
 	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
 		r = add_xo1_platform_devices();
@@ -467,9 +404,6 @@ static int __init olpc_init(void)
 			return r;
 	}
 
-	register_syscore_ops(&olpc_syscore_ops);
-	setup_debugfs();
-
 	return 0;
 }
 
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 59880afa851f..b8b3a37c80cd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -38,8 +38,7 @@ static int timeout_base_ns[] = {
 
 static int timeout_us;
 static int nobau;
-static int baudisabled;
-static spinlock_t disable_lock;
+static int nobau_perm;
 static cycles_t congested_cycles;
 
 /* tunables: */
@@ -47,12 +46,13 @@ static int max_concurr		= MAX_BAU_CONCURRENT;
 static int max_concurr_const	= MAX_BAU_CONCURRENT;
 static int plugged_delay	= PLUGGED_DELAY;
 static int plugsb4reset		= PLUGSB4RESET;
+static int giveup_limit		= GIVEUP_LIMIT;
 static int timeoutsb4reset	= TIMEOUTSB4RESET;
 static int ipi_reset_limit	= IPI_RESET_LIMIT;
 static int complete_threshold	= COMPLETE_THRESHOLD;
 static int congested_respns_us	= CONGESTED_RESPONSE_US;
 static int congested_reps	= CONGESTED_REPS;
-static int congested_period	= CONGESTED_PERIOD;
+static int disabled_period	= DISABLED_PERIOD;
 
 static struct tunables tunables[] = {
 	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@ -63,7 +63,8 @@ static struct tunables tunables[] = {
 	{&complete_threshold, COMPLETE_THRESHOLD},
 	{&congested_respns_us, CONGESTED_RESPONSE_US},
 	{&congested_reps, CONGESTED_REPS},
-	{&congested_period, CONGESTED_PERIOD}
+	{&disabled_period, DISABLED_PERIOD},
+	{&giveup_limit, GIVEUP_LIMIT}
 };
 
 static struct dentry *tunables_dir;
@@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
 static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
 
+static void
+set_bau_on(void)
+{
+	int cpu;
+	struct bau_control *bcp;
+
+	if (nobau_perm) {
+		pr_info("BAU not initialized; cannot be turned on\n");
+		return;
+	}
+	nobau = 0;
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->nobau = 0;
+	}
+	pr_info("BAU turned on\n");
+	return;
+}
+
+static void
+set_bau_off(void)
+{
+	int cpu;
+	struct bau_control *bcp;
+
+	nobau = 1;
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->nobau = 1;
+	}
+	pr_info("BAU turned off\n");
+	return;
+}
+
 /*
  * Determine the first node on a uvhub. 'Nodes' are used for kernel
  * memory allocation.
@@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
 		 * Both sockets dump their completed count total into
 		 * the message's count.
 		 */
-		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+		*sp = 0;
 		asp = (struct atomic_short *)&msg->acknowledge_count;
 		msg_ack_count = atom_asr(socket_ack_count, asp);
 
@@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
 }
 
 /*
- * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
+ * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
+ * But not currently used.
  */
 static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
 {
 	unsigned long descriptor_status;
-	unsigned long descriptor_status2;
 
-	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
-	descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
-	descriptor_status = (descriptor_status << 1) | descriptor_status2;
+	descriptor_status =
+		((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
 	return descriptor_status;
 }
 
@@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp)
  */
 int handle_uv2_busy(struct bau_control *bcp)
 {
-	int busy_one = bcp->using_desc;
-	int normal = bcp->uvhub_cpu;
-	int selected = -1;
-	int i;
-	unsigned long descriptor_status;
-	unsigned long status;
-	int mmr_offset;
-	struct bau_desc *bau_desc_old;
-	struct bau_desc *bau_desc_new;
-	struct bau_control *hmaster = bcp->uvhub_master;
 	struct ptc_stats *stat = bcp->statp;
-	cycles_t ttm;
 
 	stat->s_uv2_wars++;
-	spin_lock(&hmaster->uvhub_lock);
-	/* try for the original first */
-	if (busy_one != normal) {
-		if (!normal_busy(bcp))
-			selected = normal;
-	}
-	if (selected < 0) {
-		/* can't use the normal, select an alternate */
-		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		descriptor_status = read_lmmr(mmr_offset);
-
-		/* scan available descriptors 32-63 */
-		for (i = 0; i < UV_CPUS_PER_AS; i++) {
-			if ((hmaster->inuse_map & (1 << i)) == 0) {
-				status = ((descriptor_status >>
-						(i * UV_ACT_STATUS_SIZE)) &
-						UV_ACT_STATUS_MASK) << 1;
-				if (status != UV2H_DESC_BUSY) {
-					selected = i + UV_CPUS_PER_AS;
-					break;
-				}
-			}
-		}
-	}
-
-	if (busy_one != normal)
-		/* mark the busy alternate as not in-use */
-		hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
-
-	if (selected >= 0) {
-		/* switch to the selected descriptor */
-		if (selected != normal) {
-			/* set the selected alternate as in-use */
-			hmaster->inuse_map |=
-					(1 << (selected - UV_CPUS_PER_AS));
-			if (selected > stat->s_uv2_wars_hw)
-				stat->s_uv2_wars_hw = selected;
-		}
-		bau_desc_old = bcp->descriptor_base;
-		bau_desc_old += (ITEMS_PER_DESC * busy_one);
-		bcp->using_desc = selected;
-		bau_desc_new = bcp->descriptor_base;
-		bau_desc_new += (ITEMS_PER_DESC * selected);
-		*bau_desc_new = *bau_desc_old;
-	} else {
-		/*
-		 * All are busy. Wait for the normal one for this cpu to
-		 * free up.
-		 */
-		stat->s_uv2_war_waits++;
-		spin_unlock(&hmaster->uvhub_lock);
-		ttm = get_cycles();
-		do {
-			cpu_relax();
-		} while (normal_busy(bcp));
-		spin_lock(&hmaster->uvhub_lock);
-		/* switch to the original descriptor */
-		bcp->using_desc = normal;
-		bau_desc_old = bcp->descriptor_base;
-		bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
-		bcp->using_desc = (ITEMS_PER_DESC * normal);
-		bau_desc_new = bcp->descriptor_base;
-		bau_desc_new += (ITEMS_PER_DESC * normal);
-		*bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
-	}
-	spin_unlock(&hmaster->uvhub_lock);
-	return FLUSH_RETRY_BUSYBUG;
+	bcp->busy = 1;
+	return FLUSH_GIVEUP;
 }
 
 static int uv2_wait_completion(struct bau_desc *bau_desc,
@@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 {
 	unsigned long descriptor_stat;
 	cycles_t ttm;
-	int desc = bcp->using_desc;
+	int desc = bcp->uvhub_cpu;
 	long busy_reps = 0;
 	struct ptc_stats *stat = bcp->statp;
 
@@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while (descriptor_stat != UV2H_DESC_IDLE) {
-		/*
-		 * Our software ack messages may be blocked because
-		 * there are no swack resources available.  As long
-		 * as none of them has timed out hardware will NACK
-		 * our message and its state will stay IDLE.
-		 */
-		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
-		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
+		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
+			/*
+			 * A h/w bug on the destination side may
+			 * have prevented the message being marked
+			 * pending, thus it doesn't get replied to
+			 * and gets continually nacked until it times
+			 * out with a SOURCE_TIMEOUT.
+			 */
 			stat->s_stimeout++;
 			return FLUSH_GIVEUP;
-		} else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
-			stat->s_strongnacks++;
-			bcp->conseccompletes = 0;
-			return FLUSH_GIVEUP;
 		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
+			ttm = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 * Without using the extended status we have to
+			 * deduce from the short time that this was a
+			 * strong nack.
+			 */
+			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
+				bcp->conseccompletes = 0;
+				stat->s_plugged++;
+				/* FLUSH_RETRY_PLUGGED causes hang on boot */
+				return FLUSH_GIVEUP;
+			}
 			stat->s_dtimeout++;
 			bcp->conseccompletes = 0;
-			return FLUSH_RETRY_TIMEOUT;
+			/* FLUSH_RETRY_TIMEOUT causes hang on boot */
+			return FLUSH_GIVEUP;
 		} else {
 			busy_reps++;
 			if (busy_reps > 1000000) {
@@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 				busy_reps = 0;
 				ttm = get_cycles();
 				if ((ttm - bcp->send_message) >
-					(bcp->clocks_per_100_usec)) {
+						bcp->timeout_interval)
 					return handle_uv2_busy(bcp);
-				}
 			}
 			/*
 			 * descriptor_stat is still BUSY
@@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc,
 {
 	int right_shift;
 	unsigned long mmr_offset;
-	int desc = bcp->using_desc;
+	int desc = bcp->uvhub_cpu;
 
 	if (desc < UV_CPUS_PER_AS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc,
 }
 
 /*
- * Completions are taking a very long time due to a congested numalink
- * network.
+ * Stop all cpus on a uvhub from using the BAU for a period of time.
+ * This is reversed by check_enable.
  */
-static void disable_for_congestion(struct bau_control *bcp,
-					struct ptc_stats *stat)
+static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
 {
-	/* let only one cpu do this disabling */
-	spin_lock(&disable_lock);
-
-	if (!baudisabled && bcp->period_requests &&
-	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
-		int tcpu;
-		struct bau_control *tbcp;
-		/* it becomes this cpu's job to turn on the use of the
-		   BAU again */
-		baudisabled = 1;
-		bcp->set_bau_off = 1;
-		bcp->set_bau_on_time = get_cycles();
-		bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
+	int tcpu;
+	struct bau_control *tbcp;
+	struct bau_control *hmaster;
+	cycles_t tm1;
+
+	hmaster = bcp->uvhub_master;
+	spin_lock(&hmaster->disable_lock);
+	if (!bcp->baudisabled) {
 		stat->s_bau_disabled++;
+		tm1 = get_cycles();
 		for_each_present_cpu(tcpu) {
 			tbcp = &per_cpu(bau_control, tcpu);
-			tbcp->baudisabled = 1;
+			if (tbcp->uvhub_master == hmaster) {
+				tbcp->baudisabled = 1;
+				tbcp->set_bau_on_time =
+					tm1 + bcp->disabled_period;
+			}
 		}
 	}
-
-	spin_unlock(&disable_lock);
+	spin_unlock(&hmaster->disable_lock);
 }
 
 static void count_max_concurr(int stat, struct bau_control *bcp,
@@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
 			bcp->period_requests++;
 			bcp->period_time += elapsed;
 			if ((elapsed > congested_cycles) &&
-			    (bcp->period_requests > bcp->cong_reps))
-				disable_for_congestion(bcp, stat);
+			    (bcp->period_requests > bcp->cong_reps) &&
+			    ((bcp->period_time / bcp->period_requests) >
+							congested_cycles)) {
+				stat->s_congested++;
+				disable_for_period(bcp, stat);
+			}
 		}
 	} else
 		stat->s_requestor--;
 
 	if (completion_status == FLUSH_COMPLETE && try > 1)
 		stat->s_retriesok++;
-	else if (completion_status == FLUSH_GIVEUP)
+	else if (completion_status == FLUSH_GIVEUP) {
 		stat->s_giveup++;
+		if (get_cycles() > bcp->period_end)
+			bcp->period_giveups = 0;
+		bcp->period_giveups++;
+		if (bcp->period_giveups == 1)
+			bcp->period_end = get_cycles() + bcp->disabled_period;
+		if (bcp->period_giveups > bcp->giveup_limit) {
+			disable_for_period(bcp, stat);
+			stat->s_giveuplimit++;
+		}
+	}
 }
 
 /*
@@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
  * Returns 1 if it gives up entirely and the original cpu mask is to be
  * returned to the kernel.
  */
-int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
+	struct bau_desc *bau_desc)
 {
 	int seq_number = 0;
 	int completion_stat = 0;
@@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 	struct bau_control *hmaster = bcp->uvhub_master;
 	struct uv1_bau_msg_header *uv1_hdr = NULL;
 	struct uv2_bau_msg_header *uv2_hdr = NULL;
-	struct bau_desc *bau_desc;
 
-	if (bcp->uvhub_version == 1)
+	if (bcp->uvhub_version == 1) {
+		uv1 = 1;
 		uv1_throttle(hmaster, stat);
+	}
 
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 
 	time1 = get_cycles();
+	if (uv1)
+		uv1_hdr = &bau_desc->header.uv1_hdr;
+	else
+		uv2_hdr = &bau_desc->header.uv2_hdr;
+
 	do {
-		bau_desc = bcp->descriptor_base;
-		bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
-		if (bcp->uvhub_version == 1) {
-			uv1 = 1;
-			uv1_hdr = &bau_desc->header.uv1_hdr;
-		} else
-			uv2_hdr = &bau_desc->header.uv2_hdr;
-		if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
+		if (try == 0) {
 			if (uv1)
 				uv1_hdr->msg_type = MSG_REGULAR;
 			else
@@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 			uv1_hdr->sequence = seq_number;
 		else
 			uv2_hdr->sequence = seq_number;
-		index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
+		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
 		bcp->send_message = get_cycles();
 
 		write_mmr_activation(index);
 
 		try++;
 		completion_stat = wait_completion(bau_desc, bcp, try);
-		/* UV2: wait_completion() may change the bcp->using_desc */
 
 		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
 
 		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
 			bcp->ipi_attempts = 0;
+			stat->s_overipilimit++;
 			completion_stat = FLUSH_GIVEUP;
 			break;
 		}
 		cpu_relax();
 	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
-		 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
 		 (completion_stat == FLUSH_RETRY_TIMEOUT));
 
 	time2 = get_cycles();
@@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 }
 
 /*
- * The BAU is disabled. When the disabled time period has expired, the cpu
- * that disabled it must re-enable it.
- * Return 0 if it is re-enabled for all cpus.
+ * The BAU is disabled for this uvhub. When the disabled time period has
+ * expired re-enable it.
+ * Return 0 if it is re-enabled for all cpus on this uvhub.
  */
 static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
 {
 	int tcpu;
 	struct bau_control *tbcp;
+	struct bau_control *hmaster;
 
-	if (bcp->set_bau_off) {
-		if (get_cycles() >= bcp->set_bau_on_time) {
-			stat->s_bau_reenabled++;
-			baudisabled = 0;
-			for_each_present_cpu(tcpu) {
-				tbcp = &per_cpu(bau_control, tcpu);
+	hmaster = bcp->uvhub_master;
+	spin_lock(&hmaster->disable_lock);
+	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
+		stat->s_bau_reenabled++;
+		for_each_present_cpu(tcpu) {
+			tbcp = &per_cpu(bau_control, tcpu);
+			if (tbcp->uvhub_master == hmaster) {
 				tbcp->baudisabled = 0;
 				tbcp->period_requests = 0;
 				tbcp->period_time = 0;
+				tbcp->period_giveups = 0;
 			}
-			return 0;
 		}
+		spin_unlock(&hmaster->disable_lock);
+		return 0;
 	}
+	spin_unlock(&hmaster->disable_lock);
 	return -1;
 }
 
@@ -1068,8 +1055,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
  * done.  The returned pointer is valid till preemption is re-enabled.
  */
 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-				struct mm_struct *mm, unsigned long va,
-				unsigned int cpu)
+				struct mm_struct *mm, unsigned long start,
+				unsigned end, unsigned int cpu)
 {
 	int locals = 0;
 	int remotes = 0;
@@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
-
-	/* kernel was booted 'nobau' */
-	if (nobau)
-		return cpumask;
+	unsigned long descriptor_status;
+	unsigned long status;
 
 	bcp = &per_cpu(bau_control, cpu);
 	stat = bcp->statp;
+	stat->s_enters++;
+
+	if (bcp->nobau)
+		return cpumask;
+
+	if (bcp->busy) {
+		descriptor_status =
+			read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
+		status = ((descriptor_status >> (bcp->uvhub_cpu *
+			UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
+		if (status == UV2H_DESC_BUSY)
+			return cpumask;
+		bcp->busy = 0;
+	}
 
 	/* bau was disabled due to slow response */
 	if (bcp->baudisabled) {
-		if (check_enable(bcp, stat))
+		if (check_enable(bcp, stat)) {
+			stat->s_ipifordisabled++;
 			return cpumask;
+		}
 	}
 
 	/*
@@ -1105,38 +1106,40 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
 
 	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
 
-	bau_desc->payload.address = va;
+	bau_desc->payload.address = start;
 	bau_desc->payload.sending_cpu = cpu;
 	/*
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
 	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	if (!uv_flush_send_and_wait(flush_mask, bcp))
+	if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
 		return NULL;
 	else
 		return cpumask;
 }
 
 /*
- * Search the message queue for any 'other' message with the same software
- * acknowledge resource bit vector.
+ * Search the message queue for any 'other' unprocessed message with the
+ * same software acknowledge resource bit vector as the 'msg' message.
  */
 struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
-			struct bau_control *bcp, unsigned char swack_vec)
+					   struct bau_control *bcp)
 {
 	struct bau_pq_entry *msg_next = msg + 1;
+	unsigned char swack_vec = msg->swack_vec;
 
 	if (msg_next > bcp->queue_last)
 		msg_next = bcp->queue_first;
-	while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
-		if (msg_next->swack_vec == swack_vec)
+	while (msg_next != msg) {
+		if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
+				(msg_next->swack_vec == swack_vec))
 			return msg_next;
 		msg_next++;
 		if (msg_next > bcp->queue_last)
@@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
 		 * This message was assigned a swack resource, but no
 		 * reserved acknowlegment is pending.
 		 * The bug has prevented this message from setting the MMR.
-		 * And no other message has used the same sw_ack resource.
-		 * Do the requested shootdown but do not reply to the msg.
-		 * (the 0 means make no acknowledge)
 		 */
-		bau_process_message(mdp, bcp, 0);
-		return;
-	}
-
-	/*
-	 * Some message has set the MMR 'pending' bit; it might have been
-	 * another message.  Look for that message.
-	 */
-	other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
-	if (other_msg) {
-		/* There is another.  Do not ack the current one. */
-		bau_process_message(mdp, bcp, 0);
 		/*
-		 * Let the natural processing of that message acknowledge
-		 * it. Don't get the processing of sw_ack's out of order.
+		 * Some message has set the MMR 'pending' bit; it might have
+		 * been another message.  Look for that message.
 		 */
-		return;
+		other_msg = find_another_by_swack(msg, bcp);
+		if (other_msg) {
+			/*
+			 * There is another. Process this one but do not
+			 * ack it.
+			 */
+			bau_process_message(mdp, bcp, 0);
+			/*
+			 * Let the natural processing of that other message
+			 * acknowledge it. Don't get the processing of sw_ack's
+			 * out of order.
+			 */
+			return;
+		}
 	}
 
 	/*
-	 * There is no other message using this sw_ack, so it is safe to
-	 * acknowledge it.
+	 * Either the MMR shows this one pending a reply or there is no
+	 * other message using this sw_ack, so it is safe to acknowledge it.
 	 */
 	bau_process_message(mdp, bcp, 1);
 
@@ -1295,7 +1296,8 @@ static void __init enable_timeouts(void)
 		 */
 		mmr_image |= (1L << SOFTACK_MSHIFT);
 		if (is_uv2_hub()) {
-			mmr_image |= (1L << UV2_EXT_SHFT);
+			/* hw bug workaround; do not use extended status */
+			mmr_image &= ~(1L << UV2_EXT_SHFT);
 		}
 		write_mmr_misc_control(pnode, mmr_image);
 	}
@@ -1338,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec)
 static int ptc_seq_show(struct seq_file *file, void *data)
 {
 	struct ptc_stats *stat;
+	struct bau_control *bcp;
 	int cpu;
 
 	cpu = *(loff_t *)data;
 	if (!cpu) {
 		seq_printf(file,
-			"# cpu sent stime self locals remotes ncpus localhub ");
+		 "# cpu bauoff sent stime self locals remotes ncpus localhub ");
 		seq_printf(file,
 			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-		    "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
+			"numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
+		seq_printf(file,
+			"rok resetp resett giveup sto bz throt disable ");
 		seq_printf(file,
-			"resetp resett giveup sto bz throt swack recv rtime ");
+			"enable wars warshw warwaits enters ipidis plugged ");
 		seq_printf(file,
-			"all one mult none retry canc nocan reset rcan ");
+			"ipiover glim cong swack recv rtime all one mult ");
 		seq_printf(file,
-			"disable enable wars warshw warwaits\n");
+			"none retry canc nocan reset rcan\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
-		stat = &per_cpu(ptcstats, cpu);
+		bcp = &per_cpu(bau_control, cpu);
+		stat = bcp->statp;
 		/* source side statistics */
 		seq_printf(file,
-			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
-			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+			"cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, bcp->nobau, stat->s_requestor,
+			   cycles_2_us(stat->s_time),
 			   stat->s_ntargself, stat->s_ntarglocals,
 			   stat->s_ntargremotes, stat->s_ntargcpu,
 			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
@@ -1374,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 			   stat->s_resets_plug, stat->s_resets_timeout,
 			   stat->s_giveup, stat->s_stimeout,
 			   stat->s_busy, stat->s_throttles);
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   stat->s_bau_disabled, stat->s_bau_reenabled,
+			   stat->s_uv2_wars, stat->s_uv2_wars_hw,
+			   stat->s_uv2_war_waits, stat->s_enters,
+			   stat->s_ipifordisabled, stat->s_plugged,
+			   stat->s_overipilimit, stat->s_giveuplimit,
+			   stat->s_congested);
 
 		/* destination side statistics */
 		seq_printf(file,
-			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
 			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
 			   stat->d_requestee, cycles_2_us(stat->d_time),
 			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
 			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			   stat->d_nocanceled, stat->d_resets,
 			   stat->d_rcanceled);
-		seq_printf(file, "%ld %ld %ld %ld %ld\n",
-			stat->s_bau_disabled, stat->s_bau_reenabled,
-			stat->s_uv2_wars, stat->s_uv2_wars_hw,
-			stat->s_uv2_war_waits);
 	}
 	return 0;
 }
@@ -1401,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
 	char *buf;
 	int ret;
 
-	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
-		"max_concur plugged_delay plugsb4reset",
-		"timeoutsb4reset ipi_reset_limit complete_threshold",
-		"congested_response_us congested_reps congested_period",
+	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
+		"max_concur plugged_delay plugsb4reset timeoutsb4reset",
+		"ipi_reset_limit complete_threshold congested_response_us",
+		"congested_reps disabled_period giveup_limit",
 		max_concurr, plugged_delay, plugsb4reset,
 		timeoutsb4reset, ipi_reset_limit, complete_threshold,
-		congested_respns_us, congested_reps, congested_period);
+		congested_respns_us, congested_reps, disabled_period,
+		giveup_limit);
 
 	if (!buf)
 		return -ENOMEM;
@@ -1438,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
 		return -EFAULT;
 	optstr[count - 1] = '\0';
 
+	if (!strcmp(optstr, "on")) {
+		set_bau_on();
+		return count;
+	} else if (!strcmp(optstr, "off")) {
+		set_bau_off();
+		return count;
+	}
+
 	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
@@ -1570,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
 		bcp->complete_threshold =	complete_threshold;
 		bcp->cong_response_us =		congested_respns_us;
 		bcp->cong_reps =		congested_reps;
-		bcp->cong_period =		congested_period;
+		bcp->disabled_period =		sec_2_cycles(disabled_period);
+		bcp->giveup_limit =		giveup_limit;
 	}
 	return count;
 }
@@ -1699,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 			 *   fairness chaining multilevel count replied_to
 			 */
 		} else {
+			/*
+			 * BIOS uses legacy mode, but UV2 hardware always
+			 * uses native mode for selective broadcasts.
+			 */
 			uv2_hdr = &bd2->header.uv2_hdr;
 			uv2_hdr->swack_flag =	1;
 			uv2_hdr->base_dest_nasid =
@@ -1811,8 +1835,8 @@ static int calculate_destination_timeout(void)
 		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
 		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
 		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
-		base = timeout_base_ns[index];
-		ts_ns = base * mult1 * mult2;
+		ts_ns = timeout_base_ns[index];
+		ts_ns *= (mult1 * mult2);
 		ret = ts_ns / 1000;
 	} else {
 		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
@@ -1836,6 +1860,8 @@ static void __init init_per_cpu_tunables(void)
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
 		bcp->baudisabled		= 0;
+		if (nobau)
+			bcp->nobau		= 1;
 		bcp->statp			= &per_cpu(ptcstats, cpu);
 		/* time interval to catch a hardware stay-busy bug */
 		bcp->timeout_interval		= usec_2_cycles(2*timeout_us);
@@ -1848,10 +1874,11 @@ static void __init init_per_cpu_tunables(void)
 		bcp->complete_threshold		= complete_threshold;
 		bcp->cong_response_us		= congested_respns_us;
 		bcp->cong_reps			= congested_reps;
-		bcp->cong_period		= congested_period;
-		bcp->clocks_per_100_usec =	usec_2_cycles(100);
+		bcp->disabled_period =		sec_2_cycles(disabled_period);
+		bcp->giveup_limit =		giveup_limit;
 		spin_lock_init(&bcp->queue_lock);
 		spin_lock_init(&bcp->uvhub_lock);
+		spin_lock_init(&bcp->disable_lock);
 	}
 }
 
@@ -1972,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
 		}
 		bcp->uvhub_master = *hmasterp;
 		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
-		bcp->using_desc = bcp->uvhub_cpu;
 		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
 			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
 				bcp->uvhub_cpu);
@@ -2069,16 +2095,12 @@ static int __init uv_bau_init(void)
 	if (!is_uv_system())
 		return 0;
 
-	if (nobau)
-		return 0;
-
 	for_each_possible_cpu(cur_cpu) {
 		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
 		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
 	}
 
 	nuvhubs = uv_num_possible_blades();
-	spin_lock_init(&disable_lock);
 	congested_cycles = usec_2_cycles(congested_respns_us);
 
 	uv_base_pnode = 0x7fffffff;
@@ -2091,7 +2113,8 @@ static int __init uv_bau_init(void)
 	enable_timeouts();
 
 	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
-		nobau = 1;
+		set_bau_off();
+		nobau_perm = 1;
 		return 0;
 	}
 
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index f25c2765a5c9..acf7752da952 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	int mmr_pnode, err;
+	unsigned int dest;
 
 	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
 			sizeof(unsigned long));
@@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	if (err != 0)
 		return err;
 
+	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
+	if (err != 0)
+		return err;
+
 	if (limit == UV_AFFINITY_CPU)
 		irq_set_status_flags(irq, IRQ_NO_BALANCING);
 	else
@@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	entry->polarity		= 0;
 	entry->trigger		= 0;
 	entry->mask		= 0;
-	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu);
+	entry->dest		= dest;
 
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
 	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
 
-	return 0;
+	return IRQ_SET_MASK_OK_NOCOPY;
 }
 
 /*
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 5b84a2d30888..88692871823f 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -22,7 +22,7 @@ wakeup-objs	+= video-bios.o
 realmode-y			+= header.o
 realmode-y			+= trampoline_$(BITS).o
 realmode-y			+= stack.o
-realmode-$(CONFIG_X86_32)	+= reboot_32.o
+realmode-y			+= reboot.o
 realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)
 
 targets	+= $(realmode-y)
@@ -72,7 +72,7 @@ KBUILD_CFLAGS	:= $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \
 		   -Wall -Wstrict-prototypes \
 		   -march=i386 -mregparm=3 \
 		   -include $(srctree)/$(src)/../../boot/code16gcc.h \
-		   -fno-strict-aliasing -fomit-frame-pointer \
+		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
 		   $(call cc-option, -ffreestanding) \
 		   $(call cc-option, -fno-toplevel-reorder,\
 			$(call cc-option, -fno-unit-at-a-time)) \
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index fadf48378ada..a28221d94e69 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -6,6 +6,7 @@
 
 #include <linux/linkage.h>
 #include <asm/page_types.h>
+#include <asm/segment.h>
 
 #include "realmode.h"
 	
@@ -28,8 +29,9 @@ GLOBAL(real_mode_header)
 	.long	pa_wakeup_header
 #endif
 	/* APM/BIOS reboot */
-#ifdef CONFIG_X86_32
 	.long	pa_machine_real_restart_asm
+#ifdef CONFIG_X86_64
+	.long	__KERNEL32_CS
 #endif
 END(real_mode_header)
 
diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S
index 114044876b3d..f932ea61d1c8 100644
--- a/arch/x86/realmode/rm/reboot_32.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -2,6 +2,8 @@
 #include <linux/init.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
 #include "realmode.h"
 
 /*
@@ -12,13 +14,35 @@
  * doesn't work with at least one type of 486 motherboard.  It is easy
  * to stop this code working; hence the copious comments.
  *
- * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
+ * This code is called with the restart type (0 = BIOS, 1 = APM) in
+ * the primary argument register (%eax for 32 bit, %edi for 64 bit).
  */
 	.section ".text32", "ax"
 	.code32
-
-	.balign	16
 ENTRY(machine_real_restart_asm)
+
+#ifdef CONFIG_X86_64
+	/* Switch to trampoline GDT as it is guaranteed < 4 GiB */
+	movl	$__KERNEL_DS, %eax
+	movl	%eax, %ds
+	lgdtl	pa_tr_gdt
+
+	/* Disable paging to drop us out of long mode */
+	movl	%cr0, %eax
+	andl	$~X86_CR0_PG, %eax
+	movl	%eax, %cr0
+	ljmpl	$__KERNEL32_CS, $pa_machine_real_restart_paging_off
+
+GLOBAL(machine_real_restart_paging_off)
+	xorl	%eax, %eax
+	xorl	%edx, %edx
+	movl	$MSR_EFER, %ecx
+	wrmsr
+
+	movl	%edi, %eax
+	
+#endif /* CONFIG_X86_64 */
+	
 	/* Set up the IDT for real mode. */
 	lidtl	pa_machine_real_restart_idt
 
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 51171aeff0dc..a582bfed95bb 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -60,8 +60,8 @@
 51	common	getsockname		sys_getsockname
 52	common	getpeername		sys_getpeername
 53	common	socketpair		sys_socketpair
-54	common	setsockopt		sys_setsockopt
-55	common	getsockopt		sys_getsockopt
+54	64	setsockopt		sys_setsockopt
+55	64	getsockopt		sys_getsockopt
 56	common	clone			stub_clone
 57	common	fork			stub_fork
 58	common	vfork			stub_vfork
@@ -318,7 +318,7 @@
 309	common	getcpu			sys_getcpu
 310	64	process_vm_readv	sys_process_vm_readv
 311	64	process_vm_writev	sys_process_vm_writev
-312	64	kcmp			sys_kcmp
+312	common	kcmp			sys_kcmp
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -353,3 +353,5 @@
 538	x32	sendmmsg		compat_sys_sendmmsg
 539	x32	process_vm_readv	compat_sys_process_vm_readv
 540	x32	process_vm_writev	compat_sys_process_vm_writev
+541	x32	setsockopt		compat_sys_setsockopt
+542	x32	getsockopt		compat_sys_getsockopt
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index 950dfb7b8417..e72cd0df5ba3 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -30,10 +30,10 @@
 #define profile_pc(regs) PT_REGS_IP(regs)
 
 #define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2)
-#define UPT_SET_SYSCALL_RETURN(r, res) (UPT_AX(r) = (res))
+#define PT_REGS_SET_SYSCALL_RETURN(r, res) (PT_REGS_AX(r) = (res))
 
-static inline long regs_return_value(struct uml_pt_regs *regs)
+static inline long regs_return_value(struct pt_regs *regs)
 {
-	return UPT_AX(regs);
+	return PT_REGS_AX(regs);
 }
 #endif /* __UM_X86_PTRACE_H */
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 66e6d9359826..0faad646f5fd 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -205,9 +205,9 @@ void syscall32_cpu_init(void)
 {
 	/* Load these always in case some future AMD CPU supports
 	   SYSENTER from compat mode too. */
-	checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
-	checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 
 	wrmsrl(MSR_CSTAR, ia32_cstar_target);
 }
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index cb1b1914dbd3..2766746de274 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -38,6 +38,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -109,7 +110,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -126,6 +127,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
+struct tls_descs {
+	struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed.  Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -343,9 +357,7 @@ static void __init xen_init_cpuid_mask(void)
 	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
+		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
 	if (!xen_initial_domain())
@@ -542,12 +554,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 		BUG();
 }
 
+static inline bool desc_equal(const struct desc_struct *d1,
+			      const struct desc_struct *d2)
+{
+	return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-	struct multicall_space mc = __xen_mc_entry(0);
+	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+	struct desc_struct *gdt;
+	xmaddr_t maddr;
+	struct multicall_space mc;
+
+	if (desc_equal(shadow, &t->tls_array[i]))
+		return;
+
+	*shadow = t->tls_array[i];
+
+	gdt = get_cpu_gdt_table(cpu);
+	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
@@ -629,8 +657,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	/*
 	 * Look for known traps using IST, and substitute them
 	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault and
-	 * machine_check, so we should never see them.  Warn if
+	 * about.  Xen will handle faults like double_fault,
+	 * so we should never see them.  Warn if
 	 * there's an unexpected IST-using fault handler.
 	 */
 	if (addr == (unsigned long)debug)
@@ -645,7 +673,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/*
+		 * when xen hypervisor inject vMCE to guest,
+		 * use native mce handler to handle it
+		 */
+		;
 #endif
 	} else {
 		/* Some other trap using IST? */
@@ -1126,9 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.wbinvd = native_wbinvd,
 
 	.read_msr = native_read_msr_safe,
-	.rdmsr_regs = native_rdmsr_safe_regs,
 	.write_msr = xen_write_msr_safe,
-	.wrmsr_regs = native_wrmsr_safe_regs,
 
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
@@ -1441,32 +1471,6 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }
 
-static int init_hvm_pv_info(int *major, int *minor)
-{
-	uint32_t eax, ebx, ecx, edx, pages, msr, base;
-	u64 pfn;
-
-	base = xen_cpuid_base();
-	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
-	*major = eax >> 16;
-	*minor = eax & 0xffff;
-	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
-	cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-	xen_setup_features();
-
-	pv_info.name = "Xen HVM";
-
-	xen_domain_type = XEN_HVM_DOMAIN;
-
-	return 0;
-}
-
 void __ref xen_hvm_init_shared_info(void)
 {
 	int cpu;
@@ -1499,6 +1503,31 @@ void __ref xen_hvm_init_shared_info(void)
 }
 
 #ifdef CONFIG_XEN_PVHVM
+static void __init init_hvm_pv_info(void)
+{
+	int major, minor;
+	uint32_t eax, ebx, ecx, edx, pages, msr, base;
+	u64 pfn;
+
+	base = xen_cpuid_base();
+	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+	cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	xen_setup_features();
+
+	pv_info.name = "Xen HVM";
+
+	xen_domain_type = XEN_HVM_DOMAIN;
+}
+
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 				    unsigned long action, void *hcpu)
 {
@@ -1521,12 +1550,7 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
 
 static void __init xen_hvm_guest_init(void)
 {
-	int r;
-	int major, minor;
-
-	r = init_hvm_pv_info(&major, &minor);
-	if (r < 0)
-		return;
+	init_hvm_pv_info();
 
 	xen_hvm_init_shared_info();
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2d9e7c9c0e7b..dfc900471aef 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 
 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (!xen_batched_set_pte(ptep, pteval))
-		native_set_pte(ptep, pteval);
+	if (!xen_batched_set_pte(ptep, pteval)) {
+		/*
+		 * Could call native_set_pte() here and trap and
+		 * emulate the PTE write but with 32-bit guests this
+		 * needs two traps (one for each of the two 32-bit
+		 * words in the PTE) so do one hypercall directly
+		 * instead.
+		 */
+		struct mmu_update u;
+
+		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+		u.val = pte_val_ma(pteval);
+		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+	}
 }
 
 static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -1162,8 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
-static void __init xen_pagetable_setup_start(pgd_t *base)
+static void xen_post_allocator_init(void);
+
+static void __init xen_pagetable_init(void)
 {
+	paging_init();
+	xen_setup_shared_info();
+	xen_post_allocator_init();
 }
 
 static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
@@ -1180,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
 	}
 }
 
-static void xen_post_allocator_init(void);
-
-static void __init xen_pagetable_setup_done(pgd_t *base)
-{
-	xen_setup_shared_info();
-	xen_post_allocator_init();
-}
-
 static void xen_write_cr2(unsigned long cr2)
 {
 	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
@@ -1244,7 +1253,8 @@ static void xen_flush_tlb_single(unsigned long addr)
 }
 
 static void xen_flush_tlb_others(const struct cpumask *cpus,
-				 struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
 	struct {
 		struct mmuext_op op;
@@ -1256,7 +1266,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
 
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
@@ -1269,11 +1279,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
-	if (va == TLB_FLUSH_ALL) {
-		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	} else {
+	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 		args->op.cmd = MMUEXT_INVLPG_MULTI;
-		args->op.arg1.linear_addr = va;
+		args->op.arg1.linear_addr = start;
 	}
 
 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1416,13 +1425,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 }
 #endif /* CONFIG_X86_64 */
 
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive.  At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
-	pte = mask_rw_pte(ptep, pte);
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+		pte = mask_rw_pte(ptep, pte);
+	else
+		pte = __pte_ma(0);
 
-	xen_set_pte(ptep, pte);
+	native_set_pte(ptep, pte);
 }
 
 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
@@ -2041,8 +2065,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 void __init xen_init_mmu_ops(void)
 {
 	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
-	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
-	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
+	x86_init.paging.pagetable_init = xen_pagetable_init;
 	pv_mmu_ops = xen_mmu_ops;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 64effdc6da94..76ba0e97e530 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -194,6 +194,13 @@ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID
  * boundary violation will require three middle nodes. */
 RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
 
+/* When we populate back during bootup, the amount of pages can vary. The
+ * max we have is seen is 395979, but that does not mean it can't be more.
+ * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
+ * it can re-use Xen provided mfn_list array, so we only need to allocate at
+ * most three P2M top nodes. */
+RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
 	BUG_ON(pfn >= MAX_P2M_PFN);
@@ -570,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn)
 	}
 	return true;
 }
+
+/*
+ * Skim over the P2M tree looking at pages that are either filled with
+ * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
+ * replace the P2M leaf with a p2m_missing or p2m_identity.
+ * Stick the old page in the new P2M tree location.
+ */
+bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
+{
+	unsigned topidx;
+	unsigned mididx;
+	unsigned ident_pfns;
+	unsigned inv_pfns;
+	unsigned long *p2m;
+	unsigned long *mid_mfn_p;
+	unsigned idx;
+	unsigned long pfn;
+
+	/* We only look when this entails a P2M middle layer */
+	if (p2m_index(set_pfn))
+		return false;
+
+	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
+		topidx = p2m_top_index(pfn);
+
+		if (!p2m_top[topidx])
+			continue;
+
+		if (p2m_top[topidx] == p2m_mid_missing)
+			continue;
+
+		mididx = p2m_mid_index(pfn);
+		p2m = p2m_top[topidx][mididx];
+		if (!p2m)
+			continue;
+
+		if ((p2m == p2m_missing) || (p2m == p2m_identity))
+			continue;
+
+		if ((unsigned long)p2m == INVALID_P2M_ENTRY)
+			continue;
+
+		ident_pfns = 0;
+		inv_pfns = 0;
+		for (idx = 0; idx < P2M_PER_PAGE; idx++) {
+			/* IDENTITY_PFNs are 1:1 */
+			if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
+				ident_pfns++;
+			else if (p2m[idx] == INVALID_P2M_ENTRY)
+				inv_pfns++;
+			else
+				break;
+		}
+		if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
+			goto found;
+	}
+	return false;
+found:
+	/* Found one, replace old with p2m_identity or p2m_missing */
+	p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
+	/* And the other for save/restore.. */
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	/* NOTE: Even if it is a p2m_identity it should still be point to
+	 * a page filled with INVALID_P2M_ENTRY entries. */
+	mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
+
+	/* Reset where we want to stick the old page in. */
+	topidx = p2m_top_index(set_pfn);
+	mididx = p2m_mid_index(set_pfn);
+
+	/* This shouldn't happen */
+	if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
+		early_alloc_p2m(set_pfn);
+
+	if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
+		return false;
+
+	p2m_init(p2m);
+	p2m_top[topidx][mididx] = p2m;
+	mid_mfn_p = p2m_top_mfn_p[topidx];
+	mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
+	return true;
+}
 bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 		if (!early_alloc_p2m(pfn))
 			return false;
 
+		if (early_can_reuse_p2m_middle(pfn, mfn))
+			return __set_phys_to_machine(pfn, mfn);
+
 		if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/))
 			return false;
 
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a4790bf22c59..d11ca11d14fc 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
 	memblock_reserve(start, size);
 
 	xen_max_p2m_pfn = PFN_DOWN(start + size);
+	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
+		unsigned long mfn = pfn_to_mfn(pfn);
+
+		if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+			continue;
+		WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+			pfn, mfn);
 
-	for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
 		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+	}
 }
 
 static unsigned long __init xen_do_chunk(unsigned long start,
@@ -157,25 +164,24 @@ static unsigned long __init xen_populate_chunk(
 	unsigned long dest_pfn;
 
 	for (i = 0, entry = list; i < map_size; i++, entry++) {
-		unsigned long credits = credits_left;
 		unsigned long s_pfn;
 		unsigned long e_pfn;
 		unsigned long pfns;
 		long capacity;
 
-		if (credits <= 0)
+		if (credits_left <= 0)
 			break;
 
 		if (entry->type != E820_RAM)
 			continue;
 
-		e_pfn = PFN_UP(entry->addr + entry->size);
+		e_pfn = PFN_DOWN(entry->addr + entry->size);
 
 		/* We only care about E820 after the xen_start_info->nr_pages */
 		if (e_pfn <= max_pfn)
 			continue;
 
-		s_pfn = PFN_DOWN(entry->addr);
+		s_pfn = PFN_UP(entry->addr);
 		/* If the E820 falls within the nr_pages, we want to start
 		 * at the nr_pages PFN.
 		 * If that would mean going past the E820 entry, skip it
@@ -184,23 +190,19 @@ static unsigned long __init xen_populate_chunk(
 			capacity = e_pfn - max_pfn;
 			dest_pfn = max_pfn;
 		} else {
-			/* last_pfn MUST be within E820_RAM regions */
-			if (*last_pfn && e_pfn >= *last_pfn)
-				s_pfn = *last_pfn;
 			capacity = e_pfn - s_pfn;
 			dest_pfn = s_pfn;
 		}
-		/* If we had filled this E820_RAM entry, go to the next one. */
-		if (capacity <= 0)
-			continue;
 
-		if (credits > capacity)
-			credits = capacity;
+		if (credits_left < capacity)
+			capacity = credits_left;
 
-		pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
+		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
 		done += pfns;
-		credits_left -= pfns;
 		*last_pfn = (dest_pfn + pfns);
+		if (pfns < capacity)
+			break;
+		credits_left -= pfns;
 	}
 	return done;
 }
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index afb250d22a6b..f58dca7a6e52 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void)
 
 	notify_cpu_starting(cpu);
 
-	ipi_call_lock();
 	set_cpu_online(cpu, true);
-	ipi_call_unlock();
 
 	this_cpu_write(cpu_state, CPU_ONLINE);