Diffstat (limited to 'arch'): 576 files changed, 6674 insertions, 2349 deletions
diff --git a/arch/alpha/include/uapi/asm/Kbuild b/arch/alpha/include/uapi/asm/Kbuild index b15bf6bc0e94..14a2e9af97e9 100644 --- a/arch/alpha/include/uapi/asm/Kbuild +++ b/arch/alpha/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm + +generic-y += bpf_perf_event.h diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 4e6e9f57e790..dc91c663bcc0 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -35,6 +35,14 @@  			reg = <0x80 0x10>, <0x100 0x10>;  			#clock-cells = <0>;  			clocks = <&input_clk>; + +			/* +			 * Set initial core pll output frequency to 90MHz. +			 * It will be applied at the core pll driver probing +			 * on early boot. +			 */ +			assigned-clocks = <&core_clk>; +			assigned-clock-rates = <90000000>;  		};  		core_intc: archs-intc@cpu { diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 63954a8b0100..69ff4895f2ba 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -35,6 +35,14 @@  			reg = <0x80 0x10>, <0x100 0x10>;  			#clock-cells = <0>;  			clocks = <&input_clk>; + +			/* +			 * Set initial core pll output frequency to 100MHz. +			 * It will be applied at the core pll driver probing +			 * on early boot. +			 */ +			assigned-clocks = <&core_clk>; +			assigned-clock-rates = <100000000>;  		};  		core_intc: archs-intc@cpu { diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 8f627c200d60..006aa3de5348 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -114,6 +114,14 @@  			reg = <0x00 0x10>, <0x14B8 0x4>;  			#clock-cells = <0>;  			clocks = <&input_clk>; + +			/* +			 * Set initial core pll output frequency to 1GHz. +			 * It will be applied at the core pll driver probing +			 * on early boot. 
+			 */ +			assigned-clocks = <&core_clk>; +			assigned-clock-rates = <1000000000>;  		};  		serial: serial@5000 { diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 7b8f8faf8a24..ac6b0ed8341e 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -49,10 +49,11 @@ CONFIG_SERIAL_8250_DW=y  CONFIG_SERIAL_OF_PLATFORM=y  # CONFIG_HW_RANDOM is not set  # CONFIG_HWMON is not set +CONFIG_DRM=y +# CONFIG_DRM_FBDEV_EMULATION is not set +CONFIG_DRM_UDL=y  CONFIG_FB=y -CONFIG_FB_UDL=y  CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_USB=y  CONFIG_USB_EHCI_HCD=y  CONFIG_USB_EHCI_HCD_PLATFORM=y  CONFIG_USB_OHCI_HCD=y diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index f35974ee7264..c9173c02081c 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)  		return 0;  	__asm__ __volatile__( +	"	mov	lp_count, %5		\n"  	"	lp	3f			\n"  	"1:	ldb.ab  %3, [%2, 1]		\n"  	"	breq.d	%3, 0, 3f               \n" @@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)  	"	.word   1b, 4b			\n"  	"	.previous			\n"  	: "+r"(res), "+r"(dst), "+r"(src), "=r"(val) -	: "g"(-EFAULT), "l"(count) -	: "memory"); +	: "g"(-EFAULT), "r"(count) +	: "lp_count", "lp_start", "lp_end", "memory");  	return res;  } diff --git a/arch/arc/include/uapi/asm/Kbuild b/arch/arc/include/uapi/asm/Kbuild index fa6d0ff4ff89..170b5db64afe 100644 --- a/arch/arc/include/uapi/asm/Kbuild +++ b/arch/arc/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 7ef7d9a8ff89..9d27331fe69a 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -199,7 +199,7 @@ static void read_arc_build_cfg_regs(void)  			unsigned int exec_ctrl;  			READ_BCR(AUX_EXEC_CTRL, exec_ctrl); -			cpu->extn.dual_enb = exec_ctrl & 1; +			cpu->extn.dual_enb = !(exec_ctrl & 1);  			/* dual issue always present for this core */  			cpu->extn.dual = 1; diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 74315f302971..bf40e06f3fb8 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -163,7 +163,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,   */  static int __print_sym(unsigned int address, void *unused)  { -	__print_symbol("  %s\n", address); +	printk("  %pS\n", (void *)address);  	return 0;  } diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index bcd7c9fc5d0f..133a4dae41fe 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -83,6 +83,7 @@ DO_ERROR_INFO(SIGILL, "Illegal Insn (or Seq)", insterror_is_error, ILL_ILLOPC)  DO_ERROR_INFO(SIGBUS, "Invalid Mem Access", __weak do_memory_error, BUS_ADRERR)  DO_ERROR_INFO(SIGTRAP, "Breakpoint Set", trap_is_brkpt, TRAP_BRKPT)  DO_ERROR_INFO(SIGBUS, "Misaligned Access", do_misaligned_error, BUS_ADRALN) +DO_ERROR_INFO(SIGSEGV, "gcc generated __builtin_trap", do_trap5_error, 0)  /*   * Entry Point for Misaligned Data access Exception, for emulating in software @@ -115,6 +116,8 @@ void do_machine_check_fault(unsigned long address, struct pt_regs *regs)   * Thus TRAP_S <n> can be used for specific purpose   *  -1 used for software breakpointing (gdb)   *  -2 used by 
kprobes + *  -5 __builtin_trap() generated by gcc (2018.03 onwards) for toggle such as + *     -fno-isolate-erroneous-paths-dereference   */  void do_non_swi_trap(unsigned long address, struct pt_regs *regs)  { @@ -134,6 +137,9 @@ void do_non_swi_trap(unsigned long address, struct pt_regs *regs)  		kgdb_trap(regs);  		break; +	case 5: +		do_trap5_error(address, regs); +		break;  	default:  		break;  	} @@ -155,3 +161,11 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs)  	insterror_is_error(address, regs);  } + +/* + * abort() call generated by older gcc for __builtin_trap() + */ +void abort(void) +{ +	__asm__ __volatile__("trap_s  5\n"); +} diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 7d8c1d6c2f60..6e9a0a9a6a04 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -163,6 +163,9 @@ static void show_ecr_verbose(struct pt_regs *regs)  		else  			pr_cont("Bus Error, check PRM\n");  #endif +	} else if (vec == ECR_V_TRAP) { +		if (regs->ecr_param == 5) +			pr_cont("gcc generated __builtin_trap\n");  	} else {  		pr_cont("Check Programmer's Manual\n");  	} diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index f1ac6790da5f..46544e88492d 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -317,25 +317,23 @@ static void __init axs103_early_init(void)  	 * Instead of duplicating defconfig/DT for SMP/QUAD, add a small hack  	 * of fudging the freq in DT  	 */ +#define AXS103_QUAD_CORE_CPU_FREQ_HZ	50000000 +  	unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;  	if (num_cores > 2) { -		u32 freq = 50, orig; -		/* -		 * TODO: use cpu node "cpu-freq" param instead of platform-specific -		 * "/cpu_card/core_clk" as it works only if we use fixed-clock for cpu. 
-		 */ +		u32 freq;  		int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk");  		const struct fdt_property *prop;  		prop = fdt_get_property(initial_boot_params, off, -					"clock-frequency", NULL); -		orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000; +					"assigned-clock-rates", NULL); +		freq = be32_to_cpu(*(u32 *)(prop->data));  		/* Patching .dtb in-place with new core clock value */ -		if (freq != orig ) { -			freq = cpu_to_be32(freq * 1000000); +		if (freq != AXS103_QUAD_CORE_CPU_FREQ_HZ) { +			freq = cpu_to_be32(AXS103_QUAD_CORE_CPU_FREQ_HZ);  			fdt_setprop_inplace(initial_boot_params, off, -					    "clock-frequency", &freq, sizeof(freq)); +					    "assigned-clock-rates", &freq, sizeof(freq));  		}  	}  #endif diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index fd0ae5e38639..2958aedb649a 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -38,42 +38,6 @@ static void __init hsdk_init_per_cpu(unsigned int cpu)  #define CREG_PAE		(CREG_BASE + 0x180)  #define CREG_PAE_UPDATE		(CREG_BASE + 0x194) -#define CREG_CORE_IF_CLK_DIV	(CREG_BASE + 0x4B8) -#define CREG_CORE_IF_CLK_DIV_2	0x1 -#define CGU_BASE		ARC_PERIPHERAL_BASE -#define CGU_PLL_STATUS		(ARC_PERIPHERAL_BASE + 0x4) -#define CGU_PLL_CTRL		(ARC_PERIPHERAL_BASE + 0x0) -#define CGU_PLL_STATUS_LOCK	BIT(0) -#define CGU_PLL_STATUS_ERR	BIT(1) -#define CGU_PLL_CTRL_1GHZ	0x3A10 -#define HSDK_PLL_LOCK_TIMEOUT	500 - -#define HSDK_PLL_LOCKED() \ -	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK) - -#define HSDK_PLL_ERR() \ -	!!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR) - -static void __init hsdk_set_cpu_freq_1ghz(void) -{ -	u32 timeout = HSDK_PLL_LOCK_TIMEOUT; - -	/* -	 * As we set cpu clock which exceeds 500MHz, the divider for the interface -	 * clock must be programmed to div-by-2. -	 */ -	iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV); - -	/* Set cpu clock to 1GHz */ -	iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL); - -	while (!HSDK_PLL_LOCKED() && timeout--) -		cpu_relax(); - -	if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR()) -		pr_err("Failed to setup CPU frequency to 1GHz!"); -} -  #define SDIO_BASE		(ARC_PERIPHERAL_BASE + 0xA000)  #define SDIO_UHS_REG_EXT	(SDIO_BASE + 0x108)  #define SDIO_UHS_REG_EXT_DIV_2	(2 << 30) @@ -98,12 +62,6 @@ static void __init hsdk_init_early(void)  	 * minimum possible div-by-2.  	 */  	iowrite32(SDIO_UHS_REG_EXT_DIV_2, (void __iomem *) SDIO_UHS_REG_EXT); - -	/* -	 * Setup CPU frequency to 1GHz. -	 * TODO: remove it after smart hsdk pll driver will be introduced. 
-	 */ -	hsdk_set_cpu_freq_1ghz();  }  static const char *hsdk_compat[] __initconst = { diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 1b81c4e75772..d37f95025807 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -630,6 +630,7 @@  				reg-names = "phy";  				status = "disabled";  				ti,ctrl_mod = <&usb_ctrl_mod>; +				#phy-cells = <0>;  			};  			usb0: usb@47401000 { @@ -678,6 +679,7 @@  				reg-names = "phy";  				status = "disabled";  				ti,ctrl_mod = <&usb_ctrl_mod>; +				#phy-cells = <0>;  			};  			usb1: usb@47401800 { diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi index e5b061469bf8..4714a59fd86d 100644 --- a/arch/arm/boot/dts/am4372.dtsi +++ b/arch/arm/boot/dts/am4372.dtsi @@ -927,7 +927,8 @@  			reg = <0x48038000 0x2000>,  			      <0x46000000 0x400000>;  			reg-names = "mpu", "dat"; -			interrupts = <80>, <81>; +			interrupts = <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>, +				     <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;  			interrupt-names = "tx", "rx";  			status = "disabled";  			dmas = <&edma 8 2>, @@ -941,7 +942,8 @@  			reg = <0x4803C000 0x2000>,  			      <0x46400000 0x400000>;  			reg-names = "mpu", "dat"; -			interrupts = <82>, <83>; +			interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>, +				     <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;  			interrupt-names = "tx", "rx";  			status = "disabled";  			dmas = <&edma 10 2>, diff --git a/arch/arm/boot/dts/am437x-cm-t43.dts b/arch/arm/boot/dts/am437x-cm-t43.dts index 9e92d480576b..3b9a94c274a7 100644 --- a/arch/arm/boot/dts/am437x-cm-t43.dts +++ b/arch/arm/boot/dts/am437x-cm-t43.dts @@ -301,8 +301,8 @@  	status = "okay";  	pinctrl-names = "default";  	pinctrl-0 = <&spi0_pins>; -	dmas = <&edma 16 -		&edma 17>; +	dmas = <&edma 16 0 +		&edma 17 0>;  	dma-names = "tx0", "rx0";  	flash: w25q64cvzpig@0 { diff --git a/arch/arm/boot/dts/armada-385-db-ap.dts b/arch/arm/boot/dts/armada-385-db-ap.dts index 25d2d720dc0e..678aa023335d 100644 --- a/arch/arm/boot/dts/armada-385-db-ap.dts +++ b/arch/arm/boot/dts/armada-385-db-ap.dts @@ -236,6 +236,7 @@  	usb3_phy: usb3_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_xhci0_vbus>; +		#phy-cells = <0>;  	};  	reg_xhci0_vbus: xhci0-vbus { diff --git a/arch/arm/boot/dts/armada-385-linksys.dtsi b/arch/arm/boot/dts/armada-385-linksys.dtsi index e1f355ffc8f7..434dc9aaa5e4 100644 --- a/arch/arm/boot/dts/armada-385-linksys.dtsi +++ b/arch/arm/boot/dts/armada-385-linksys.dtsi @@ -66,6 +66,7 @@  	usb3_1_phy: usb3_1-phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <&usb3_1_vbus>; +		#phy-cells = <0>;  	};  	usb3_1_vbus: usb3_1-vbus { diff --git a/arch/arm/boot/dts/armada-385-synology-ds116.dts b/arch/arm/boot/dts/armada-385-synology-ds116.dts index 36ad571e76f3..0a3552ebda3b 100644 --- a/arch/arm/boot/dts/armada-385-synology-ds116.dts +++ b/arch/arm/boot/dts/armada-385-synology-ds116.dts @@ -191,11 +191,13 @@  	usb3_0_phy: usb3_0_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_usb3_0_vbus>; +		#phy-cells = <0>;  	};  	usb3_1_phy: usb3_1_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_usb3_1_vbus>; +		#phy-cells = <0>;  	};  	reg_usb3_0_vbus: usb3-vbus0 { diff --git a/arch/arm/boot/dts/armada-388-gp.dts b/arch/arm/boot/dts/armada-388-gp.dts index f503955dbd3b..51b4ee6df130 100644 --- a/arch/arm/boot/dts/armada-388-gp.dts +++ b/arch/arm/boot/dts/armada-388-gp.dts @@ -276,11 +276,13 @@  	usb2_1_phy: usb2_1_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_usb2_1_vbus>; +		#phy-cells = <0>;  	};  	
usb3_phy: usb3_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_usb3_vbus>; +		#phy-cells = <0>;  	};  	reg_usb3_vbus: usb3-vbus { diff --git a/arch/arm/boot/dts/aspeed-g4.dtsi b/arch/arm/boot/dts/aspeed-g4.dtsi index 45d815a86d42..de08d9045cb8 100644 --- a/arch/arm/boot/dts/aspeed-g4.dtsi +++ b/arch/arm/boot/dts/aspeed-g4.dtsi @@ -219,7 +219,7 @@  				compatible = "aspeed,ast2400-vuart";  				reg = <0x1e787000 0x40>;  				reg-shift = <2>; -				interrupts = <10>; +				interrupts = <8>;  				clocks = <&clk_uart>;  				no-loopback-test;  				status = "disabled"; diff --git a/arch/arm/boot/dts/at91-tse850-3.dts b/arch/arm/boot/dts/at91-tse850-3.dts index 5f29010cdbd8..9b82cc8843e1 100644 --- a/arch/arm/boot/dts/at91-tse850-3.dts +++ b/arch/arm/boot/dts/at91-tse850-3.dts @@ -221,6 +221,7 @@  	jc42@18 {  		compatible = "nxp,se97b", "jedec,jc-42.4-temp";  		reg = <0x18>; +		smbus-timeout-disable;  	};  	dpot: mcp4651-104@28 { diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi index 528b9e3bc1da..dcc55aa84583 100644 --- a/arch/arm/boot/dts/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/bcm-nsp.dtsi @@ -85,7 +85,7 @@  		timer@20200 {  			compatible = "arm,cortex-a9-global-timer";  			reg = <0x20200 0x100>; -			interrupts = <GIC_PPI 11 IRQ_TYPE_LEVEL_HIGH>; +			interrupts = <GIC_PPI 11 IRQ_TYPE_EDGE_RISING>;  			clocks = <&periph_clk>;  		}; @@ -93,7 +93,7 @@  			compatible = "arm,cortex-a9-twd-timer";  			reg = <0x20600 0x20>;  			interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(2) | -						  IRQ_TYPE_LEVEL_HIGH)>; +						  IRQ_TYPE_EDGE_RISING)>;  			clocks = <&periph_clk>;  		}; diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi index 013431e3d7c3..dcde93c85c2d 100644 --- a/arch/arm/boot/dts/bcm283x.dtsi +++ b/arch/arm/boot/dts/bcm283x.dtsi @@ -639,5 +639,6 @@  	usbphy: phy {  		compatible = "usb-nop-xceiv"; +		#phy-cells = <0>;  	};  }; diff --git a/arch/arm/boot/dts/bcm958623hr.dts b/arch/arm/boot/dts/bcm958623hr.dts index 3bc50849d013..b8bde13de90a 100644 --- a/arch/arm/boot/dts/bcm958623hr.dts +++ b/arch/arm/boot/dts/bcm958623hr.dts @@ -141,10 +141,6 @@  	status = "okay";  }; -&sata { -	status = "okay"; -}; -  &qspi {  	bspi-sel = <0>;  	flash: m25p80@0 { diff --git a/arch/arm/boot/dts/bcm958625hr.dts b/arch/arm/boot/dts/bcm958625hr.dts index d94d14b3c745..6a44b8021702 100644 --- a/arch/arm/boot/dts/bcm958625hr.dts +++ b/arch/arm/boot/dts/bcm958625hr.dts @@ -177,10 +177,6 @@  	status = "okay";  }; -&sata { -	status = "okay"; -}; -  &srab {  	compatible = "brcm,bcm58625-srab", "brcm,nsp-srab";  	status = "okay"; diff --git a/arch/arm/boot/dts/da850-lego-ev3.dts b/arch/arm/boot/dts/da850-lego-ev3.dts index 413dbd5d9f64..81942ae83e1f 100644 --- a/arch/arm/boot/dts/da850-lego-ev3.dts +++ b/arch/arm/boot/dts/da850-lego-ev3.dts @@ -178,7 +178,7 @@  	 */  	battery {  		pinctrl-names = "default"; -		pintctrl-0 = <&battery_pins>; +		pinctrl-0 = <&battery_pins>;  		compatible = "lego,ev3-battery";  		io-channels = <&adc 4>, <&adc 3>;  		io-channel-names = "voltage", "current"; @@ -392,7 +392,7 @@  	batt_volt_en {  		gpio-hog;  		gpios = <6 GPIO_ACTIVE_HIGH>; -		output-low; +		output-high;  	};  }; diff --git a/arch/arm/boot/dts/dm814x.dtsi b/arch/arm/boot/dts/dm814x.dtsi index 9708157f5daf..681f5487406e 100644 --- a/arch/arm/boot/dts/dm814x.dtsi +++ b/arch/arm/boot/dts/dm814x.dtsi @@ -75,6 +75,7 @@  				reg = <0x47401300 0x100>;  				reg-names = "phy";  				ti,ctrl_mod = <&usb_ctrl_mod>; +				#phy-cells = <0>;  			};  			usb0: usb@47401000 { @@ -385,6 
+386,7 @@  					reg = <0x1b00 0x100>;  					reg-names = "phy";  					ti,ctrl_mod = <&usb_ctrl_mod>; +					#phy-cells = <0>;  				};  			}; diff --git a/arch/arm/boot/dts/exynos5800-peach-pi.dts b/arch/arm/boot/dts/exynos5800-peach-pi.dts index b2b95ff205e8..0029ec27819c 100644 --- a/arch/arm/boot/dts/exynos5800-peach-pi.dts +++ b/arch/arm/boot/dts/exynos5800-peach-pi.dts @@ -664,6 +664,10 @@  	status = "okay";  }; +&mixer { +	status = "okay"; +}; +  /* eMMC flash */  &mmc_0 {  	status = "okay"; diff --git a/arch/arm/boot/dts/imx53.dtsi b/arch/arm/boot/dts/imx53.dtsi index 589a67c5f796..84f17f7abb71 100644 --- a/arch/arm/boot/dts/imx53.dtsi +++ b/arch/arm/boot/dts/imx53.dtsi @@ -433,15 +433,6 @@  				clock-names = "ipg", "per";  			}; -			srtc: srtc@53fa4000 { -				compatible = "fsl,imx53-rtc", "fsl,imx25-rtc"; -				reg = <0x53fa4000 0x4000>; -				interrupts = <24>; -				interrupt-parent = <&tzic>; -				clocks = <&clks IMX5_CLK_SRTC_GATE>; -				clock-names = "ipg"; -			}; -  			iomuxc: iomuxc@53fa8000 {  				compatible = "fsl,imx53-iomuxc";  				reg = <0x53fa8000 0x4000>; diff --git a/arch/arm/boot/dts/logicpd-som-lv-37xx-devkit.dts b/arch/arm/boot/dts/logicpd-som-lv-37xx-devkit.dts index 38faa90007d7..2fa5eb4bd402 100644 --- a/arch/arm/boot/dts/logicpd-som-lv-37xx-devkit.dts +++ b/arch/arm/boot/dts/logicpd-som-lv-37xx-devkit.dts @@ -72,7 +72,8 @@  };  &gpmc { -	ranges = <1 0 0x08000000 0x1000000>;	/* CS1: 16MB for LAN9221 */ +	ranges = <0 0 0x30000000 0x1000000	/* CS0: 16MB for NAND */ +		  1 0 0x2c000000 0x1000000>;	/* CS1: 16MB for LAN9221 */  	ethernet@gpmc {  		pinctrl-names = "default"; diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi index 26cce4d18405..29cb804d10cc 100644 --- a/arch/arm/boot/dts/logicpd-som-lv.dtsi +++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi @@ -33,11 +33,12 @@  	hsusb2_phy: hsusb2_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>; /* gpio_4 */ +		#phy-cells = <0>;  	};  };  &gpmc { -	ranges = <0 0 0x00000000 0x1000000>;	/* CS0: 16MB for NAND */ +	ranges = <0 0 0x30000000 0x1000000>;	/* CS0: 16MB for NAND */  	nand@0,0 {  		compatible = "ti,omap2-nand"; @@ -121,7 +122,7 @@  &mmc3 {  	interrupts-extended = <&intc 94 &omap3_pmx_core2 0x46>; -	pinctrl-0 = <&mmc3_pins>; +	pinctrl-0 = <&mmc3_pins &wl127x_gpio>;  	pinctrl-names = "default";  	vmmc-supply = <&wl12xx_vmmc>;  	non-removable; @@ -132,8 +133,8 @@  	wlcore: wlcore@2 {  		compatible = "ti,wl1273";  		reg = <2>; -		interrupt-parent = <&gpio5>; -		interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; /* gpio 152 */ +		interrupt-parent = <&gpio1>; +		interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; /* gpio 2 */  		ref-clock-frequency = <26000000>;  	};  }; @@ -157,8 +158,6 @@  			OMAP3_CORE1_IOPAD(0x2166, PIN_INPUT_PULLUP | MUX_MODE3)	/* sdmmc2_dat5.sdmmc3_dat1 */  			OMAP3_CORE1_IOPAD(0x2168, PIN_INPUT_PULLUP | MUX_MODE3)	/* sdmmc2_dat6.sdmmc3_dat2 */  			OMAP3_CORE1_IOPAD(0x216a, PIN_INPUT_PULLUP | MUX_MODE3)	/* sdmmc2_dat6.sdmmc3_dat3 */ -			OMAP3_CORE1_IOPAD(0x2184, PIN_INPUT_PULLUP | MUX_MODE4)	/* mcbsp4_clkx.gpio_152 */ -			OMAP3_CORE1_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4)	/* sys_boot1.gpio_3 */  			OMAP3_CORE1_IOPAD(0x21d0, PIN_INPUT_PULLUP | MUX_MODE3) /* mcspi1_cs1.sdmmc3_cmd */  			OMAP3_CORE1_IOPAD(0x21d2, PIN_INPUT_PULLUP | MUX_MODE3)	/* mcspi1_cs2.sdmmc_clk */  		>; @@ -228,6 +227,12 @@  			OMAP3_WKUP_IOPAD(0x2a0e, PIN_OUTPUT | MUX_MODE4)	/* sys_boot2.gpio_4 */  		>;  	}; +	wl127x_gpio: pinmux_wl127x_gpio_pin { +		pinctrl-single,pins = < +			
OMAP3_WKUP_IOPAD(0x2a0c, PIN_INPUT | MUX_MODE4)		/* sys_boot0.gpio_2 */ +			OMAP3_WKUP_IOPAD(0x2a0c, PIN_OUTPUT | MUX_MODE4)	/* sys_boot1.gpio_3 */ +		>; +	};  };  &omap3_pmx_core2 { diff --git a/arch/arm/boot/dts/ls1021a-qds.dts b/arch/arm/boot/dts/ls1021a-qds.dts index 940875316d0f..67b4de0e3439 100644 --- a/arch/arm/boot/dts/ls1021a-qds.dts +++ b/arch/arm/boot/dts/ls1021a-qds.dts @@ -215,7 +215,7 @@  				reg = <0x2a>;  				VDDA-supply = <®_3p3v>;  				VDDIO-supply = <®_3p3v>; -				clocks = <&sys_mclk 1>; +				clocks = <&sys_mclk>;  			};  		};  	}; diff --git a/arch/arm/boot/dts/ls1021a-twr.dts b/arch/arm/boot/dts/ls1021a-twr.dts index a8b148ad1dd2..44715c8ef756 100644 --- a/arch/arm/boot/dts/ls1021a-twr.dts +++ b/arch/arm/boot/dts/ls1021a-twr.dts @@ -187,7 +187,7 @@  		reg = <0x0a>;  		VDDA-supply = <®_3p3v>;  		VDDIO-supply = <®_3p3v>; -		clocks = <&sys_mclk 1>; +		clocks = <&sys_mclk>;  	};  }; diff --git a/arch/arm/boot/dts/meson.dtsi b/arch/arm/boot/dts/meson.dtsi index 4926133077b3..0d9faf1a51ea 100644 --- a/arch/arm/boot/dts/meson.dtsi +++ b/arch/arm/boot/dts/meson.dtsi @@ -85,15 +85,6 @@  				reg = <0x7c00 0x200>;  			}; -			gpio_intc: interrupt-controller@9880 { -				compatible = "amlogic,meson-gpio-intc"; -				reg = <0xc1109880 0x10>; -				interrupt-controller; -				#interrupt-cells = <2>; -				amlogic,channel-interrupts = <64 65 66 67 68 69 70 71>; -				status = "disabled"; -			}; -  			hwrng: rng@8100 {  				compatible = "amlogic,meson-rng";  				reg = <0x8100 0x8>; @@ -191,6 +182,15 @@  				status = "disabled";  			}; +			gpio_intc: interrupt-controller@9880 { +				compatible = "amlogic,meson-gpio-intc"; +				reg = <0x9880 0x10>; +				interrupt-controller; +				#interrupt-cells = <2>; +				amlogic,channel-interrupts = <64 65 66 67 68 69 70 71>; +				status = "disabled"; +			}; +  			wdt: watchdog@9900 {  				compatible = "amlogic,meson6-wdt";  				reg = <0x9900 0x8>; diff --git a/arch/arm/boot/dts/nspire.dtsi b/arch/arm/boot/dts/nspire.dtsi index ec2283b1a638..1a5ae4cd107f 100644 --- a/arch/arm/boot/dts/nspire.dtsi +++ b/arch/arm/boot/dts/nspire.dtsi @@ -56,6 +56,7 @@  	usb_phy: usb_phy {  		compatible = "usb-nop-xceiv"; +		#phy-cells = <0>;  	};  	vbus_reg: vbus_reg { diff --git a/arch/arm/boot/dts/omap3-beagle-xm.dts b/arch/arm/boot/dts/omap3-beagle-xm.dts index 683b96a8f73e..0349fcc9dc26 100644 --- a/arch/arm/boot/dts/omap3-beagle-xm.dts +++ b/arch/arm/boot/dts/omap3-beagle-xm.dts @@ -90,6 +90,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio5 19 GPIO_ACTIVE_LOW>; /* gpio_147 */  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	tfp410: encoder0 { diff --git a/arch/arm/boot/dts/omap3-beagle.dts b/arch/arm/boot/dts/omap3-beagle.dts index 4d2eaf843fa9..3ca8991a6c3e 100644 --- a/arch/arm/boot/dts/omap3-beagle.dts +++ b/arch/arm/boot/dts/omap3-beagle.dts @@ -64,6 +64,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio5 19 GPIO_ACTIVE_LOW>;	/* gpio_147 */  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	sound { diff --git a/arch/arm/boot/dts/omap3-cm-t3x.dtsi b/arch/arm/boot/dts/omap3-cm-t3x.dtsi index 31d5ebf38892..ab6003fe5a43 100644 --- a/arch/arm/boot/dts/omap3-cm-t3x.dtsi +++ b/arch/arm/boot/dts/omap3-cm-t3x.dtsi @@ -43,12 +43,14 @@  	hsusb1_phy: hsusb1_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <&hsusb1_power>; +		#phy-cells = <0>;  	};  	/* HS USB Host PHY on PORT 2 */  	hsusb2_phy: hsusb2_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	ads7846reg: ads7846-reg { 
diff --git a/arch/arm/boot/dts/omap3-evm-common.dtsi b/arch/arm/boot/dts/omap3-evm-common.dtsi index dbc3f030a16c..ee64191e41ca 100644 --- a/arch/arm/boot/dts/omap3-evm-common.dtsi +++ b/arch/arm/boot/dts/omap3-evm-common.dtsi @@ -29,6 +29,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio1 21 GPIO_ACTIVE_LOW>; /* gpio_21 */  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	leds { diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index 4504908c23fe..3dc56fb156b7 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -120,6 +120,7 @@  	hsusb2_phy: hsusb2_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio6 14 GPIO_ACTIVE_LOW>; +		#phy-cells = <0>;  	};  	tv0: connector { diff --git a/arch/arm/boot/dts/omap3-igep0020-common.dtsi b/arch/arm/boot/dts/omap3-igep0020-common.dtsi index 667f96245729..ecbec23af49f 100644 --- a/arch/arm/boot/dts/omap3-igep0020-common.dtsi +++ b/arch/arm/boot/dts/omap3-igep0020-common.dtsi @@ -58,6 +58,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio1 24 GPIO_ACTIVE_LOW>; /* gpio_24 */  		vcc-supply = <&hsusb1_power>; +		#phy-cells = <0>;  	};  	tfp410: encoder { diff --git a/arch/arm/boot/dts/omap3-igep0030-common.dtsi b/arch/arm/boot/dts/omap3-igep0030-common.dtsi index e94d9427450c..443f71707437 100644 --- a/arch/arm/boot/dts/omap3-igep0030-common.dtsi +++ b/arch/arm/boot/dts/omap3-igep0030-common.dtsi @@ -37,6 +37,7 @@  	hsusb2_phy: hsusb2_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio2 22 GPIO_ACTIVE_LOW>;		/* gpio_54 */ +		#phy-cells = <0>;  	};  }; diff --git a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi index 343a36d8031d..7ada1e93e166 100644 --- a/arch/arm/boot/dts/omap3-lilly-a83x.dtsi +++ b/arch/arm/boot/dts/omap3-lilly-a83x.dtsi @@ -51,6 +51,7 @@  	hsusb1_phy: hsusb1_phy {  		compatible = "usb-nop-xceiv";  		vcc-supply = <®_vcc3>; +		#phy-cells = <0>;  	};  }; diff --git a/arch/arm/boot/dts/omap3-overo-base.dtsi b/arch/arm/boot/dts/omap3-overo-base.dtsi index f25e158e7163..ac141fcd1742 100644 --- a/arch/arm/boot/dts/omap3-overo-base.dtsi +++ b/arch/arm/boot/dts/omap3-overo-base.dtsi @@ -51,6 +51,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio6 23 GPIO_ACTIVE_LOW>;	/* gpio_183 */  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	/* Regulator to trigger the nPoweron signal of the Wifi module */ diff --git a/arch/arm/boot/dts/omap3-pandora-common.dtsi b/arch/arm/boot/dts/omap3-pandora-common.dtsi index 53e007abdc71..cd53dc6c0051 100644 --- a/arch/arm/boot/dts/omap3-pandora-common.dtsi +++ b/arch/arm/boot/dts/omap3-pandora-common.dtsi @@ -205,6 +205,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio1 16 GPIO_ACTIVE_LOW>; /* GPIO_16 */  		vcc-supply = <&vaux2>; +		#phy-cells = <0>;  	};  	/* HS USB Host VBUS supply diff --git a/arch/arm/boot/dts/omap3-tao3530.dtsi b/arch/arm/boot/dts/omap3-tao3530.dtsi index 9a601d15247b..6f5bd027b717 100644 --- a/arch/arm/boot/dts/omap3-tao3530.dtsi +++ b/arch/arm/boot/dts/omap3-tao3530.dtsi @@ -46,6 +46,7 @@  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio6 2 GPIO_ACTIVE_LOW>;	/* gpio_162 */  		vcc-supply = <&hsusb2_power>; +		#phy-cells = <0>;  	};  	sound { diff --git a/arch/arm/boot/dts/omap3.dtsi b/arch/arm/boot/dts/omap3.dtsi index 90b5c7148feb..bb33935df7b0 100644 --- a/arch/arm/boot/dts/omap3.dtsi +++ b/arch/arm/boot/dts/omap3.dtsi @@ -715,6 +715,7 @@  				compatible = "ti,ohci-omap3";  				reg = 
<0x48064400 0x400>;  				interrupts = <76>; +				remote-wakeup-connected;  			};  			usbhsehci: ehci@48064800 { diff --git a/arch/arm/boot/dts/omap4-droid4-xt894.dts b/arch/arm/boot/dts/omap4-droid4-xt894.dts index 8b93d37310f2..24a463f8641f 100644 --- a/arch/arm/boot/dts/omap4-droid4-xt894.dts +++ b/arch/arm/boot/dts/omap4-droid4-xt894.dts @@ -73,6 +73,7 @@  	/* HS USB Host PHY on PORT 1 */  	hsusb1_phy: hsusb1_phy {  		compatible = "usb-nop-xceiv"; +		#phy-cells = <0>;  	};  	/* LCD regulator from sw5 source */ diff --git a/arch/arm/boot/dts/omap4-duovero.dtsi b/arch/arm/boot/dts/omap4-duovero.dtsi index 6e6810c258eb..eb123b24c8e3 100644 --- a/arch/arm/boot/dts/omap4-duovero.dtsi +++ b/arch/arm/boot/dts/omap4-duovero.dtsi @@ -43,6 +43,7 @@  	hsusb1_phy: hsusb1_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio2 30 GPIO_ACTIVE_LOW>;	/* gpio_62 */ +		#phy-cells = <0>;  		pinctrl-names = "default";  		pinctrl-0 = <&hsusb1phy_pins>; diff --git a/arch/arm/boot/dts/omap4-panda-common.dtsi b/arch/arm/boot/dts/omap4-panda-common.dtsi index 22c1eee9b07a..5501d1b4e6cd 100644 --- a/arch/arm/boot/dts/omap4-panda-common.dtsi +++ b/arch/arm/boot/dts/omap4-panda-common.dtsi @@ -89,6 +89,7 @@  	hsusb1_phy: hsusb1_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio2 30 GPIO_ACTIVE_LOW>;   /* gpio_62 */ +		#phy-cells = <0>;  		vcc-supply = <&hsusb1_power>;  		clocks = <&auxclk3_ck>;  		clock-names = "main_clk"; diff --git a/arch/arm/boot/dts/omap4-var-som-om44.dtsi b/arch/arm/boot/dts/omap4-var-som-om44.dtsi index 6500bfc8d130..10fce28ceb5b 100644 --- a/arch/arm/boot/dts/omap4-var-som-om44.dtsi +++ b/arch/arm/boot/dts/omap4-var-som-om44.dtsi @@ -44,6 +44,7 @@  		reset-gpios = <&gpio6 17 GPIO_ACTIVE_LOW>; /* gpio 177 */  		vcc-supply = <&vbat>; +		#phy-cells = <0>;  		clocks = <&auxclk3_ck>;  		clock-names = "main_clk"; diff --git a/arch/arm/boot/dts/omap4.dtsi b/arch/arm/boot/dts/omap4.dtsi index 1dc5a76b3c71..cc1a07a3620f 100644 --- a/arch/arm/boot/dts/omap4.dtsi +++ b/arch/arm/boot/dts/omap4.dtsi @@ -398,7 +398,7 @@  		elm: elm@48078000 {  			compatible = "ti,am3352-elm";  			reg = <0x48078000 0x2000>; -			interrupts = <4>; +			interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;  			ti,hwmods = "elm";  			status = "disabled";  		}; @@ -1081,14 +1081,13 @@  			usbhsohci: ohci@4a064800 {  				compatible = "ti,ohci-omap3";  				reg = <0x4a064800 0x400>; -				interrupt-parent = <&gic>;  				interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; +				remote-wakeup-connected;  			};  			usbhsehci: ehci@4a064c00 {  				compatible = "ti,ehci-omap";  				reg = <0x4a064c00 0x400>; -				interrupt-parent = <&gic>;  				interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;  			};  		}; diff --git a/arch/arm/boot/dts/omap5-board-common.dtsi b/arch/arm/boot/dts/omap5-board-common.dtsi index 575ecffb0e9e..1b20838bb9a4 100644 --- a/arch/arm/boot/dts/omap5-board-common.dtsi +++ b/arch/arm/boot/dts/omap5-board-common.dtsi @@ -73,12 +73,14 @@  		clocks = <&auxclk1_ck>;  		clock-names = "main_clk";  		clock-frequency = <19200000>; +		#phy-cells = <0>;  	};  	/* HS USB Host PHY on PORT 3 */  	hsusb3_phy: hsusb3_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio3 15 GPIO_ACTIVE_LOW>; /* gpio3_79 ETH_NRESET */ +		#phy-cells = <0>;  	};  	tpd12s015: encoder { diff --git a/arch/arm/boot/dts/omap5-cm-t54.dts b/arch/arm/boot/dts/omap5-cm-t54.dts index 5b172a04b6f1..5e21fb430a65 100644 --- a/arch/arm/boot/dts/omap5-cm-t54.dts +++ b/arch/arm/boot/dts/omap5-cm-t54.dts @@ -63,12 +63,14 @@  	hsusb2_phy: hsusb2_phy {  		
compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio3 12 GPIO_ACTIVE_LOW>; /* gpio3_76 HUB_RESET */ +		#phy-cells = <0>;  	};  	/* HS USB Host PHY on PORT 3 */  	hsusb3_phy: hsusb3_phy {  		compatible = "usb-nop-xceiv";  		reset-gpios = <&gpio3 19 GPIO_ACTIVE_LOW>; /* gpio3_83 ETH_RESET */ +		#phy-cells = <0>;  	};  	leds { diff --git a/arch/arm/boot/dts/omap5.dtsi b/arch/arm/boot/dts/omap5.dtsi index 4cd0005e462f..51a7fb3d7b9a 100644 --- a/arch/arm/boot/dts/omap5.dtsi +++ b/arch/arm/boot/dts/omap5.dtsi @@ -940,6 +940,7 @@  				compatible = "ti,ohci-omap3";  				reg = <0x4a064800 0x400>;  				interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; +				remote-wakeup-connected;  			};  			usbhsehci: ehci@4a064c00 { diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi index 2f017fee4009..62baabd757b6 100644 --- a/arch/arm/boot/dts/r8a7790.dtsi +++ b/arch/arm/boot/dts/r8a7790.dtsi @@ -1201,6 +1201,7 @@  		clock-names = "extal", "usb_extal";  		#clock-cells = <2>;  		#power-domain-cells = <0>; +		#reset-cells = <1>;  	};  	prr: chipid@ff000044 { diff --git a/arch/arm/boot/dts/r8a7792.dtsi b/arch/arm/boot/dts/r8a7792.dtsi index 131f65b0426e..3d080e07374c 100644 --- a/arch/arm/boot/dts/r8a7792.dtsi +++ b/arch/arm/boot/dts/r8a7792.dtsi @@ -829,6 +829,7 @@  			clock-names = "extal";  			#clock-cells = <2>;  			#power-domain-cells = <0>; +			#reset-cells = <1>;  		};  	}; diff --git a/arch/arm/boot/dts/r8a7793.dtsi b/arch/arm/boot/dts/r8a7793.dtsi index 58eae569b4e0..0cd1035de1a4 100644 --- a/arch/arm/boot/dts/r8a7793.dtsi +++ b/arch/arm/boot/dts/r8a7793.dtsi @@ -1088,6 +1088,7 @@  		clock-names = "extal", "usb_extal";  		#clock-cells = <2>;  		#power-domain-cells = <0>; +		#reset-cells = <1>;  	};  	rst: reset-controller@e6160000 { diff --git a/arch/arm/boot/dts/r8a7794.dtsi b/arch/arm/boot/dts/r8a7794.dtsi index 905e50c9b524..5643976c1356 100644 --- a/arch/arm/boot/dts/r8a7794.dtsi +++ b/arch/arm/boot/dts/r8a7794.dtsi @@ -1099,6 +1099,7 @@  		clock-names = "extal", "usb_extal";  		#clock-cells = <2>;  		#power-domain-cells = <0>; +		#reset-cells = <1>;  	};  	rst: reset-controller@e6160000 { diff --git a/arch/arm/boot/dts/rk3066a-marsboard.dts b/arch/arm/boot/dts/rk3066a-marsboard.dts index c6d92c25df42..d23ee6d911ac 100644 --- a/arch/arm/boot/dts/rk3066a-marsboard.dts +++ b/arch/arm/boot/dts/rk3066a-marsboard.dts @@ -83,6 +83,10 @@  	};  }; +&cpu0 { +	cpu0-supply = <&vdd_arm>; +}; +  &i2c1 {  	status = "okay";  	clock-frequency = <400000>; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index cd24894ee5c6..6102e4e7f35c 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -956,7 +956,7 @@  	iep_mmu: iommu@ff900800 {  		compatible = "rockchip,iommu";  		reg = <0x0 0xff900800 0x0 0x40>; -		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH 0>; +		interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;  		interrupt-names = "iep_mmu";  		#iommu-cells = <0>;  		status = "disabled"; diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index b91300d49a31..5840f5c75c3b 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -502,8 +502,8 @@  			reg = <0x01c16000 0x1000>;  			interrupts = <58>;  			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>, -				 <&ccu 9>, -				 <&ccu 18>; +				 <&ccu CLK_PLL_VIDEO0_2X>, +				 <&ccu CLK_PLL_VIDEO1_2X>;  			clock-names = "ahb", "mod", "pll-0", "pll-1";  			dmas = <&dma SUN4I_DMA_NORMAL 16>,  			       <&dma SUN4I_DMA_NORMAL 16>, diff --git 
a/arch/arm/boot/dts/sun5i-a10s.dtsi b/arch/arm/boot/dts/sun5i-a10s.dtsi index 6ae4d95e230e..316cb8b2945b 100644 --- a/arch/arm/boot/dts/sun5i-a10s.dtsi +++ b/arch/arm/boot/dts/sun5i-a10s.dtsi @@ -82,8 +82,8 @@  			reg = <0x01c16000 0x1000>;  			interrupts = <58>;  			clocks = <&ccu CLK_AHB_HDMI>, <&ccu CLK_HDMI>, -				 <&ccu 9>, -				 <&ccu 16>; +				 <&ccu CLK_PLL_VIDEO0_2X>, +				 <&ccu CLK_PLL_VIDEO1_2X>;  			clock-names = "ahb", "mod", "pll-0", "pll-1";  			dmas = <&dma SUN4I_DMA_NORMAL 16>,  			       <&dma SUN4I_DMA_NORMAL 16>, diff --git a/arch/arm/boot/dts/sun6i-a31.dtsi b/arch/arm/boot/dts/sun6i-a31.dtsi index 8bfa12b548e0..72d3fe44ecaf 100644 --- a/arch/arm/boot/dts/sun6i-a31.dtsi +++ b/arch/arm/boot/dts/sun6i-a31.dtsi @@ -429,8 +429,8 @@  			interrupts = <GIC_SPI 88 IRQ_TYPE_LEVEL_HIGH>;  			clocks = <&ccu CLK_AHB1_HDMI>, <&ccu CLK_HDMI>,  				 <&ccu CLK_HDMI_DDC>, -				 <&ccu 7>, -				 <&ccu 13>; +				 <&ccu CLK_PLL_VIDEO0_2X>, +				 <&ccu CLK_PLL_VIDEO1_2X>;  			clock-names = "ahb", "mod", "ddc", "pll-0", "pll-1";  			resets = <&ccu RST_AHB1_HDMI>;  			reset-names = "ahb"; diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 68dfa82544fc..59655e42e4b0 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -581,8 +581,8 @@  			reg = <0x01c16000 0x1000>;  			interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;  			clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>, -				 <&ccu 9>, -				 <&ccu 18>; +				 <&ccu CLK_PLL_VIDEO0_2X>, +				 <&ccu CLK_PLL_VIDEO1_2X>;  			clock-names = "ahb", "mod", "pll-0", "pll-1";  			dmas = <&dma SUN4I_DMA_NORMAL 16>,  			       <&dma SUN4I_DMA_NORMAL 16>, diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts index 98715538932f..a021ee6da396 100644 --- a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts +++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts @@ -146,6 +146,7 @@  	status = "okay";  	axp81x: pmic@3a3 { +		compatible = "x-powers,axp813";  		reg = <0x3a3>;  		interrupt-parent = <&r_intc>;  		interrupts = <0 IRQ_TYPE_LEVEL_LOW>; diff --git a/arch/arm/boot/dts/tango4-common.dtsi b/arch/arm/boot/dts/tango4-common.dtsi index 0ec1b0a317b4..ff72a8efb73d 100644 --- a/arch/arm/boot/dts/tango4-common.dtsi +++ b/arch/arm/boot/dts/tango4-common.dtsi @@ -156,7 +156,6 @@  			reg = <0x6e000 0x400>;  			ranges = <0 0x6e000 0x400>;  			interrupt-parent = <&gic>; -			interrupt-controller;  			#address-cells = <1>;  			#size-cells = <1>; diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts index 02a6227c717c..4b8edc8982cf 100644 --- a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts +++ b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts @@ -121,7 +121,7 @@  					switch0port10: port@10 {  						reg = <10>;  						label = "dsa"; -						phy-mode = "xgmii"; +						phy-mode = "xaui";  						link = <&switch1port10>;  					};  				}; @@ -208,7 +208,7 @@  					switch1port10: port@10 {  						reg = <10>;  						label = "dsa"; -						phy-mode = "xgmii"; +						phy-mode = "xaui";  						link = <&switch0port10>;  					};  				}; @@ -359,7 +359,7 @@  };  &i2c1 { -	at24mac602@0 { +	at24mac602@50 {  		compatible = "atmel,24c02";  		reg = <0x50>;  		read-only; diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index c8781450905b..3ab8b3781bfe 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -161,8 +161,7 @@  #else  #define VTTBR_X		(5 - KVM_T0SZ)  #endif -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) 
-#define VTTBR_BADDR_MASK  (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) +#define VTTBR_BADDR_MASK  (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_X)  #define VTTBR_VMID_SHIFT  _AC(48, ULL)  #define VTTBR_VMID_MASK(size)	(_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 242151ea6908..a9f7d3f47134 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -285,6 +285,11 @@ static inline void kvm_arm_init_debug(void) {}  static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}  static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}  static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} +static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, +					     struct kvm_run *run) +{ +	return false; +}  int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,  			       struct kvm_device_attr *attr); diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a029bceaf2f..1a7a17b2a1ba 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -221,7 +221,6 @@ static inline pte_t pte_mkspecial(pte_t pte)  }  #define	__HAVE_ARCH_PTE_SPECIAL -#define __HAVE_ARCH_PMD_WRITE  #define pmd_write(pmd)		(pmd_isclear((pmd), L_PMD_SECT_RDONLY))  #define pmd_dirty(pmd)		(pmd_isset((pmd), L_PMD_SECT_DIRTY))  #define pud_page(pud)		pmd_page(__pmd(pud_val(pud))) diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index 4d53de308ee0..4d1cc1847edf 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ generated-y += unistd-oabi.h  generated-y += unistd-eabi.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += ioctl.h  generic-y += ipcbuf.h diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 7f4d80c2db6b..0f07579af472 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -300,7 +300,7 @@  	mov	r2, sp  	ldr	r1, [r2, #\offset + S_PSR]	@ get calling cpsr  	ldr	lr, [r2, #\offset + S_PC]!	
@ get pc -	tst	r1, #0xcf +	tst	r1, #PSR_I_BIT | 0x0f  	bne	1f  	msr	spsr_cxsf, r1			@ save in spsr_svc  #if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_32v6K) @@ -332,7 +332,7 @@  	ldr	r1, [sp, #\offset + S_PSR]	@ get calling cpsr  	ldr	lr, [sp, #\offset + S_PC]	@ get pc  	add	sp, sp, #\offset + S_SP -	tst	r1, #0xcf +	tst	r1, #PSR_I_BIT | 0x0f  	bne	1f  	msr	spsr_cxsf, r1			@ save in spsr_svc diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 5cf04888c581..3e26c6f7a191 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -793,7 +793,6 @@ void abort(void)  	/* if that doesn't kill us, halt */  	panic("Oops failed to kill thread");  } -EXPORT_SYMBOL(abort);  void __init trap_init(void)  { diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index 1712f132b80d..b83fdc06286a 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -85,7 +85,11 @@  		.pushsection .text.fixup,"ax"  		.align	4  9001:		mov	r4, #-EFAULT +#ifdef CONFIG_CPU_SW_DOMAIN_PAN +		ldr	r5, [sp, #9*4]		@ *err_ptr +#else  		ldr	r5, [sp, #8*4]		@ *err_ptr +#endif  		str	r4, [r5]  		ldmia	sp, {r1, r2}		@ retrieve dst, len  		add	r2, r2, r1 diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 8be04ec95adf..5ace9380626a 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -868,10 +868,10 @@ static const struct dma_slave_map dm365_edma_map[] = {  	{ "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 17) },  	{ "spi_davinci.3", "tx", EDMA_FILTER_PARAM(0, 18) },  	{ "spi_davinci.3", "rx", EDMA_FILTER_PARAM(0, 19) }, -	{ "dm6441-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) }, -	{ "dm6441-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) }, -	{ "dm6441-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) }, -	{ "dm6441-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) }, +	{ "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) }, +	{ "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) }, +	{ "da830-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) }, +	{ "da830-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) },  };  static struct edma_soc_info dm365_edma_pdata = { @@ -925,12 +925,14 @@ static struct resource edma_resources[] = {  	/* not using TC*_ERR */  }; -static struct platform_device dm365_edma_device = { -	.name			= "edma", -	.id			= 0, -	.dev.platform_data	= &dm365_edma_pdata, -	.num_resources		= ARRAY_SIZE(edma_resources), -	.resource		= edma_resources, +static const struct platform_device_info dm365_edma_device __initconst = { +	.name		= "edma", +	.id		= 0, +	.dma_mask	= DMA_BIT_MASK(32), +	.res		= edma_resources, +	.num_res	= ARRAY_SIZE(edma_resources), +	.data		= &dm365_edma_pdata, +	.size_data	= sizeof(dm365_edma_pdata),  };  static struct resource dm365_asp_resources[] = { @@ -1428,13 +1430,18 @@ int __init dm365_init_video(struct vpfe_config *vpfe_cfg,  static int __init dm365_init_devices(void)  { +	struct platform_device *edma_pdev;  	int ret = 0;  	if (!cpu_is_davinci_dm365())  		return 0;  	davinci_cfg_reg(DM365_INT_EDMA_CC); -	platform_device_register(&dm365_edma_device); +	edma_pdev = platform_device_register_full(&dm365_edma_device); +	if (IS_ERR(edma_pdev)) { +		pr_warn("%s: Failed to register eDMA\n", __func__); +		return PTR_ERR(edma_pdev); +	}  	platform_device_register(&dm365_mdio_device);  	platform_device_register(&dm365_emac_device); diff --git a/arch/arm/mach-meson/platsmp.c b/arch/arm/mach-meson/platsmp.c index 2555f9056a33..cad7ee8f0d6b 100644 --- a/arch/arm/mach-meson/platsmp.c +++ b/arch/arm/mach-meson/platsmp.c @@ -102,7 
+102,7 @@ static void __init meson_smp_prepare_cpus(const char *scu_compatible,  	scu_base = of_iomap(node, 0);  	if (!scu_base) { -		pr_err("Couln't map SCU registers\n"); +		pr_err("Couldn't map SCU registers\n");  		return;  	} diff --git a/arch/arm/mach-omap2/cm_common.c b/arch/arm/mach-omap2/cm_common.c index d555791cf349..83c6fa74cc31 100644 --- a/arch/arm/mach-omap2/cm_common.c +++ b/arch/arm/mach-omap2/cm_common.c @@ -68,14 +68,17 @@ void __init omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2)  int cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst,  			u8 *idlest_reg_id)  { +	int ret;  	if (!cm_ll_data->split_idlest_reg) {  		WARN_ONCE(1, "cm: %s: no low-level function defined\n",  			  __func__);  		return -EINVAL;  	} -	return cm_ll_data->split_idlest_reg(idlest_reg, prcm_inst, +	ret = cm_ll_data->split_idlest_reg(idlest_reg, prcm_inst,  					   idlest_reg_id); +	*prcm_inst -= cm_base.offset; +	return ret;  }  /** @@ -337,6 +340,7 @@ int __init omap2_cm_base_init(void)  		if (mem) {  			mem->pa = res.start + data->offset;  			mem->va = data->mem + data->offset; +			mem->offset = data->offset;  		}  		data->np = np; diff --git a/arch/arm/mach-omap2/omap-secure.c b/arch/arm/mach-omap2/omap-secure.c index 5ac122e88f67..fa7f308c9027 100644 --- a/arch/arm/mach-omap2/omap-secure.c +++ b/arch/arm/mach-omap2/omap-secure.c @@ -73,6 +73,27 @@ phys_addr_t omap_secure_ram_mempool_base(void)  	return omap_secure_memblock_base;  } +#if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_PM) +u32 omap3_save_secure_ram(void __iomem *addr, int size) +{ +	u32 ret; +	u32 param[5]; + +	if (size != OMAP3_SAVE_SECURE_RAM_SZ) +		return OMAP3_SAVE_SECURE_RAM_SZ; + +	param[0] = 4;		/* Number of arguments */ +	param[1] = __pa(addr);	/* Physical address for saving */ +	param[2] = 0; +	param[3] = 1; +	param[4] = 1; + +	ret = save_secure_ram_context(__pa(param)); + +	return ret; +} +#endif +  /**   * rx51_secure_dispatcher: Routine to dispatch secure PPA API calls   * @idx: The PPA API index diff --git a/arch/arm/mach-omap2/omap-secure.h b/arch/arm/mach-omap2/omap-secure.h index bae263fba640..c509cde71f93 100644 --- a/arch/arm/mach-omap2/omap-secure.h +++ b/arch/arm/mach-omap2/omap-secure.h @@ -31,6 +31,8 @@  /* Maximum Secure memory storage size */  #define OMAP_SECURE_RAM_STORAGE	(88 * SZ_1K) +#define OMAP3_SAVE_SECURE_RAM_SZ	0x803F +  /* Secure low power HAL API index */  #define OMAP4_HAL_SAVESECURERAM_INDEX	0x1a  #define OMAP4_HAL_SAVEHW_INDEX		0x1b @@ -65,6 +67,8 @@ extern u32 omap_smc2(u32 id, u32 falg, u32 pargs);  extern u32 omap_smc3(u32 id, u32 process, u32 flag, u32 pargs);  extern phys_addr_t omap_secure_ram_mempool_base(void);  extern int omap_secure_ram_reserve_memblock(void); +extern u32 save_secure_ram_context(u32 args_pa); +extern u32 omap3_save_secure_ram(void __iomem *save_regs, int size);  extern u32 rx51_secure_dispatcher(u32 idx, u32 process, u32 flag, u32 nargs,  				  u32 arg1, u32 arg2, u32 arg3, u32 arg4); diff --git a/arch/arm/mach-omap2/omap_device.c b/arch/arm/mach-omap2/omap_device.c index d45cbfdb4be6..f0388058b7da 100644 --- a/arch/arm/mach-omap2/omap_device.c +++ b/arch/arm/mach-omap2/omap_device.c @@ -391,10 +391,8 @@ omap_device_copy_resources(struct omap_hwmod *oh,  	const char *name;  	int error, irq = 0; -	if (!oh || !oh->od || !oh->od->pdev) { -		error = -EINVAL; -		goto error; -	} +	if (!oh || !oh->od || !oh->od->pdev) +		return -EINVAL;  	np = oh->od->pdev->dev.of_node;  	if (!np) { @@ -516,8 +514,10 @@ struct platform_device __init 
*omap_device_build(const char *pdev_name,  		goto odbs_exit1;  	od = omap_device_alloc(pdev, &oh, 1); -	if (IS_ERR(od)) +	if (IS_ERR(od)) { +		ret = PTR_ERR(od);  		goto odbs_exit1; +	}  	ret = platform_device_add_data(pdev, pdata, pdata_len);  	if (ret) diff --git a/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c b/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c index d2106ae4410a..52c9d585b44d 100644 --- a/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_3xxx_data.c @@ -1646,6 +1646,7 @@ static struct omap_hwmod omap3xxx_mmc3_hwmod = {  	.main_clk	= "mmchs3_fck",  	.prcm		= {  		.omap2 = { +			.module_offs = CORE_MOD,  			.prcm_reg_id = 1,  			.module_bit = OMAP3430_EN_MMC3_SHIFT,  			.idlest_reg_id = 1, diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h index b668719b9b25..8e30772cfe32 100644 --- a/arch/arm/mach-omap2/pm.h +++ b/arch/arm/mach-omap2/pm.h @@ -81,10 +81,6 @@ extern unsigned int omap3_do_wfi_sz;  /* ... and its pointer from SRAM after copy */  extern void (*omap3_do_wfi_sram)(void); -/* save_secure_ram_context function pointer and size, for copy to SRAM */ -extern int save_secure_ram_context(u32 *addr); -extern unsigned int save_secure_ram_context_sz; -  extern void omap3_save_scratchpad_contents(void);  #define PM_RTA_ERRATUM_i608		(1 << 0) diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 841ba19d64a6..36c55547137c 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -48,6 +48,7 @@  #include "prm3xxx.h"  #include "pm.h"  #include "sdrc.h" +#include "omap-secure.h"  #include "sram.h"  #include "control.h"  #include "vc.h" @@ -66,7 +67,6 @@ struct power_state {  static LIST_HEAD(pwrst_list); -static int (*_omap_save_secure_sram)(u32 *addr);  void (*omap3_do_wfi_sram)(void);  static struct powerdomain *mpu_pwrdm, *neon_pwrdm; @@ -121,8 +121,8 @@ static void omap3_save_secure_ram_context(void)  		 * will hang the system.  		 */  		pwrdm_set_next_pwrst(mpu_pwrdm, PWRDM_POWER_ON); -		ret = _omap_save_secure_sram((u32 *)(unsigned long) -				__pa(omap3_secure_ram_storage)); +		ret = omap3_save_secure_ram(omap3_secure_ram_storage, +					    OMAP3_SAVE_SECURE_RAM_SZ);  		pwrdm_set_next_pwrst(mpu_pwrdm, mpu_next_state);  		/* Following is for error tracking, it should not happen */  		if (ret) { @@ -434,15 +434,10 @@ static int __init pwrdms_setup(struct powerdomain *pwrdm, void *unused)   *   * The minimum set of functions is pushed to SRAM for execution:   * - omap3_do_wfi for erratum i581 WA, - * - save_secure_ram_context for security extensions.   
*/  void omap_push_sram_idle(void)  {  	omap3_do_wfi_sram = omap_sram_push(omap3_do_wfi, omap3_do_wfi_sz); - -	if (omap_type() != OMAP2_DEVICE_TYPE_GP) -		_omap_save_secure_sram = omap_sram_push(save_secure_ram_context, -				save_secure_ram_context_sz);  }  static void __init pm_errata_configure(void) @@ -553,7 +548,7 @@ int __init omap3_pm_init(void)  	clkdm_add_wkdep(neon_clkdm, mpu_clkdm);  	if (omap_type() != OMAP2_DEVICE_TYPE_GP) {  		omap3_secure_ram_storage = -			kmalloc(0x803F, GFP_KERNEL); +			kmalloc(OMAP3_SAVE_SECURE_RAM_SZ, GFP_KERNEL);  		if (!omap3_secure_ram_storage)  			pr_err("Memory allocation failed when allocating for secure sram context\n"); diff --git a/arch/arm/mach-omap2/prcm-common.h b/arch/arm/mach-omap2/prcm-common.h index 0592b23902c6..0977da0dab76 100644 --- a/arch/arm/mach-omap2/prcm-common.h +++ b/arch/arm/mach-omap2/prcm-common.h @@ -528,6 +528,7 @@ struct omap_prcm_irq_setup {  struct omap_domain_base {  	u32 pa;  	void __iomem *va; +	s16 offset;  };  /** diff --git a/arch/arm/mach-omap2/prm33xx.c b/arch/arm/mach-omap2/prm33xx.c index d2c5bcabdbeb..ebaf80d72a10 100644 --- a/arch/arm/mach-omap2/prm33xx.c +++ b/arch/arm/mach-omap2/prm33xx.c @@ -176,17 +176,6 @@ static int am33xx_pwrdm_read_pwrst(struct powerdomain *pwrdm)  	return v;  } -static int am33xx_pwrdm_read_prev_pwrst(struct powerdomain *pwrdm) -{ -	u32 v; - -	v = am33xx_prm_read_reg(pwrdm->prcm_offs, pwrdm->pwrstst_offs); -	v &= AM33XX_LASTPOWERSTATEENTERED_MASK; -	v >>= AM33XX_LASTPOWERSTATEENTERED_SHIFT; - -	return v; -} -  static int am33xx_pwrdm_set_lowpwrstchange(struct powerdomain *pwrdm)  {  	am33xx_prm_rmw_reg_bits(AM33XX_LOWPOWERSTATECHANGE_MASK, @@ -357,7 +346,6 @@ struct pwrdm_ops am33xx_pwrdm_operations = {  	.pwrdm_set_next_pwrst		= am33xx_pwrdm_set_next_pwrst,  	.pwrdm_read_next_pwrst		= am33xx_pwrdm_read_next_pwrst,  	.pwrdm_read_pwrst		= am33xx_pwrdm_read_pwrst, -	.pwrdm_read_prev_pwrst		= am33xx_pwrdm_read_prev_pwrst,  	.pwrdm_set_logic_retst		= am33xx_pwrdm_set_logic_retst,  	.pwrdm_read_logic_pwrst		= am33xx_pwrdm_read_logic_pwrst,  	.pwrdm_read_logic_retst		= am33xx_pwrdm_read_logic_retst, diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S index fa5fd24f524c..22daf4efed68 100644 --- a/arch/arm/mach-omap2/sleep34xx.S +++ b/arch/arm/mach-omap2/sleep34xx.S @@ -93,20 +93,13 @@ ENTRY(enable_omap3630_toggle_l2_on_restore)  ENDPROC(enable_omap3630_toggle_l2_on_restore)  /* - * Function to call rom code to save secure ram context. This gets - * relocated to SRAM, so it can be all in .data section. Otherwise - * we need to initialize api_params separately. + * Function to call rom code to save secure ram context. + * + * r0 = physical address of the parameters   */ -	.data -	.align	3  ENTRY(save_secure_ram_context)  	stmfd	sp!, {r4 - r11, lr}	@ save registers on stack -	adr	r3, api_params		@ r3 points to parameters -	str	r0, [r3,#0x4]		@ r0 has sdram address -	ldr	r12, high_mask -	and	r3, r3, r12 -	ldr	r12, sram_phy_addr_mask -	orr	r3, r3, r12 +	mov	r3, r0			@ physical address of parameters  	mov	r0, #25			@ set service ID for PPA  	mov	r12, r0			@ copy secure service ID in r12  	mov	r1, #0			@ set task id for ROM code in r1 @@ -120,18 +113,7 @@ ENTRY(save_secure_ram_context)  	nop  	nop  	ldmfd	sp!, {r4 - r11, pc} -	.align -sram_phy_addr_mask: -	.word	SRAM_BASE_P -high_mask: -	.word	0xffff -api_params: -	.word	0x4, 0x0, 0x0, 0x1, 0x1  ENDPROC(save_secure_ram_context) -ENTRY(save_secure_ram_context_sz) -	.word	. 
- save_secure_ram_context - -	.text  /*   * ====================== diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..c9a7e9e1414f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -557,7 +557,6 @@ config QCOM_QDF2400_ERRATUM_0065  	  If unsure, say Y. -  config SOCIONEXT_SYNQUACER_PREITS  	bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"  	default y @@ -576,6 +575,17 @@ config HISILICON_ERRATUM_161600802  	  a 128kB offset to be applied to the target address in this commands.  	  If unsure, say Y. + +config QCOM_FALKOR_ERRATUM_E1041 +	bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" +	default y +	help +	  Falkor CPU may speculatively fetch instructions from an improper +	  memory location when MMU translation is changed from SCTLR_ELn[M]=1 +	  to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem. + +	  If unsure, say Y. +  endmenu diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index b35788c909f1..b481b4a7c011 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -83,9 +83,6 @@ endif  ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)  KBUILD_LDFLAGS_MODULE	+= -T $(srctree)/arch/arm64/kernel/module.lds -ifeq ($(CONFIG_DYNAMIC_FTRACE),y) -KBUILD_LDFLAGS_MODULE	+= $(objtree)/arch/arm64/kernel/ftrace-mod.o -endif  endif  # Default value diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile index d7c22d51bc50..4aa50b9b26bc 100644 --- a/arch/arm64/boot/dts/Makefile +++ b/arch/arm64/boot/dts/Makefile @@ -12,6 +12,7 @@ subdir-y += cavium  subdir-y += exynos  subdir-y += freescale  subdir-y += hisilicon +subdir-y += lg  subdir-y += marvell  subdir-y += mediatek  subdir-y += nvidia @@ -22,5 +23,4 @@ subdir-y += rockchip  subdir-y += socionext  subdir-y += sprd  subdir-y += xilinx -subdir-y += lg  subdir-y += zte diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts index 45bdbfb96126..4a8d3f83a36e 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts @@ -75,6 +75,7 @@  	pinctrl-0 = <&rgmii_pins>;  	phy-mode = "rgmii";  	phy-handle = <&ext_rgmii_phy>; +	phy-supply = <&reg_dc1sw>;  	status = "okay";  }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 806442d3e846..604cdaedac38 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts @@ -77,6 +77,7 @@  	pinctrl-0 = <&rmii_pins>;  	phy-mode = "rmii";  	phy-handle = <&ext_rmii_phy1>; +	phy-supply = <&reg_dc1sw>;  	status = "okay";  }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index 0eb2acedf8c3..abe179de35d7 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -82,6 +82,7 @@  	pinctrl-0 = <&rgmii_pins>;  	phy-mode = "rgmii";  	phy-handle = <&ext_rgmii_phy>; +	phy-supply = <&reg_dc1sw>;  	status = "okay";  }; @@ -95,7 +96,7 @@  &mmc2 {  	pinctrl-names = "default";  	pinctrl-0 = <&mmc2_pins>; -	vmmc-supply = <&reg_vcc3v3>; +	vmmc-supply = <&reg_dcdc1>;  	vqmmc-supply = <&reg_vcc1v8>;  	bus-width = <8>;  	non-removable; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi index a5da18a6f286..43418bd881d8 100644 ---
a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi @@ -45,19 +45,10 @@  #include "sun50i-a64.dtsi" -/ { -	reg_vcc3v3: vcc3v3 { -		compatible = "regulator-fixed"; -		regulator-name = "vcc3v3"; -		regulator-min-microvolt = <3300000>; -		regulator-max-microvolt = <3300000>; -	}; -}; -  &mmc0 {  	pinctrl-names = "default";  	pinctrl-0 = <&mmc0_pins>; -	vmmc-supply = <&reg_vcc3v3>; +	vmmc-supply = <&reg_dcdc1>;  	non-removable;  	disable-wp;  	bus-width = <4>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts index b6b7a561df8c..a42fd79a62a3 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts @@ -71,7 +71,7 @@  	pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>;  	vmmc-supply = <&reg_vcc3v3>;  	bus-width = <4>; -	cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>; +	cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>;  	status = "okay";  }; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index ead895a4e9a5..1fb8b9d6cb4e 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -753,12 +753,12 @@  &uart_B {  	clocks = <&xtal>, <&clkc CLKID_UART1>, <&xtal>; -	clock-names = "xtal", "core", "baud"; +	clock-names = "xtal", "pclk", "baud";  };  &uart_C {  	clocks = <&xtal>, <&clkc CLKID_UART2>, <&xtal>; -	clock-names = "xtal", "core", "baud"; +	clock-names = "xtal", "pclk", "baud";  };  &vpu { diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index 8ed981f59e5a..6524b89e7115 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -688,7 +688,7 @@  &uart_A {  	clocks = <&xtal>, <&clkc CLKID_UART0>, <&xtal>; -	clock-names = "xtal", "core", "baud"; +	clock-names = "xtal", "pclk", "baud";  };  &uart_AO { @@ -703,12 +703,12 @@  &uart_B {  	clocks = <&xtal>, <&clkc CLKID_UART1>, <&xtal>; -	clock-names = "xtal", "core", "baud"; +	clock-names = "xtal", "pclk", "baud";  };  &uart_C {  	clocks = <&xtal>, <&clkc CLKID_UART2>, <&xtal>; -	clock-names = "xtal", "core", "baud"; +	clock-names = "xtal", "pclk", "baud";  };  &vpu { diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi index a298df74ca6c..dbe2648649db 100644 --- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi +++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi @@ -255,7 +255,6 @@  &avb {  	pinctrl-0 = <&avb_pins>;  	pinctrl-names = "default"; -	renesas,no-ether-link;  	phy-handle = <&phy0>;  	status = "okay"; diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi index 0d85b315ce71..73439cf48659 100644 --- a/arch/arm64/boot/dts/renesas/ulcb.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi @@ -145,7 +145,6 @@  &avb {  	pinctrl-0 = <&avb_pins>;  	pinctrl-names = "default"; -	renesas,no-ether-link;  	phy-handle = <&phy0>;  	status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index d4f80786e7c2..3890468678ce 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -132,6 +132,8 @@  	assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>;  	assigned-clock-parents = <&gmac_clkin>, <&gmac_clkin>;  	clock_in_out =
"input"; +	/* shows instability at 1GBit right now */ +	max-speed = <100>;  	phy-supply = <&vcc_io>;  	phy-mode = "rgmii";  	pinctrl-names = "default"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 41d61840fb99..2426da631938 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -514,7 +514,7 @@  	tsadc: tsadc@ff250000 {  		compatible = "rockchip,rk3328-tsadc";  		reg = <0x0 0xff250000 0x0 0x100>; -		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH 0>; +		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;  		assigned-clocks = <&cru SCLK_TSADC>;  		assigned-clock-rates = <50000>;  		clocks = <&cru SCLK_TSADC>, <&cru PCLK_TSADC>; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index 910628d18add..1fc5060d7027 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -155,17 +155,6 @@  		regulator-min-microvolt = <5000000>;  		regulator-max-microvolt = <5000000>;  	}; - -	vdd_log: vdd-log { -		compatible = "pwm-regulator"; -		pwms = <&pwm2 0 25000 0>; -		regulator-name = "vdd_log"; -		regulator-min-microvolt = <800000>; -		regulator-max-microvolt = <1400000>; -		regulator-always-on; -		regulator-boot-on; -		status = "okay"; -	};  };  &cpu_b0 { diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts index dd7193acc7df..6bdefb26b329 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts +++ b/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts @@ -40,7 +40,6 @@  };  &ethsc { -	interrupt-parent = <&gpio>;  	interrupts = <0 8>;  }; diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts index d99e3731358c..254d6795c67e 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts +++ b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts @@ -40,7 +40,6 @@  };  &ethsc { -	interrupt-parent = <&gpio>;  	interrupts = <0 8>;  }; diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts index 864feeb35180..f9f06fcfb94a 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts +++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts @@ -38,8 +38,7 @@  };  &ethsc { -	interrupt-parent = <&gpio>; -	interrupts = <0 8>; +	interrupts = <4 8>;  };  &serial0 { diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi index 48e733136db4..0ac2ace82435 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi @@ -198,8 +198,8 @@  			gpio-controller;  			#gpio-cells = <2>;  			gpio-ranges = <&pinctrl 0 0 0>, -				      <&pinctrl 96 0 0>, -				      <&pinctrl 160 0 0>; +				      <&pinctrl 104 0 0>, +				      <&pinctrl 168 0 0>;  			gpio-ranges-group-names = "gpio_range0",  						  "gpio_range1",  						  "gpio_range2"; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d886677..8b168280976f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -512,4 +512,14 @@ alternative_else_nop_endif  #endif  	.endm +/** + * Errata workaround prior to disable MMU. Insert an ISB immediately prior + * to executing the MSR that will change SCTLR_ELn[M] from a value of 1 to 0.
+ */ +	.macro pre_disable_mmu_workaround +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041 +	isb +#endif +	.endm +  #endif	/* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 76d1cc85d5b1..955130762a3c 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -38,7 +38,7 @@   *   *	See Documentation/cachetlb.txt for more information. Please note that   *	the implementation assumes non-aliasing VIPT D-cache and (aliasing) - *	VIPT or ASID-tagged VIVT I-cache. + *	VIPT I-cache.   *   *	flush_cache_mm(mm)   * diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ac67cfc2585a..060e3a4008ab 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -60,6 +60,9 @@ enum ftr_type {  #define FTR_VISIBLE	true	/* Feature visible to the user space */  #define FTR_HIDDEN	false	/* Feature is hidden from the user */ +#define FTR_VISIBLE_IF_IS_ENABLED(config)		\ +	(IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN) +  struct arm64_ftr_bits {  	bool		sign;	/* Value is signed ? */  	bool		visible; diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 235e77d98261..cbf08d7cbf30 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -91,6 +91,7 @@  #define BRCM_CPU_PART_VULCAN		0x516  #define QCOM_CPU_PART_FALKOR_V1		0x800 +#define QCOM_CPU_PART_FALKOR		0xC00  #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)  #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -99,6 +100,7 @@  #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)  #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)  #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR)  #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 650344d01124..c4cd5081d78b 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -132,11 +132,9 @@ static inline void efi_set_pgd(struct mm_struct *mm)  			 * Defer the switch to the current thread's TTBR0_EL1  			 * until uaccess_enable(). Restore the current  			 * thread's saved ttbr0 corresponding to its active_mm -			 * (if different from init_mm).  			 
*/  			cpu_set_reserved_ttbr0(); -			if (current->active_mm != &init_mm) -				update_saved_ttbr0(current, current->active_mm); +			update_saved_ttbr0(current, current->active_mm);  		}  	}  } diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 7f069ff37f06..715d395ef45b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -170,8 +170,7 @@  #define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)  #define VTTBR_X				(VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) -#define VTTBR_BADDR_SHIFT (VTTBR_X - 1) -#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) +#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)  #define VTTBR_VMID_SHIFT  (UL(48))  #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 674912d7a571..ea6cb5b24258 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -370,6 +370,7 @@ void kvm_arm_init_debug(void);  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run);  int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,  			       struct kvm_device_attr *attr);  int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 3257895a9b5e..9d155fa9a507 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -156,29 +156,21 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);  #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; }) -/* - * This is called when "tsk" is about to enter lazy TLB mode. - * - * mm:  describes the currently active mm context - * tsk: task which is entering lazy tlb - * cpu: cpu number which is entering lazy tlb - * - * tsk->mm will be NULL - */ -static inline void -enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} -  #ifdef CONFIG_ARM64_SW_TTBR0_PAN  static inline void update_saved_ttbr0(struct task_struct *tsk,  				      struct mm_struct *mm)  { -	if (system_uses_ttbr0_pan()) { -		BUG_ON(mm->pgd == swapper_pg_dir); -		task_thread_info(tsk)->ttbr0 = -			virt_to_phys(mm->pgd) | ASID(mm) << 48; -	} +	u64 ttbr; + +	if (!system_uses_ttbr0_pan()) +		return; + +	if (mm == &init_mm) +		ttbr = __pa_symbol(empty_zero_page); +	else +		ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48; + +	task_thread_info(tsk)->ttbr0 = ttbr;  }  #else  static inline void update_saved_ttbr0(struct task_struct *tsk, @@ -187,6 +179,16 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,  }  #endif +static inline void +enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +	/* +	 * We don't actually care about the ttbr0 mapping, so point it at the +	 * zero page. 
+	 */ +	update_saved_ttbr0(tsk, &init_mm); +} +  static inline void __switch_mm(struct mm_struct *next)  {  	unsigned int cpu = smp_processor_id(); @@ -214,11 +216,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,  	 * Update the saved TTBR0_EL1 of the scheduled-in task as the previous  	 * value may have not been initialised yet (activate_mm caller) or the  	 * ASID has changed since the last run (following the context switch -	 * of another thread of the same process). Avoid setting the reserved -	 * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). +	 * of another thread of the same process).  	 */ -	if (next != &init_mm) -		update_saved_ttbr0(tsk, next); +	update_saved_ttbr0(tsk, next);  }  #define deactivate_mm(tsk,mm)	do { } while (0) diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 19bd97671bb8..4f766178fa6f 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -32,7 +32,7 @@ struct mod_arch_specific {  	struct mod_plt_sec	init;  	/* for CONFIG_DYNAMIC_FTRACE */ -	void			*ftrace_trampoline; +	struct plt_entry 	*ftrace_trampoline;  };  #endif @@ -45,4 +45,48 @@ extern u64 module_alloc_base;  #define module_alloc_base	((u64)_etext - MODULES_VSIZE)  #endif +struct plt_entry { +	/* +	 * A program that conforms to the AArch64 Procedure Call Standard +	 * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or +	 * IP1 (x17) may be inserted at any branch instruction that is +	 * exposed to a relocation that supports long branches. Since that +	 * is exactly what we are dealing with here, we are free to use x16 +	 * as a scratch register in the PLT veneers. +	 */ +	__le32	mov0;	/* movn	x16, #0x....			*/ +	__le32	mov1;	/* movk	x16, #0x...., lsl #16		*/ +	__le32	mov2;	/* movk	x16, #0x...., lsl #32		*/ +	__le32	br;	/* br	x16				*/ +}; + +static inline struct plt_entry get_plt_entry(u64 val) +{ +	/* +	 * MOVK/MOVN/MOVZ opcode: +	 * +--------+------------+--------+-----------+-------------+---------+ +	 * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | +	 * +--------+------------+--------+-----------+-------------+---------+ +	 * +	 * Rd     := 0x10 (x16) +	 * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) +	 * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) +	 * sf     := 1 (64-bit variant) +	 */ +	return (struct plt_entry){ +		cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5), +		cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), +		cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), +		cpu_to_le32(0xd61f0200) +	}; +} + +static inline bool plt_entries_equal(const struct plt_entry *a, +				     const struct plt_entry *b) +{ +	return a->mov0 == b->mov0 && +	       a->mov1 == b->mov1 && +	       a->mov2 == b->mov2; +} +  #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 8d5cbec17d80..f9ccc36d3dc3 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -18,6 +18,7 @@  #define __ASM_PERF_EVENT_H  #include <asm/stack_pointer.h> +#include <asm/ptrace.h>  #define	ARMV8_PMU_MAX_COUNTERS	32  #define	ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1) @@ -79,6 +80,7 @@ struct pt_regs;  extern unsigned long perf_instruction_pointer(struct pt_regs *regs);  extern unsigned long perf_misc_flags(struct pt_regs *regs);  #define perf_misc_flags(regs)	perf_misc_flags(regs) +#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs  #endif  #define
perf_arch_fetch_caller_regs(regs, __ip) { \ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c9530b5b5ca8..bdcc7f1c9d06 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -42,6 +42,8 @@  #include <asm/cmpxchg.h>  #include <asm/fixmap.h>  #include <linux/mmdebug.h> +#include <linux/mm_types.h> +#include <linux/sched.h>  extern void __pte_error(const char *file, int line, unsigned long val);  extern void __pmd_error(const char *file, int line, unsigned long val); @@ -149,12 +151,20 @@ static inline pte_t pte_mkwrite(pte_t pte)  static inline pte_t pte_mkclean(pte_t pte)  { -	return clear_pte_bit(pte, __pgprot(PTE_DIRTY)); +	pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); +	pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + +	return pte;  }  static inline pte_t pte_mkdirty(pte_t pte)  { -	return set_pte_bit(pte, __pgprot(PTE_DIRTY)); +	pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + +	if (pte_write(pte)) +		pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + +	return pte;  }  static inline pte_t pte_mkold(pte_t pte) @@ -207,9 +217,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte)  	}  } -struct mm_struct; -struct vm_area_struct; -  extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);  /* @@ -238,7 +245,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,  	 * hardware updates of the pte (ptep_set_access_flags safely changes  	 * valid ptes without going through an invalid entry).  	 */ -	if (pte_valid(*ptep) && pte_valid(pte)) { +	if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) && +	   (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) {  		VM_WARN_ONCE(!pte_young(pte),  			     "%s: racy access flag clearing: 0x%016llx -> 0x%016llx",  			     __func__, pte_val(*ptep), pte_val(pte)); @@ -345,7 +353,6 @@ static inline int pmd_protnone(pmd_t pmd)  #define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd)) -#define __HAVE_ARCH_PMD_WRITE  #define pmd_write(pmd)		pte_write(pmd_pte(pmd))  #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) @@ -642,28 +649,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */  /* - * ptep_set_wrprotect - mark read-only while preserving the hardware update of - * the Access Flag. + * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.   */  #define __HAVE_ARCH_PTEP_SET_WRPROTECT  static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)  {  	pte_t old_pte, pte; -	/* -	 * ptep_set_wrprotect() is only called on CoW mappings which are -	 * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && -	 * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && -	 * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and -	 * protection_map[]. There is no race with the hardware update of the -	 * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) -	 * is set. -	 */ -	VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), -		     "%s: potential race with hardware DBM", __func__);  	pte = READ_ONCE(*ptep);  	do {  		old_pte = pte; +		/* +		 * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY +		 * clear), set the PTE_DIRTY bit. 
+		 */ +		if (pte_hw_dirty(pte)) +			pte = pte_mkdirty(pte);  		pte = pte_wrprotect(pte);  		pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),  					       pte_val(old_pte), pte_val(pte)); diff --git a/arch/arm64/include/uapi/asm/bpf_perf_event.h b/arch/arm64/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 000000000000..b551b741653d --- /dev/null +++ b/arch/arm64/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_BPF_PERF_EVENT_H__ + +#include <asm/ptrace.h> + +typedef struct user_pt_regs bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8265dd790895..067baace74a0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -61,6 +61,3 @@ extra-y					+= $(head-y) vmlinux.lds  ifeq ($(CONFIG_DEBUG_EFI),y)  AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""  endif - -# will be included by each individual module but not by the core kernel itself -extra-$(CONFIG_DYNAMIC_FTRACE) += ftrace-mod.o diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 65f42d257414..2a752cb2a0f3 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -37,6 +37,7 @@ ENTRY(__cpu_soft_restart)  	mrs	x12, sctlr_el1  	ldr	x13, =SCTLR_ELx_FLAGS  	bic	x12, x12, x13 +	pre_disable_mmu_workaround  	msr	sctlr_el1, x12  	isb diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index d16978213c5b..ea001241bdd4 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -31,13 +31,13 @@ extern const struct cpu_operations cpu_psci_ops;  const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { +static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = {  	&smp_spin_table_ops,  	&cpu_psci_ops,  	NULL,  }; -static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +static const struct cpu_operations *const acpi_supported_cpu_ops[] __initconst = {  #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL  	&acpi_parking_protocol_ops,  #endif @@ -47,7 +47,7 @@ static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = {  static const struct cpu_operations * __init cpu_get_ops(const char *name)  { -	const struct cpu_operations **ops; +	const struct cpu_operations *const *ops;  	ops = acpi_disabled ? 
dt_supported_cpu_ops : acpi_supported_cpu_ops; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c5ba0097887f..a73a5928f09b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -145,7 +145,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {  };  static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { -	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), +	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), +				   FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),  	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),  	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),  	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 4e6ad355bd05..6b9736c3fb56 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -96,6 +96,7 @@ ENTRY(entry)  	mrs	x0, sctlr_el2  	bic	x0, x0, #1 << 0	// clear SCTLR.M  	bic	x0, x0, #1 << 2	// clear SCTLR.C +	pre_disable_mmu_workaround  	msr	sctlr_el2, x0  	isb  	b	2f @@ -103,6 +104,7 @@ ENTRY(entry)  	mrs	x0, sctlr_el1  	bic	x0, x0, #1 << 0	// clear SCTLR.M  	bic	x0, x0, #1 << 2	// clear SCTLR.C +	pre_disable_mmu_workaround  	msr	sctlr_el1, x0  	isb  2: diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 143b3e72c25e..fae81f7964b4 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -114,7 +114,12 @@   *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so   *   whatever is in the FPSIMD registers is not saved to memory, but discarded.   */ -static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); +struct fpsimd_last_state_struct { +	struct fpsimd_state *st; +	bool sve_in_use; +}; + +static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);  /* Default VL for tasks that don't set it explicitly: */  static int sve_default_vl = -1; @@ -905,7 +910,7 @@ void fpsimd_thread_switch(struct task_struct *next)  		 */  		struct fpsimd_state *st = &next->thread.fpsimd_state; -		if (__this_cpu_read(fpsimd_last_state) == st +		if (__this_cpu_read(fpsimd_last_state.st) == st  		    && st->cpu == smp_processor_id())  			clear_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE);  		else @@ -992,6 +997,21 @@ void fpsimd_signal_preserve_current_state(void)  }  /* + * Associate current's FPSIMD context with this cpu + * Preemption must be disabled when calling this function. 
+ */ +static void fpsimd_bind_to_cpu(void) +{ +	struct fpsimd_last_state_struct *last = +		this_cpu_ptr(&fpsimd_last_state); +	struct fpsimd_state *st = &current->thread.fpsimd_state; + +	last->st = st; +	last->sve_in_use = test_thread_flag(TIF_SVE); +	st->cpu = smp_processor_id(); +} + +/*   * Load the userland FPSIMD state of 'current' from memory, but only if the   * FPSIMD state already held in the registers is /not/ the most recent FPSIMD   * state of 'current' @@ -1004,11 +1024,8 @@ void fpsimd_restore_current_state(void)  	local_bh_disable();  	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { -		struct fpsimd_state *st = &current->thread.fpsimd_state; -  		task_fpsimd_load(); -		__this_cpu_write(fpsimd_last_state, st); -		st->cpu = smp_processor_id(); +		fpsimd_bind_to_cpu();  	}  	local_bh_enable(); @@ -1026,18 +1043,14 @@ void fpsimd_update_current_state(struct fpsimd_state *state)  	local_bh_disable(); -	if (system_supports_sve() && test_thread_flag(TIF_SVE)) { -		current->thread.fpsimd_state = *state; +	current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd; +	if (system_supports_sve() && test_thread_flag(TIF_SVE))  		fpsimd_to_sve(current); -	} -	task_fpsimd_load(); -	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { -		struct fpsimd_state *st = &current->thread.fpsimd_state; +	task_fpsimd_load(); -		__this_cpu_write(fpsimd_last_state, st); -		st->cpu = smp_processor_id(); -	} +	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) +		fpsimd_bind_to_cpu();  	local_bh_enable();  } @@ -1052,7 +1065,7 @@ void fpsimd_flush_task_state(struct task_struct *t)  static inline void fpsimd_flush_cpu_state(void)  { -	__this_cpu_write(fpsimd_last_state, NULL); +	__this_cpu_write(fpsimd_last_state.st, NULL);  }  /* @@ -1065,14 +1078,10 @@ static inline void fpsimd_flush_cpu_state(void)  #ifdef CONFIG_ARM64_SVE  void sve_flush_cpu_state(void)  { -	struct fpsimd_state *const fpstate = __this_cpu_read(fpsimd_last_state); -	struct task_struct *tsk; - -	if (!fpstate) -		return; +	struct fpsimd_last_state_struct const *last = +		this_cpu_ptr(&fpsimd_last_state); -	tsk = container_of(fpstate, struct task_struct, thread.fpsimd_state); -	if (test_tsk_thread_flag(tsk, TIF_SVE)) +	if (last->st && last->sve_in_use)  		fpsimd_flush_cpu_state();  }  #endif /* CONFIG_ARM64_SVE */ @@ -1267,7 +1276,7 @@ static inline void fpsimd_pm_init(void) { }  #ifdef CONFIG_HOTPLUG_CPU  static int fpsimd_cpu_dead(unsigned int cpu)  { -	per_cpu(fpsimd_last_state, cpu) = NULL; +	per_cpu(fpsimd_last_state.st, cpu) = NULL;  	return 0;  } diff --git a/arch/arm64/kernel/ftrace-mod.S b/arch/arm64/kernel/ftrace-mod.S deleted file mode 100644 index 00c4025be4ff..000000000000 --- a/arch/arm64/kernel/ftrace-mod.S +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation.
- */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - -	.section	".text.ftrace_trampoline", "ax" -	.align		3 -0:	.quad		0 -__ftrace_trampoline: -	ldr		x16, 0b -	br		x16 -ENDPROC(__ftrace_trampoline) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c13b1fca0e5b..50986e388d2b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -76,7 +76,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)  	if (offset < -SZ_128M || offset >= SZ_128M) {  #ifdef CONFIG_ARM64_MODULE_PLTS -		unsigned long *trampoline; +		struct plt_entry trampoline;  		struct module *mod;  		/* @@ -104,22 +104,24 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)  		 * is added in the future, but for now, the pr_err() below  		 * deals with a theoretical issue only.  		 */ -		trampoline = (unsigned long *)mod->arch.ftrace_trampoline; -		if (trampoline[0] != addr) { -			if (trampoline[0] != 0) { +		trampoline = get_plt_entry(addr); +		if (!plt_entries_equal(mod->arch.ftrace_trampoline, +				       &trampoline)) { +			if (!plt_entries_equal(mod->arch.ftrace_trampoline, +					       &(struct plt_entry){})) {  				pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n");  				return -EINVAL;  			}  			/* point the trampoline to our ftrace entry point */  			module_disable_ro(mod); -			trampoline[0] = addr; +			*mod->arch.ftrace_trampoline = trampoline;  			module_enable_ro(mod, true);  			/* update trampoline before patching in the branch */  			smp_wmb();  		} -		addr = (unsigned long)&trampoline[1]; +		addr = (unsigned long)(void *)mod->arch.ftrace_trampoline;  #else /* CONFIG_ARM64_MODULE_PLTS */  		return -EINVAL;  #endif /* CONFIG_ARM64_MODULE_PLTS */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 67e86a0f57ac..e3cb9fbf96b6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -750,6 +750,7 @@ __primary_switch:  	 * to take into account by discarding the current kernel mapping and  	 * creating a new one.  	 */ +	pre_disable_mmu_workaround  	msr	sctlr_el1, x20			// disable the MMU  	isb  	bl	__create_page_tables		// recreate kernel mapping diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 749f81779420..74bb56f656ef 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -28,6 +28,7 @@  #include <linux/perf_event.h>  #include <linux/ptrace.h>  #include <linux/smp.h> +#include <linux/uaccess.h>  #include <asm/compat.h>  #include <asm/current.h> @@ -36,7 +37,6 @@  #include <asm/traps.h>  #include <asm/cputype.h>  #include <asm/system_misc.h> -#include <asm/uaccess.h>  /* Breakpoint currently in use for each BRP. */  static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index d05dbe658409..ea640f92fe5a 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -11,21 +11,6 @@  #include <linux/module.h>  #include <linux/sort.h> -struct plt_entry { -	/* -	 * A program that conforms to the AArch64 Procedure Call Standard -	 * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or -	 * IP1 (x17) may be inserted at any branch instruction that is -	 * exposed to a relocation that supports long branches. Since that -	 * is exactly what we are dealing with here, we are free to use x16 -	 * as a scratch register in the PLT veneers. 
-	 */ -	__le32	mov0;	/* movn	x16, #0x....			*/ -	__le32	mov1;	/* movk	x16, #0x...., lsl #16		*/ -	__le32	mov2;	/* movk	x16, #0x...., lsl #32		*/ -	__le32	br;	/* br	x16				*/ -}; -  static bool in_init(const struct module *mod, void *loc)  {  	return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size; @@ -40,33 +25,14 @@ u64 module_emit_plt_entry(struct module *mod, void *loc, const Elf64_Rela *rela,  	int i = pltsec->plt_num_entries;  	u64 val = sym->st_value + rela->r_addend; -	/* -	 * MOVK/MOVN/MOVZ opcode: -	 * +--------+------------+--------+-----------+-------------+---------+ -	 * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | -	 * +--------+------------+--------+-----------+-------------+---------+ -	 * -	 * Rd     := 0x10 (x16) -	 * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) -	 * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) -	 * sf     := 1 (64-bit variant) -	 */ -	plt[i] = (struct plt_entry){ -		cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5), -		cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), -		cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), -		cpu_to_le32(0xd61f0200) -	}; +	plt[i] = get_plt_entry(val);  	/*  	 * Check if the entry we just created is a duplicate. Given that the  	 * relocations are sorted, this will be the last entry we allocated.  	 * (if one exists).  	 */ -	if (i > 0 && -	    plt[i].mov0 == plt[i - 1].mov0 && -	    plt[i].mov1 == plt[i - 1].mov1 && -	    plt[i].mov2 == plt[i - 1].mov2) +	if (i > 0 && plt_entries_equal(plt + i, plt + i - 1))  		return (u64)&plt[i - 1];  	pltsec->plt_num_entries++; @@ -154,6 +120,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,  	unsigned long core_plts = 0;  	unsigned long init_plts = 0;  	Elf64_Sym *syms = NULL; +	Elf_Shdr *tramp = NULL;  	int i;  	/* @@ -165,6 +132,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,  			mod->arch.core.plt = sechdrs + i;  		else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt"))  			mod->arch.init.plt = sechdrs + i; +		else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) && +			 !strcmp(secstrings + sechdrs[i].sh_name, +				 ".text.ftrace_trampoline")) +			tramp = sechdrs + i;  		else if (sechdrs[i].sh_type == SHT_SYMTAB)  			syms = (Elf64_Sym *)sechdrs[i].sh_addr;  	} @@ -215,5 +186,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,  	mod->arch.init.plt_num_entries = 0;  	mod->arch.init.plt_max_entries = init_plts; +	if (tramp) { +		tramp->sh_type = SHT_NOBITS; +		tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC; +		tramp->sh_addralign = __alignof__(struct plt_entry); +		tramp->sh_size = sizeof(struct plt_entry); +	} +  	return 0;  } diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds index f7c9781a9d48..22e36a21c113 100644 --- a/arch/arm64/kernel/module.lds +++ b/arch/arm64/kernel/module.lds @@ -1,4 +1,5 @@  SECTIONS {  	.plt (NOLOAD) : { BYTE(0) }  	.init.plt (NOLOAD) : { BYTE(0) } +	.text.ftrace_trampoline (NOLOAD) : { BYTE(0) }  } diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 9eaef51f83ff..3affca3dd96a 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -262,12 +262,6 @@ static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]  	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,  	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, - -	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= 
ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, -	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, - -	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, -	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,  };  static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b2adcce7bc18..6b7dcf4310ac 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -314,6 +314,15 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,  	clear_tsk_thread_flag(p, TIF_SVE);  	p->thread.sve_state = NULL; +	/* +	 * In case p was allocated the same task_struct pointer as some +	 * other recently-exited task, make sure p is disassociated from +	 * any cpu that may have run that now-exited task recently. +	 * Otherwise we could erroneously skip reloading the FPSIMD +	 * registers for p. +	 */ +	fpsimd_flush_task_state(p); +  	if (likely(!(p->flags & PF_KTHREAD))) {  		*childregs = *current_pt_regs();  		childregs->regs[0] = 0; diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index ce704a4aeadd..f407e422a720 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -45,6 +45,7 @@ ENTRY(arm64_relocate_new_kernel)  	mrs	x0, sctlr_el2  	ldr	x1, =SCTLR_ELx_FLAGS  	bic	x0, x0, x1 +	pre_disable_mmu_workaround  	msr	sctlr_el2, x0  	isb  1: diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index dbadfaf850a7..fa63b28c65e0 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -221,3 +221,24 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)  		}  	}  } + + +/* + * After successfully emulating an instruction, we might want to + * return to user space with a KVM_EXIT_DEBUG. We can only do this + * once the emulation is complete, though, so for userspace emulations + * we have to wait until we have re-entered KVM before calling this + * helper. + * + * Return true (and set exit_reason) to return to userspace or false + * if no further action is required. + */ +bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ +	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { +		run->exit_reason = KVM_EXIT_DEBUG; +		run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT; +		return true; +	} +	return false; +} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index b71247995469..304203fa9e33 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -28,6 +28,7 @@  #include <asm/kvm_emulate.h>  #include <asm/kvm_mmu.h>  #include <asm/kvm_psci.h> +#include <asm/debug-monitors.h>  #define CREATE_TRACE_POINTS  #include "trace.h" @@ -187,14 +188,46 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)  }  /* + * We may be single-stepping an emulated instruction. If the emulation + * has been completed in the kernel, we can return to userspace with a + * KVM_EXIT_DEBUG, otherwise userspace needs to complete its + * emulation first. 
+ */ +static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ +	int handled; + +	/* +	 * See ARM ARM B1.14.1: "Hyp traps on instructions +	 * that fail their condition code check" +	 */ +	if (!kvm_condition_valid(vcpu)) { +		kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); +		handled = 1; +	} else { +		exit_handle_fn exit_handler; + +		exit_handler = kvm_get_exit_handler(vcpu); +		handled = exit_handler(vcpu, run); +	} + +	/* +	 * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run +	 * structure if we need to return to userspace. +	 */ +	if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run)) +		handled = 0; + +	return handled; +} + +/*   * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on   * proper exit to userspace.   */  int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,  		       int exception_index)  { -	exit_handle_fn exit_handler; -  	if (ARM_SERROR_PENDING(exception_index)) {  		u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu)); @@ -220,20 +253,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,  		return 1;  	case ARM_EXCEPTION_EL1_SERROR:  		kvm_inject_vabt(vcpu); -		return 1; -	case ARM_EXCEPTION_TRAP: -		/* -		 * See ARM ARM B1.14.1: "Hyp traps on instructions -		 * that fail their condition code check" -		 */ -		if (!kvm_condition_valid(vcpu)) { -			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); +		/* We may still need to return for single-step */ +		if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) +			&& kvm_arm_handle_step_debug(vcpu, run)) +			return 0; +		else  			return 1; -		} - -		exit_handler = kvm_get_exit_handler(vcpu); - -		return exit_handler(vcpu, run); +	case ARM_EXCEPTION_TRAP: +		return handle_trap_exceptions(vcpu, run);  	case ARM_EXCEPTION_HYP_GONE:  		/*  		 * EL2 has been reset to the hyp-stub. This happens when a guest diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 3f9615582377..870828c364c5 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -151,6 +151,7 @@ reset:  	mrs	x5, sctlr_el2  	ldr	x6, =SCTLR_ELx_FLAGS  	bic	x5, x5, x6		// Clear SCTL_M and etc +	pre_disable_mmu_workaround  	msr	sctlr_el2, x5  	isb diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 321c9c05dd9e..f4363d40e2cd 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)  {  	u64 reg; +	/* Clear pmscr in case of early return */ +	*pmscr_el1 = 0; +  	/* SPE present on this CPU? */  	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),  						  ID_AA64DFR0_PMSVER_SHIFT)) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 525c01f48867..f7c651f3a8c0 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -22,6 +22,7 @@  #include <asm/kvm_emulate.h>  #include <asm/kvm_hyp.h>  #include <asm/fpsimd.h> +#include <asm/debug-monitors.h>  static bool __hyp_text __fpsimd_enabled_nvhe(void)  { @@ -269,7 +270,11 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)  	return true;  } -static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu) +/* Skip an instruction which has been emulated. Returns true if + * execution can continue or false if we need to exit hyp mode because + * single-step was in effect. 
+ */ +static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu)  {  	*vcpu_pc(vcpu) = read_sysreg_el2(elr); @@ -282,6 +287,14 @@ static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)  	}  	write_sysreg_el2(*vcpu_pc(vcpu), elr); + +	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { +		vcpu->arch.fault.esr_el2 = +			(ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22; +		return false; +	} else { +		return true; +	}  }  int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) @@ -342,13 +355,21 @@ again:  			int ret = __vgic_v2_perform_cpuif_access(vcpu);  			if (ret == 1) { -				__skip_instr(vcpu); -				goto again; +				if (__skip_instr(vcpu)) +					goto again; +				else +					exit_code = ARM_EXCEPTION_TRAP;  			}  			if (ret == -1) { -				/* Promote an illegal access to an SError */ -				__skip_instr(vcpu); +				/* Promote an illegal access to an +				 * SError. If we would be returning +				 * due to single-step clear the SS +				 * bit so handle_exit knows what to +				 * do after dealing with the error. +				 */ +				if (!__skip_instr(vcpu)) +					*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;  				exit_code = ARM_EXCEPTION_EL1_SERROR;  			} @@ -363,8 +384,10 @@ again:  		int ret = __vgic_v3_perform_cpuif_access(vcpu);  		if (ret == 1) { -			__skip_instr(vcpu); -			goto again; +			if (__skip_instr(vcpu)) +				goto again; +			else +				exit_code = ARM_EXCEPTION_TRAP;  		}  		/* 0 falls through to be handled out of EL2 */ diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index ab9f5f0fb2c7..6f4017046323 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -96,12 +96,6 @@ static void flush_context(unsigned int cpu)  	set_reserved_asid_bits(); -	/* -	 * Ensure the generation bump is observed before we xchg the -	 * active_asids. -	 */ -	smp_wmb(); -  	for_each_possible_cpu(i) {  		asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);  		/* @@ -117,7 +111,10 @@ static void flush_context(unsigned int cpu)  		per_cpu(reserved_asids, i) = asid;  	} -	/* Queue a TLB invalidate and flush the I-cache if necessary. */ +	/* +	 * Queue a TLB invalidation for each CPU to perform on next +	 * context-switch +	 */  	cpumask_setall(&tlb_flush_pending);  } @@ -202,11 +199,18 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)  	asid = atomic64_read(&mm->context.id);  	/* -	 * The memory ordering here is subtle. We rely on the control -	 * dependency between the generation read and the update of -	 * active_asids to ensure that we are synchronised with a -	 * parallel rollover (i.e. this pairs with the smp_wmb() in -	 * flush_context). +	 * The memory ordering here is subtle. +	 * If our ASID matches the current generation, then we update +	 * our active_asids entry with a relaxed xchg. Racing with a +	 * concurrent rollover means that either: +	 * +	 * - We get a zero back from the xchg and end up waiting on the +	 *   lock. Taking the lock synchronises with the rollover and so +	 *   we are forced to see the updated generation. +	 * +	 * - We get a valid ASID back from the xchg, which means the +	 *   relaxed xchg in flush_context will treat us as reserved +	 *   because atomic RmWs are totally ordered for a given location.  	 
*/  	if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits)  	    && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid)) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index ca74a2aace42..7b60d62ac593 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -389,7 +389,7 @@ void ptdump_check_wx(void)  		.check_wx = true,  	}; -	walk_pgd(&st, &init_mm, 0); +	walk_pgd(&st, &init_mm, VA_START);  	note_page(&st, 0, 0, 0);  	if (st.wx_pages || st.uxn_pages)  		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 22168cd0dde7..9b7f89df49db 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -574,7 +574,6 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)  {  	struct siginfo info;  	const struct fault_info *inf; -	int ret = 0;  	inf = esr_to_fault_info(esr);  	pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", @@ -589,7 +588,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)  		if (interrupts_enabled(regs))  			nmi_enter(); -		ret = ghes_notify_sea(); +		ghes_notify_sea();  		if (interrupts_enabled(regs))  			nmi_exit(); @@ -604,7 +603,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)  		info.si_addr  = (void __user *)addr;  	arm64_notify_die("", regs, &info, esr); -	return ret; +	return 0;  }  static const struct fault_info fault_info[] = { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5960bef0170d..00e7b900ca41 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -476,6 +476,8 @@ void __init arm64_memblock_init(void)  	reserve_elfcorehdr(); +	high_memory = __va(memblock_end_of_DRAM() - 1) + 1; +  	dma_contiguous_reserve(arm64_dma_phys_limit);  	memblock_allow_resize(); @@ -502,7 +504,6 @@ void __init bootmem_init(void)  	sparse_init();  	zone_sizes_init(min, max); -	high_memory = __va((max << PAGE_SHIFT) - 1) + 1;  	memblock_dump_all();  } diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 371c5f03a170..051e71ec3335 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -26,7 +26,7 @@  #include <asm/page.h>  #include <asm/tlbflush.h> -static struct kmem_cache *pgd_cache; +static struct kmem_cache *pgd_cache __ro_after_init;  pgd_t *pgd_alloc(struct mm_struct *mm)  { diff --git a/arch/blackfin/include/uapi/asm/Kbuild b/arch/blackfin/include/uapi/asm/Kbuild index aa624b4ab655..2240b38c2915 100644 --- a/arch/blackfin/include/uapi/asm/Kbuild +++ b/arch/blackfin/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += ioctl.h  generic-y += ipcbuf.h diff --git a/arch/c6x/include/uapi/asm/Kbuild b/arch/c6x/include/uapi/asm/Kbuild index 67ee896a76a7..26644e15d854 100644 --- a/arch/c6x/include/uapi/asm/Kbuild +++ b/arch/c6x/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/cris/include/uapi/asm/Kbuild b/arch/cris/include/uapi/asm/Kbuild index 3687b54bb18e..3470c6e9c7b9 100644 --- a/arch/cris/include/uapi/asm/Kbuild +++ b/arch/cris/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y 
+= bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/frv/include/uapi/asm/Kbuild b/arch/frv/include/uapi/asm/Kbuild index b15bf6bc0e94..14a2e9af97e9 100644 --- a/arch/frv/include/uapi/asm/Kbuild +++ b/arch/frv/include/uapi/asm/Kbuild @@ -1,2 +1,4 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm + +generic-y += bpf_perf_event.h diff --git a/arch/h8300/include/uapi/asm/Kbuild b/arch/h8300/include/uapi/asm/Kbuild index 187aed820e71..2f65f78792cb 100644 --- a/arch/h8300/include/uapi/asm/Kbuild +++ b/arch/h8300/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/hexagon/include/uapi/asm/Kbuild b/arch/hexagon/include/uapi/asm/Kbuild index cb5df3aad3a8..41a176dbb53e 100644 --- a/arch/hexagon/include/uapi/asm/Kbuild +++ b/arch/hexagon/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild index 13a97aa2285f..f5c6967a93bb 100644 --- a/arch/ia64/include/uapi/asm/Kbuild +++ b/arch/ia64/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += kvm_para.h diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index c6ecb97151a2..9025699049ca 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk)  	}  	if (ti->softirq_time) { -		delta = cycle_to_nsec(ti->softirq_time)); +		delta = cycle_to_nsec(ti->softirq_time);  		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);  	} diff --git a/arch/m32r/include/uapi/asm/Kbuild b/arch/m32r/include/uapi/asm/Kbuild index 1c44d3b3eba0..451bf6071c6e 100644 --- a/arch/m32r/include/uapi/asm/Kbuild +++ b/arch/m32r/include/uapi/asm/Kbuild @@ -1,5 +1,6 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += kvm_para.h  generic-y += siginfo.h diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index cb79fba79d43..b88a8dd14933 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -122,7 +122,6 @@ void abort(void)  	/* if that doesn't kill us, halt */  	panic("Oops failed to kill thread");  } -EXPORT_SYMBOL(abort);  void __init trap_init(void)  { diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig index 55e55dbc2fb6..3d07b1de7eb0 100644 --- a/arch/m68k/configs/stmark2_defconfig +++ b/arch/m68k/configs/stmark2_defconfig @@ -5,7 +5,6 @@ CONFIG_SYSVIPC=y  CONFIG_LOG_BUF_SHIFT=14  CONFIG_NAMESPACES=y  CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="../uClinux-dist/romfs"  # CONFIG_RD_BZIP2 is not set  # CONFIG_RD_LZMA is not set  # CONFIG_RD_XZ is not set diff --git a/arch/m68k/include/uapi/asm/Kbuild b/arch/m68k/include/uapi/asm/Kbuild index 3717b64a620d..c2e26a44c482 100644 --- a/arch/m68k/include/uapi/asm/Kbuild +++ b/arch/m68k/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += ioctl.h  generic-y += ipcbuf.h diff --git 
a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds index 3aa571a513b5..cf6edda38971 100644 --- a/arch/m68k/kernel/vmlinux-nommu.lds +++ b/arch/m68k/kernel/vmlinux-nommu.lds @@ -45,6 +45,8 @@ SECTIONS {  	.text : {  		HEAD_TEXT  		TEXT_TEXT +		IRQENTRY_TEXT +		SOFTIRQENTRY_TEXT  		SCHED_TEXT  		CPUIDLE_TEXT  		LOCK_TEXT diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 89172b8974b9..625a5785804f 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -16,6 +16,8 @@ SECTIONS    .text : {  	HEAD_TEXT  	TEXT_TEXT +	IRQENTRY_TEXT +	SOFTIRQENTRY_TEXT  	SCHED_TEXT  	CPUIDLE_TEXT  	LOCK_TEXT diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 293990efc917..9868270b0984 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -16,6 +16,8 @@ SECTIONS    .text : {  	HEAD_TEXT  	TEXT_TEXT +	IRQENTRY_TEXT +	SOFTIRQENTRY_TEXT  	SCHED_TEXT  	CPUIDLE_TEXT  	LOCK_TEXT diff --git a/arch/metag/include/uapi/asm/Kbuild b/arch/metag/include/uapi/asm/Kbuild index 6ac763d9a3e3..f9eaf07d29f8 100644 --- a/arch/metag/include/uapi/asm/Kbuild +++ b/arch/metag/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index 99472d2ca340..97559fe0b953 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -13,6 +13,7 @@  #include <linux/atomic.h>  #include <linux/mm_types.h> +#include <linux/sched.h>  #include <asm/bitops.h>  #include <asm/mmu.h> diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index 06609ca36115..2c6a6bffea32 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 7c8aab23bce8..b1f66699677d 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -16,7 +16,6 @@ generic-y += qrwlock.h  generic-y += qspinlock.h  generic-y += sections.h  generic-y += segment.h -generic-y += serial.h  generic-y += trace_clock.h  generic-y += unaligned.h  generic-y += user.h diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9e9e94415d08..1a508a74d48d 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -552,7 +552,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)  extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,  		       pmd_t *pmdp, pmd_t pmd); -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write  static inline int pmd_write(pmd_t pmd)  {  	return !!(pmd_val(pmd) & _PAGE_WRITE); diff --git a/arch/mips/include/asm/serial.h b/arch/mips/include/asm/serial.h new file mode 100644 index 000000000000..1d830c6666c2 --- /dev/null +++ b/arch/mips/include/asm/serial.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2017 MIPS Tech, LLC + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free 
Software Foundation;  either version 2 of the  License, or (at your + * option) any later version. + */ +#ifndef __ASM__SERIAL_H +#define __ASM__SERIAL_H + +#ifdef CONFIG_MIPS_GENERIC +/* + * Generic kernels cannot know a correct value for all platforms at + * compile time. Set it to 0 to prevent 8250_early using it + */ +#define BASE_BAUD 0 +#else +#include <asm-generic/serial.h> +#endif + +#endif /* __ASM__SERIAL_H */ diff --git a/arch/mips/include/uapi/asm/Kbuild b/arch/mips/include/uapi/asm/Kbuild index a0266feba9e6..7a4becd8963a 100644 --- a/arch/mips/include/uapi/asm/Kbuild +++ b/arch/mips/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += ipcbuf.h diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S index c7ed26029cbb..e68e6e04063a 100644 --- a/arch/mips/kernel/cps-vec.S +++ b/arch/mips/kernel/cps-vec.S @@ -235,6 +235,7 @@ LEAF(mips_cps_core_init)  	has_mt	t0, 3f  	.set	push +	.set	MIPS_ISA_LEVEL_RAW  	.set	mt  	/* Only allow 1 TC per VPE to execute... */ @@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes)  #elif defined(CONFIG_MIPS_MT)  	.set	push +	.set	MIPS_ISA_LEVEL_RAW  	.set	mt  	/* If the core doesn't support MT then return */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 45d0b6b037ee..57028d49c202 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value)  	struct task_struct *t;  	int max_users; +	/* If nothing to change, return right away, successfully.  */ +	if (value == mips_get_process_fp_mode(task)) +		return 0; + +	/* Only accept a mode change if 64-bit FP enabled for o32.  */ +	if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) +		return -EOPNOTSUPP; + +	/* And only for o32 tasks.  */ +	if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS)) +		return -EOPNOTSUPP; +  	/* Check the value is valid */  	if (value & ~known_bits)  		return -EOPNOTSUPP; diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index efbd8df8b665..0b23b1ad99e6 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -419,63 +419,160 @@ static int gpr64_set(struct task_struct *target,  #endif /* CONFIG_64BIT */ +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * !CONFIG_CPU_HAS_MSA variant.  FP context's general register slots + * correspond 1:1 to buffer slots.  Only general registers are copied. + */ +static int fpr_get_fpa(struct task_struct *target, +		       unsigned int *pos, unsigned int *count, +		       void **kbuf, void __user **ubuf) +{ +	return user_regset_copyout(pos, count, kbuf, ubuf, +				   &target->thread.fpu, +				   0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); +} + +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * CONFIG_CPU_HAS_MSA variant.  Only lower 64 bits of FP context's + * general register slots are copied to buffer slots.  Only general + * registers are copied. 
+ */ +static int fpr_get_msa(struct task_struct *target, +		       unsigned int *pos, unsigned int *count, +		       void **kbuf, void __user **ubuf) +{ +	unsigned int i; +	u64 fpr_val; +	int err; + +	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); +	for (i = 0; i < NUM_FPU_REGS; i++) { +		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); +		err = user_regset_copyout(pos, count, kbuf, ubuf, +					  &fpr_val, i * sizeof(elf_fpreg_t), +					  (i + 1) * sizeof(elf_fpreg_t)); +		if (err) +			return err; +	} + +	return 0; +} + +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. + */  static int fpr_get(struct task_struct *target,  		   const struct user_regset *regset,  		   unsigned int pos, unsigned int count,  		   void *kbuf, void __user *ubuf)  { -	unsigned i; +	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);  	int err; -	u64 fpr_val; -	/* XXX fcr31  */ +	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) +		err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); +	else +		err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); +	if (err) +		return err; -	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) -		return user_regset_copyout(&pos, &count, &kbuf, &ubuf, -					   &target->thread.fpu, -					   0, sizeof(elf_fpregset_t)); +	err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, +				  &target->thread.fpu.fcr31, +				  fcr31_pos, fcr31_pos + sizeof(u32)); -	for (i = 0; i < NUM_FPU_REGS; i++) { -		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); -		err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, -					  &fpr_val, i * sizeof(elf_fpreg_t), -					  (i + 1) * sizeof(elf_fpreg_t)); +	return err; +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * !CONFIG_CPU_HAS_MSA variant.   Buffer slots correspond 1:1 to FP + * context's general register slots.  Only general registers are copied. + */ +static int fpr_set_fpa(struct task_struct *target, +		       unsigned int *pos, unsigned int *count, +		       const void **kbuf, const void __user **ubuf) +{ +	return user_regset_copyin(pos, count, kbuf, ubuf, +				  &target->thread.fpu, +				  0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * CONFIG_CPU_HAS_MSA variant.  Buffer slots are copied to lower 64 + * bits only of FP context's general register slots.  Only general + * registers are copied. + */ +static int fpr_set_msa(struct task_struct *target, +		       unsigned int *pos, unsigned int *count, +		       const void **kbuf, const void __user **ubuf) +{ +	unsigned int i; +	u64 fpr_val; +	int err; + +	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); +	for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) { +		err = user_regset_copyin(pos, count, kbuf, ubuf, +					 &fpr_val, i * sizeof(elf_fpreg_t), +					 (i + 1) * sizeof(elf_fpreg_t));  		if (err)  			return err; +		set_fpr64(&target->thread.fpu.fpr[i], 0, fpr_val);  	}  	return 0;  } +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. + * + * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', + * which is supposed to have been guaranteed by the kernel before + * calling us, e.g. in `ptrace_regset'.  
We enforce that requirement, + * so that we can safely avoid preinitializing temporaries for + * partial register writes. + */  static int fpr_set(struct task_struct *target,  		   const struct user_regset *regset,  		   unsigned int pos, unsigned int count,  		   const void *kbuf, const void __user *ubuf)  { -	unsigned i; +	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); +	u32 fcr31;  	int err; -	u64 fpr_val; -	/* XXX fcr31  */ +	BUG_ON(count % sizeof(elf_fpreg_t)); + +	if (pos + count > sizeof(elf_fpregset_t)) +		return -EIO;  	init_fp_ctx(target); -	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) -		return user_regset_copyin(&pos, &count, &kbuf, &ubuf, -					  &target->thread.fpu, -					  0, sizeof(elf_fpregset_t)); +	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) +		err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); +	else +		err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); +	if (err) +		return err; -	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); -	for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { +	if (count > 0) {  		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, -					 &fpr_val, i * sizeof(elf_fpreg_t), -					 (i + 1) * sizeof(elf_fpreg_t)); +					 &fcr31, +					 fcr31_pos, fcr31_pos + sizeof(u32));  		if (err)  			return err; -		set_fpr64(&target->thread.fpu.fpr[i], 0, fpr_val); + +		ptrace_setfcr31(target, fcr31);  	} -	return 0; +	return err;  }  enum mips_regset { diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d535edc01434..75fdeaa8c62f 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -445,10 +445,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  {  	int r = -EINTR; -	sigset_t sigsaved; -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); +	kvm_sigset_activate(vcpu);  	if (vcpu->mmio_needed) {  		if (!vcpu->mmio_is_write) @@ -480,8 +478,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  	local_irq_enable();  out: -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &sigsaved, NULL); +	kvm_sigset_deactivate(vcpu);  	return r;  } diff --git a/arch/mn10300/include/uapi/asm/Kbuild b/arch/mn10300/include/uapi/asm/Kbuild index c94ee54210bc..81271d3af47c 100644 --- a/arch/mn10300/include/uapi/asm/Kbuild +++ b/arch/mn10300/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y	+= bpf_perf_event.h  generic-y	+= siginfo.h diff --git a/arch/nios2/include/uapi/asm/Kbuild b/arch/nios2/include/uapi/asm/Kbuild index ffca24da7647..13a3d77b4d7b 100644 --- a/arch/nios2/include/uapi/asm/Kbuild +++ b/arch/nios2/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/openrisc/include/uapi/asm/Kbuild b/arch/openrisc/include/uapi/asm/Kbuild index 62286dbeb904..130c16ccba0a 100644 --- a/arch/openrisc/include/uapi/asm/Kbuild +++ b/arch/openrisc/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 9345b44b86f0..f57118e1f6b4 
100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -123,8 +123,8 @@ int puts(const char *s)  	while ((nuline = strchr(s, '\n')) != NULL) {  		if (nuline != s)  			pdc_iodc_print(s, nuline - s); -			pdc_iodc_print("\r\n", 2); -			s = nuline + 1; +		pdc_iodc_print("\r\n", 2); +		s = nuline + 1;  	}  	if (*s != '\0')  		pdc_iodc_print(s, strlen(s)); diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index dd5a08aaa4da..3eb4bfc1fb36 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -12,6 +12,7 @@     for the semaphore.  */  #define __PA_LDCW_ALIGNMENT	16 +#define __PA_LDCW_ALIGN_ORDER	4  #define __ldcw_align(a) ({					\  	unsigned long __ret = (unsigned long) &(a)->lock[0];	\  	__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\ @@ -29,6 +30,7 @@     ldcd). */  #define __PA_LDCW_ALIGNMENT	4 +#define __PA_LDCW_ALIGN_ORDER	2  #define __ldcw_align(a) (&(a)->slock)  #define __LDCW	"ldcw,co" diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index c980a02a52bc..598c8d60fa5e 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -35,7 +35,12 @@ struct thread_info {  /* thread information allocation */ +#ifdef CONFIG_IRQSTACKS +#define THREAD_SIZE_ORDER	2 /* PA-RISC requires at least 16k stack */ +#else  #define THREAD_SIZE_ORDER	3 /* PA-RISC requires at least 32k stack */ +#endif +  /* Be sure to hunt all references to this down when you change the size of   * the kernel stack */  #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER) diff --git a/arch/parisc/include/uapi/asm/Kbuild b/arch/parisc/include/uapi/asm/Kbuild index 196d2a4efb31..286ef5a5904b 100644 --- a/arch/parisc/include/uapi/asm/Kbuild +++ b/arch/parisc/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h +generic-y += bpf_perf_event.h  generic-y += kvm_para.h  generic-y += param.h  generic-y += poll.h diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index d8f77358e2ba..29b99b8964aa 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -870,7 +870,7 @@ static void print_parisc_device(struct parisc_device *dev)  	static int count;  	print_pa_hwpath(dev, hw_path); -	printk(KERN_INFO "%d. %s at 0x%p [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", +	printk(KERN_INFO "%d. 
%s at 0x%px [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }",  		++count, dev->name, (void*) dev->hpa.start, hw_path, dev->id.hw_type,  		dev->id.hversion_rev, dev->id.hversion, dev->id.sversion); diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index a4fd296c958e..e95207c0565e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -35,6 +35,7 @@  #include <asm/pgtable.h>  #include <asm/signal.h>  #include <asm/unistd.h> +#include <asm/ldcw.h>  #include <asm/thread_info.h>  #include <linux/linkage.h> @@ -46,6 +47,14 @@  #endif  	.import		pa_tlb_lock,data +	.macro  load_pa_tlb_lock reg +#if __PA_LDCW_ALIGNMENT > 4 +	load32	PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg +	depi	0,31,__PA_LDCW_ALIGN_ORDER, \reg +#else +	load32	PA(pa_tlb_lock), \reg +#endif +	.endm  	/* space_to_prot macro creates a prot id from a space id */ @@ -457,7 +466,7 @@  	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault  #ifdef CONFIG_SMP  	cmpib,COND(=),n	0,\spc,2f -	load32		PA(pa_tlb_lock),\tmp +	load_pa_tlb_lock \tmp  1:	LDCW		0(\tmp),\tmp1  	cmpib,COND(=)	0,\tmp1,1b  	nop @@ -480,7 +489,7 @@  	/* Release pa_tlb_lock lock. */  	.macro		tlb_unlock1	spc,tmp  #ifdef CONFIG_SMP -	load32		PA(pa_tlb_lock),\tmp +	load_pa_tlb_lock \tmp  	tlb_unlock0	\spc,\tmp  #endif  	.endm @@ -878,9 +887,6 @@ ENTRY_CFI(syscall_exit_rfi)  	STREG   %r19,PT_SR7(%r16)  intr_return: -	/* NOTE: Need to enable interrupts incase we schedule. */ -	ssm     PSW_SM_I, %r0 -  	/* check for reschedule */  	mfctl   %cr30,%r1  	LDREG   TI_FLAGS(%r1),%r19	/* sched.h: TIF_NEED_RESCHED */ @@ -907,6 +913,11 @@ intr_check_sig:  	LDREG	PT_IASQ1(%r16), %r20  	cmpib,COND(=),n 0,%r20,intr_restore /* backward */ +	/* NOTE: We need to enable interrupts if we have to deliver +	 * signals. We used to do this earlier but it caused kernel +	 * stack overflows. */ +	ssm     PSW_SM_I, %r0 +  	copy	%r0, %r25			/* long in_syscall = 0 */  #ifdef CONFIG_64BIT  	ldo	-16(%r30),%r29			/* Reference param save area */ @@ -958,6 +969,10 @@ intr_do_resched:  	cmpib,COND(=)	0, %r20, intr_do_preempt  	nop +	/* NOTE: We need to enable interrupts if we schedule.  We used +	 * to do this earlier but it caused kernel stack overflows. 
*/ +	ssm     PSW_SM_I, %r0 +  #ifdef CONFIG_64BIT  	ldo	-16(%r30),%r29		/* Reference param save area */  #endif diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S index e3a8e5e4d5de..8d072c44f300 100644 --- a/arch/parisc/kernel/hpmc.S +++ b/arch/parisc/kernel/hpmc.S @@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)  	__INITRODATA +	.align 4  	.export os_hpmc_size  os_hpmc_size:  	.word .os_hpmc_end-.os_hpmc diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index adf7187f8951..2d40c4ff3f69 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -36,6 +36,7 @@  #include <asm/assembly.h>  #include <asm/pgtable.h>  #include <asm/cache.h> +#include <asm/ldcw.h>  #include <linux/linkage.h>  	.text @@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local)  	.macro	tlb_lock	la,flags,tmp  #ifdef CONFIG_SMP -	ldil		L%pa_tlb_lock,%r1 -	ldo		R%pa_tlb_lock(%r1),\la +#if __PA_LDCW_ALIGNMENT > 4 +	load32		pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la +	depi		0,31,__PA_LDCW_ALIGN_ORDER, \la +#else +	load32		pa_tlb_lock, \la +#endif  	rsm		PSW_SM_I,\flags  1:	LDCW		0(\la),\tmp  	cmpib,<>,n	0,\tmp,3f diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 30f92391a93e..cad3e8661cd6 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -39,6 +39,7 @@  #include <linux/kernel.h>  #include <linux/mm.h>  #include <linux/fs.h> +#include <linux/cpu.h>  #include <linux/module.h>  #include <linux/personality.h>  #include <linux/ptrace.h> @@ -184,6 +185,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r)  }  /* + * Idle thread support + * + * Detect when running on QEMU with SeaBIOS PDC Firmware and let + * QEMU idle the host too. + */ + +int running_on_qemu __read_mostly; + +void __cpuidle arch_cpu_idle_dead(void) +{ +	/* nop on real hardware, qemu will offline CPU. */ +	asm volatile("or %%r31,%%r31,%%r31\n":::); +} + +void __cpuidle arch_cpu_idle(void) +{ +	local_irq_enable(); + +	/* nop on real hardware, qemu will idle sleep. 
*/ +	asm volatile("or %%r10,%%r10,%%r10\n":::); +} + +static int __init parisc_idle_init(void) +{ +	const char *marker; + +	/* check QEMU/SeaBIOS marker in PAGE0 */ +	marker = (char *) &PAGE0->pad0; +	running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0); + +	if (!running_on_qemu) +		cpu_idle_poll_ctrl(1); + +	return 0; +} +arch_initcall(parisc_idle_init); + +/*   * Copy architecture-specific thread state   */  int diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 5a657986ebbf..143f90e2f9f3 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -15,7 +15,6 @@  #include <linux/slab.h>  #include <linux/kallsyms.h>  #include <linux/sort.h> -#include <linux/sched.h>  #include <linux/uaccess.h>  #include <asm/assembly.h> diff --git a/arch/parisc/lib/delay.c b/arch/parisc/lib/delay.c index 7eab4bb8abe6..66e506520505 100644 --- a/arch/parisc/lib/delay.c +++ b/arch/parisc/lib/delay.c @@ -16,9 +16,7 @@  #include <linux/preempt.h>  #include <linux/init.h> -#include <asm/processor.h>  #include <asm/delay.h> -  #include <asm/special_insns.h>    /* for mfctl() */  #include <asm/processor.h> /* for boot_cpu_data */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 13f7854e0d49..48f41399fc0b 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -631,11 +631,11 @@ void __init mem_init(void)  	mem_init_print_info(NULL);  #ifdef CONFIG_DEBUG_KERNEL /* double-sanity-check paranoia */  	printk("virtual kernel memory layout:\n" -	       "    vmalloc : 0x%p - 0x%p   (%4ld MB)\n" -	       "    memory  : 0x%p - 0x%p   (%4ld MB)\n" -	       "      .init : 0x%p - 0x%p   (%4ld kB)\n" -	       "      .data : 0x%p - 0x%p   (%4ld kB)\n" -	       "      .text : 0x%p - 0x%p   (%4ld kB)\n", +	       "    vmalloc : 0x%px - 0x%px   (%4ld MB)\n" +	       "    memory  : 0x%px - 0x%px   (%4ld MB)\n" +	       "      .init : 0x%px - 0x%px   (%4ld kB)\n" +	       "      .data : 0x%px - 0x%px   (%4ld kB)\n" +	       "      .text : 0x%px - 0x%px   (%4ld kB)\n",  	       (void*)VMALLOC_START, (void*)VMALLOC_END,  	       (VMALLOC_END - VMALLOC_START) >> 20, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9a677cd5997f..44697817ccc6 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1005,7 +1005,6 @@ static inline int pmd_protnone(pmd_t pmd)  }  #endif /* CONFIG_NUMA_BALANCING */ -#define __HAVE_ARCH_PMD_WRITE  #define pmd_write(pmd)		pte_write(pmd_pte(pmd))  #define __pmd_write(pmd)	__pte_write(pmd_pte(pmd))  #define pmd_savedwrite(pmd)	pte_savedwrite(pmd_pte(pmd)) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b6..555e22d5e07f 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e:  	ori	r3,r3,vector_offset@l;		\  	mtspr	SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL							\ +	rfi + +#define RFI_TO_USER							\ +	rfi +  #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index b27205297e1d..7197b179c1b1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,6 +74,59 @@   */  #define EX_R3		EX_DAR +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush 
the + * L1-D cache when returning to userspace or a guest. + */ +#define RFI_FLUSH_SLOT							\ +	RFI_FLUSH_FIXUP_SECTION;					\ +	nop;								\ +	nop;								\ +	nop + +#define RFI_TO_KERNEL							\ +	rfid + +#define RFI_TO_USER							\ +	RFI_FLUSH_SLOT;							\ +	rfid;								\ +	b	rfi_flush_fallback + +#define RFI_TO_USER_OR_KERNEL						\ +	RFI_FLUSH_SLOT;							\ +	rfid;								\ +	b	rfi_flush_fallback + +#define RFI_TO_GUEST							\ +	RFI_FLUSH_SLOT;							\ +	rfid;								\ +	b	rfi_flush_fallback + +#define HRFI_TO_KERNEL							\ +	hrfid + +#define HRFI_TO_USER							\ +	RFI_FLUSH_SLOT;							\ +	hrfid;								\ +	b	hrfi_flush_fallback + +#define HRFI_TO_USER_OR_KERNEL						\ +	RFI_FLUSH_SLOT;							\ +	hrfid;								\ +	b	hrfi_flush_fallback + +#define HRFI_TO_GUEST							\ +	RFI_FLUSH_SLOT;							\ +	hrfid;								\ +	b	hrfi_flush_fallback + +#define HRFI_TO_UNKNOWN							\ +	RFI_FLUSH_SLOT;							\ +	hrfid;								\ +	b	hrfi_flush_fallback +  #ifdef CONFIG_RELOCATABLE  #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\  	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\ @@ -218,7 +271,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)  	mtspr	SPRN_##h##SRR0,r12;					\  	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\  	mtspr	SPRN_##h##SRR1,r10;					\ -	h##rfid;							\ +	h##RFI_TO_KERNEL;						\  	b	.	/* prevent speculative execution */  #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\  	__EXCEPTION_PROLOG_PSERIES_1(label, h) @@ -232,7 +285,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)  	mtspr	SPRN_##h##SRR0,r12;					\  	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\  	mtspr	SPRN_##h##SRR1,r10;					\ -	h##rfid;							\ +	h##RFI_TO_KERNEL;						\  	b	.	/* prevent speculative execution */  #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 8f88f771cc55..1e82eb3caabd 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -187,7 +187,20 @@ label##3:					       	\  	FTR_ENTRY_OFFSET label##1b-label##3b;		\  	.popsection; +#define RFI_FLUSH_FIXUP_SECTION				\ +951:							\ +	.pushsection __rfi_flush_fixup,"a";		\ +	.align 2;					\ +952:							\ +	FTR_ENTRY_OFFSET 951b-952b;			\ +	.popsection; + +  #ifndef __ASSEMBLY__ +#include <linux/types.h> + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; +  void apply_feature_fixups(void);  void setup_feature_keys(void);  #endif diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a409177be8bd..f0461618bf7b 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -241,6 +241,7 @@  #define H_GET_HCA_INFO          0x1B8  #define H_GET_PERF_COUNT        0x1BC  #define H_MANAGE_TRACE          0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8  #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4  #define H_QUERY_INT_STATE       0x1E4  #define H_POLL_PENDING		0x1D8 @@ -330,6 +331,17 @@  #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2  /* >= 0 values are CPU number */ +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31	(1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED	(1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30	(1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2	(1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV	(1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY	(1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR	(1ull << 62) // IBM bit 1 +#define 
H_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ull << 61) // IBM bit 2 +  /* Flag values used in H_REGISTER_PROC_TBL hcall */  #define PROC_TABLE_OP_MASK	0x18  #define PROC_TABLE_DEREG	0x10 @@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc)  	}  } +struct h_cpu_char_result { +	u64 character; +	u64 behaviour; +}; +  #endif /* __ASSEMBLY__ */  #endif /* __KERNEL__ */  #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 96753f3aac6d..941c2a3f231b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -180,6 +180,7 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,  		struct iommu_group *grp);  extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);  extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm); +extern void kvmppc_setup_partition_table(struct kvm *kvm);  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,  				struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 73b92017b6d7..cd2fc1cc1cc7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -76,6 +76,7 @@ struct machdep_calls {  	void __noreturn	(*restart)(char *cmd);  	void __noreturn (*halt)(void); +	void		(*panic)(char *str);  	void		(*cpu_die)(void);  	long		(*time_init)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 6177d43f0ce8..e2a2b8400490 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,  #endif  } -static inline void arch_dup_mmap(struct mm_struct *oldmm, -				 struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, +				struct mm_struct *mm)  { +	return 0;  }  #ifndef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 3892db93b837..23ac7fc0af23 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -232,6 +232,16 @@ struct paca_struct {  	struct sibling_subcore_state *sibling_subcore_state;  #endif  #endif +#ifdef CONFIG_PPC_BOOK3S_64 +	/* +	 * rfi fallback flush must be in its own cacheline to prevent +	 * other paca data leaking into the L1d +	 */ +	u64 exrfi[EX_SIZE] __aligned(0x80); +	void *rfi_flush_fallback_area; +	u64 l1d_flush_congruence; +	u64 l1d_flush_sets; +#endif  };  extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 7f01b22fa6cb..55eddf50d149 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu)  	return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);  } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ +	unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; +	long rc; + +	rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); +	if (rc == H_SUCCESS) { +		p->character = retbuf[0]; +		p->behaviour = retbuf[1]; +	} + +	return rc; +} +  #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 257d23dbf55d..469b7fdc9be4 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -24,6 +24,7 @@ extern 
void reloc_got2(unsigned long);  void check_for_initrd(void);  void initmem_init(void); +void setup_panic(void);  #define ARCH_PANIC_TIMEOUT 180  #ifdef CONFIG_PPC_PSERIES @@ -38,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {}  static inline void pseries_little_endian_exceptions(void) {}  #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { +	L1D_FLUSH_NONE		= 0x1, +	L1D_FLUSH_FALLBACK	= 0x2, +	L1D_FLUSH_ORI		= 0x4, +	L1D_FLUSH_MTTRIG	= 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); +  #endif /* !__ASSEMBLY__ */  #endif	/* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 0d960ef78a9a..1a6ed5919ffd 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += param.h  generic-y += poll.h  generic-y += resource.h diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..f390d57cf2e1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -237,6 +237,11 @@ int main(void)  	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);  	OFFSET(PACA_IN_MCE, paca_struct, in_mce);  	OFFSET(PACA_IN_NMI, paca_struct, in_nmi); +	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area); +	OFFSET(PACA_EXRFI, paca_struct, exrfi); +	OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence); +	OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets); +  #endif  	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);  	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 610955fe8b81..679bbe714e85 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -102,6 +102,7 @@ _GLOBAL(__setup_cpu_power9)  	li	r0,0  	mtspr	SPRN_PSSCR,r0  	mtspr	SPRN_LPID,r0 +	mtspr	SPRN_PID,r0  	mfspr	r3,SPRN_LPCR  	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)  	or	r3, r3, r4 @@ -126,6 +127,7 @@ _GLOBAL(__restore_cpu_power9)  	li	r0,0  	mtspr	SPRN_PSSCR,r0  	mtspr	SPRN_LPID,r0 +	mtspr	SPRN_PID,r0  	mfspr   r3,SPRN_LPCR  	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)  	or	r3, r3, r4 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3320bcac7192..2748584b767d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,11 @@  #include <asm/tm.h>  #include <asm/ppc-opcode.h>  #include <asm/export.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/exception-64s.h> +#else +#include <asm/exception-64e.h> +#endif  /*   * System calls. @@ -262,13 +267,23 @@ BEGIN_FTR_SECTION  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)  	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */ +	ld	r2,GPR2(r1) +	ld	r1,GPR1(r1) +	mtlr	r4 +	mtcr	r5 +	mtspr	SPRN_SRR0,r7 +	mtspr	SPRN_SRR1,r8 +	RFI_TO_USER +	b	.	/* prevent speculative execution */ + +	/* exit to kernel */  1:	ld	r2,GPR2(r1)  	ld	r1,GPR1(r1)  	mtlr	r4  	mtcr	r5  	mtspr	SPRN_SRR0,r7  	mtspr	SPRN_SRR1,r8 -	RFI +	RFI_TO_KERNEL  	b	.	
/* prevent speculative execution */  .Lsyscall_error: @@ -397,8 +412,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)  	mtmsrd	r10, 1  	mtspr	SPRN_SRR0, r11  	mtspr	SPRN_SRR1, r12 - -	rfid +	RFI_TO_USER  	b	.	/* prevent speculative execution */  #endif  _ASM_NOKPROBE_SYMBOL(system_call_common); @@ -878,7 +892,7 @@ BEGIN_FTR_SECTION  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)  	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)  	REST_GPR(13, r1) -1: +  	mtspr	SPRN_SRR1,r3  	ld	r2,_CCR(r1) @@ -891,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)  	ld	r3,GPR3(r1)  	ld	r4,GPR4(r1)  	ld	r1,GPR1(r1) +	RFI_TO_USER +	b	.	/* prevent speculative execution */ -	rfid +1:	mtspr	SPRN_SRR1,r3 + +	ld	r2,_CCR(r1) +	mtcrf	0xFF,r2 +	ld	r2,_NIP(r1) +	mtspr	SPRN_SRR0,r2 + +	ld	r0,GPR0(r1) +	ld	r2,GPR2(r1) +	ld	r3,GPR3(r1) +	ld	r4,GPR4(r1) +	ld	r1,GPR1(r1) +	RFI_TO_KERNEL  	b	.	/* prevent speculative execution */  #endif /* CONFIG_PPC_BOOK3E */ @@ -1073,7 +1101,7 @@ __enter_rtas:  	mtspr	SPRN_SRR0,r5  	mtspr	SPRN_SRR1,r6 -	rfid +	RFI_TO_KERNEL  	b	.	/* prevent speculative execution */  rtas_return_loc: @@ -1098,7 +1126,7 @@ rtas_return_loc:  	mtspr	SPRN_SRR0,r3  	mtspr	SPRN_SRR1,r4 -	rfid +	RFI_TO_KERNEL  	b	.	/* prevent speculative execution */  _ASM_NOKPROBE_SYMBOL(__enter_rtas)  _ASM_NOKPROBE_SYMBOL(rtas_return_loc) @@ -1171,7 +1199,7 @@ _GLOBAL(enter_prom)  	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)  	andc	r11,r11,r12  	mtsrr1	r11 -	rfid +	RFI_TO_KERNEL  #endif /* CONFIG_PPC_BOOK3E */  1:	/* Return from OF */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e441b469dc8f..2dc10bf646b8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -256,7 +256,7 @@ BEGIN_FTR_SECTION  	LOAD_HANDLER(r12, machine_check_handle_early)  1:	mtspr	SPRN_SRR0,r12  	mtspr	SPRN_SRR1,r11 -	rfid +	RFI_TO_KERNEL  	b	.	/* prevent speculative execution */  2:  	/* Stack overflow. Stay on emergency stack and panic. @@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)  	li	r3,MSR_ME  	andc	r10,r10,r3		/* Turn off MSR_ME */  	mtspr	SPRN_SRR1,r10 -	rfid +	RFI_TO_KERNEL  	b	.  2:  	/* @@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)  	 */  	bl	machine_check_queue_event  	MACHINE_CHECK_HANDLER_WINDUP -	rfid +	RFI_TO_USER_OR_KERNEL  9:  	/* Deliver the machine check to host kernel in V mode. */  	MACHINE_CHECK_HANDLER_WINDUP @@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common)  	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */  	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */ +	andi.	r9,r11,MSR_PR	// Check for exception from userspace +	cmpdi	cr4,r9,MSR_PR	// And save the result in CR4 for later +  	/*  	 * Test MSR_RI before calling slb_allocate_realmode, because the  	 * MSR in r11 gets clobbered. However we still want to allocate @@ -624,9 +627,12 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)  	/* All done -- return from exception. */ +	bne	cr4,1f		/* returning to kernel */ +  .machine	push  .machine	"power4"  	mtcrf	0x80,r9 +	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */  	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */  	mtcrf	0x02,r9		/* I/D indication is in cr6 */  	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */ @@ -640,9 +646,30 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)  	ld	r11,PACA_EXSLB+EX_R11(r13)  	ld	r12,PACA_EXSLB+EX_R12(r13)  	ld	r13,PACA_EXSLB+EX_R13(r13) -	rfid +	RFI_TO_USER +	b	.	
/* prevent speculative execution */ +1: +.machine	push +.machine	"power4" +	mtcrf	0x80,r9 +	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */ +	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */ +	mtcrf	0x02,r9		/* I/D indication is in cr6 */ +	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */ +.machine	pop + +	RESTORE_CTR(r9, PACA_EXSLB) +	RESTORE_PPR_PACA(PACA_EXSLB, r9) +	mr	r3,r12 +	ld	r9,PACA_EXSLB+EX_R9(r13) +	ld	r10,PACA_EXSLB+EX_R10(r13) +	ld	r11,PACA_EXSLB+EX_R11(r13) +	ld	r12,PACA_EXSLB+EX_R12(r13) +	ld	r13,PACA_EXSLB+EX_R13(r13) +	RFI_TO_KERNEL  	b	.	/* prevent speculative execution */ +  2:	std     r3,PACA_EXSLB+EX_DAR(r13)  	mr	r3,r12  	mfspr	r11,SPRN_SRR0 @@ -651,7 +678,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)  	mtspr	SPRN_SRR0,r10  	ld	r10,PACAKMSR(r13)  	mtspr	SPRN_SRR1,r10 -	rfid +	RFI_TO_KERNEL  	b	.  8:	std     r3,PACA_EXSLB+EX_DAR(r13) @@ -662,7 +689,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)  	mtspr	SPRN_SRR0,r10  	ld	r10,PACAKMSR(r13)  	mtspr	SPRN_SRR1,r10 -	rfid +	RFI_TO_KERNEL  	b	.  EXC_COMMON_BEGIN(unrecov_slb) @@ -901,7 +928,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)  	mtspr	SPRN_SRR0,r10 ; 				\  	ld	r10,PACAKMSR(r13) ;				\  	mtspr	SPRN_SRR1,r10 ; 				\ -	rfid ; 							\ +	RFI_TO_KERNEL ;						\  	b	. ;	/* prevent speculative execution */  #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH @@ -917,7 +944,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\  	xori	r12,r12,MSR_LE ;				\  	mtspr	SPRN_SRR1,r12 ;					\  	mr	r13,r9 ;					\ -	rfid ;		/* return to userspace */		\ +	RFI_TO_USER ;	/* return to userspace */		\  	b	. ;	/* prevent speculative execution */  #else  #define SYSCALL_FASTENDIAN_TEST @@ -1063,7 +1090,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)  	mtcr	r11  	REST_GPR(11, r1)  	ld	r1,GPR1(r1) -	hrfid +	HRFI_TO_USER_OR_KERNEL  1:	mtcr	r11  	REST_GPR(11, r1) @@ -1314,7 +1341,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)  	ld	r11,PACA_EXGEN+EX_R11(r13)  	ld	r12,PACA_EXGEN+EX_R12(r13)  	ld	r13,PACA_EXGEN+EX_R13(r13) -	HRFID +	HRFI_TO_UNKNOWN  	b	.  #endif @@ -1418,10 +1445,94 @@ masked_##_H##interrupt:					\  	ld	r10,PACA_EXGEN+EX_R10(r13);		\  	ld	r11,PACA_EXGEN+EX_R11(r13);		\  	/* returns to kernel where r13 must be set up, so don't restore it */ \ -	##_H##rfid;					\ +	##_H##RFI_TO_KERNEL;				\  	b	.;					\  	MASKED_DEC_HANDLER(_H) +TRAMP_REAL_BEGIN(rfi_flush_fallback) +	SET_SCRATCH0(r13); +	GET_PACA(r13); +	std	r9,PACA_EXRFI+EX_R9(r13) +	std	r10,PACA_EXRFI+EX_R10(r13) +	std	r11,PACA_EXRFI+EX_R11(r13) +	std	r12,PACA_EXRFI+EX_R12(r13) +	std	r8,PACA_EXRFI+EX_R13(r13) +	mfctr	r9 +	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) +	ld	r11,PACA_L1D_FLUSH_SETS(r13) +	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13) +	/* +	 * The load adresses are at staggered offsets within cachelines, +	 * which suits some pipelines better (on others it should not +	 * hurt). 
+	 */ +	addi	r12,r12,8 +	mtctr	r11 +	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + +	/* order ld/st prior to dcbt stop all streams with flushing */ +	sync +1:	li	r8,0 +	.rept	8 /* 8-way set associative */ +	ldx	r11,r10,r8 +	add	r8,r8,r12 +	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not +	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx +	.endr +	addi	r10,r10,128 /* 128 byte cache line */ +	bdnz	1b + +	mtctr	r9 +	ld	r9,PACA_EXRFI+EX_R9(r13) +	ld	r10,PACA_EXRFI+EX_R10(r13) +	ld	r11,PACA_EXRFI+EX_R11(r13) +	ld	r12,PACA_EXRFI+EX_R12(r13) +	ld	r8,PACA_EXRFI+EX_R13(r13) +	GET_SCRATCH0(r13); +	rfid + +TRAMP_REAL_BEGIN(hrfi_flush_fallback) +	SET_SCRATCH0(r13); +	GET_PACA(r13); +	std	r9,PACA_EXRFI+EX_R9(r13) +	std	r10,PACA_EXRFI+EX_R10(r13) +	std	r11,PACA_EXRFI+EX_R11(r13) +	std	r12,PACA_EXRFI+EX_R12(r13) +	std	r8,PACA_EXRFI+EX_R13(r13) +	mfctr	r9 +	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) +	ld	r11,PACA_L1D_FLUSH_SETS(r13) +	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13) +	/* +	 * The load adresses are at staggered offsets within cachelines, +	 * which suits some pipelines better (on others it should not +	 * hurt). +	 */ +	addi	r12,r12,8 +	mtctr	r11 +	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + +	/* order ld/st prior to dcbt stop all streams with flushing */ +	sync +1:	li	r8,0 +	.rept	8 /* 8-way set associative */ +	ldx	r11,r10,r8 +	add	r8,r8,r12 +	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not +	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx +	.endr +	addi	r10,r10,128 /* 128 byte cache line */ +	bdnz	1b + +	mtctr	r9 +	ld	r9,PACA_EXRFI+EX_R9(r13) +	ld	r10,PACA_EXRFI+EX_R10(r13) +	ld	r11,PACA_EXRFI+EX_R11(r13) +	ld	r12,PACA_EXRFI+EX_R12(r13) +	ld	r8,PACA_EXRFI+EX_R13(r13) +	GET_SCRATCH0(r13); +	hrfid +  /*   * Real mode exceptions actually use this too, but alternate   * instruction code patches (which end up in the common .text area) @@ -1441,7 +1552,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)  	addi	r13, r13, 4  	mtspr	SPRN_SRR0, r13  	GET_SCRATCH0(r13) -	rfid +	RFI_TO_KERNEL  	b	.  TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) @@ -1453,7 +1564,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)  	addi	r13, r13, 4  	mtspr	SPRN_HSRR0, r13  	GET_SCRATCH0(r13) -	hrfid +	HRFI_TO_KERNEL  	b	.  #endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 04ea5c04fd24..3c2c2688918f 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1462,25 +1462,6 @@ static void fadump_init_files(void)  	return;  } -static int fadump_panic_event(struct notifier_block *this, -			      unsigned long event, void *ptr) -{ -	/* -	 * If firmware-assisted dump has been registered then trigger -	 * firmware-assisted dump and let firmware handle everything -	 * else. If this returns, then fadump was not registered, so -	 * go through the rest of the panic path. -	 */ -	crash_fadump(NULL, ptr); - -	return NOTIFY_DONE; -} - -static struct notifier_block fadump_panic_block = { -	.notifier_call = fadump_panic_event, -	.priority = INT_MIN /* may not return; must be done last */ -}; -  /*   * Prepare for firmware-assisted dump.   
*/ @@ -1513,9 +1494,6 @@ int __init setup_fadump(void)  		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);  	fadump_init_files(); -	atomic_notifier_chain_register(&panic_notifier_list, -					&fadump_panic_block); -  	return 1;  }  subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 8ac0bd2bddb0..3280953a82cf 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -623,7 +623,9 @@ BEGIN_FTR_SECTION  	 * NOTE, we rely on r0 being 0 from above.  	 */  	mtspr	SPRN_IAMR,r0 +BEGIN_FTR_SECTION_NESTED(42)  	mtspr	SPRN_AMOR,r0 +END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)  	/* save regs for local vars on new stack. diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index bfdd783e3916..72be0c32e902 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)  	printk("NIP:  "REG" LR: "REG" CTR: "REG"\n",  	       regs->nip, regs->link, regs->ctr); -	printk("REGS: %p TRAP: %04lx   %s  (%s)\n", +	printk("REGS: %px TRAP: %04lx   %s  (%s)\n",  	       regs, regs->trap, print_tainted(), init_utsname()->release);  	printk("MSR:  "REG" ", regs->msr);  	print_msr_bits(regs->msr); @@ -1569,16 +1569,22 @@ void arch_release_task_struct(struct task_struct *t)   */  int set_thread_tidr(struct task_struct *t)  { +	int rc; +  	if (!cpu_has_feature(CPU_FTR_ARCH_300))  		return -EINVAL;  	if (t != current)  		return -EINVAL; -	t->thread.tidr = assign_thread_tidr(); -	if (t->thread.tidr < 0) -		return t->thread.tidr; +	if (t->thread.tidr) +		return 0; + +	rc = assign_thread_tidr(); +	if (rc < 0) +		return rc; +	t->thread.tidr = rc;  	mtspr(SPRN_TIDR, t->thread.tidr);  	return 0; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2075322cd225..9d213542a48b 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -704,6 +704,30 @@ int check_legacy_ioport(unsigned long base_port)  }  EXPORT_SYMBOL(check_legacy_ioport); +static int ppc_panic_event(struct notifier_block *this, +                             unsigned long event, void *ptr) +{ +	/* +	 * If firmware-assisted dump has been registered then trigger +	 * firmware-assisted dump and let firmware handle everything else. +	 */ +	crash_fadump(NULL, ptr); +	ppc_md.panic(ptr);  /* May not return */ +	return NOTIFY_DONE; +} + +static struct notifier_block ppc_panic_block = { +	.notifier_call = ppc_panic_event, +	.priority = INT_MIN /* may not return; must be done last */ +}; + +void __init setup_panic(void) +{ +	if (!ppc_md.panic) +		return; +	atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); +} +  #ifdef CONFIG_CHECK_CACHE_COHERENCY  /*   * For platforms that have configurable cache-coherency.  This function @@ -848,6 +872,9 @@ void __init setup_arch(char **cmdline_p)  	/* Probe the machine type, establish ppc_md. */  	probe_machine(); +	/* Setup panic notifier if requested by the platform. */ +	setup_panic(); +  	/*  	 * Configure ppc_md.power_save (ppc32 only, 64-bit machines do  	 * it from their respective probe() function. 
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 8956a9856604..491be4179ddd 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -801,3 +801,104 @@ static int __init disable_hardlockup_detector(void)  	return 0;  }  early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +static bool no_rfi_flush; +bool rfi_flush; + +static int __init handle_no_rfi_flush(char *p) +{ +	pr_info("rfi-flush: disabled on command line."); +	no_rfi_flush = true; +	return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ +	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); +	handle_no_rfi_flush(NULL); +	return 0; +} +early_param("nopti", handle_no_pti); + +static void do_nothing(void *unused) +{ +	/* +	 * We don't need to do the flush explicitly, just enter+exit kernel is +	 * sufficient, the RFI exit handlers will do the right thing. +	 */ +} + +void rfi_flush_enable(bool enable) +{ +	if (rfi_flush == enable) +		return; + +	if (enable) { +		do_rfi_flush_fixups(enabled_flush_types); +		on_each_cpu(do_nothing, NULL, 1); +	} else +		do_rfi_flush_fixups(L1D_FLUSH_NONE); + +	rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ +	u64 l1d_size, limit; +	int cpu; + +	l1d_size = ppc64_caches.l1d.size; +	limit = min(safe_stack_limit(), ppc64_rma_size); + +	/* +	 * Align to L1d size, and size it at 2x L1d size, to catch possible +	 * hardware prefetch runoff. We don't have a recipe for load patterns to +	 * reliably avoid the prefetcher. +	 */ +	l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); +	memset(l1d_flush_fallback_area, 0, l1d_size * 2); + +	for_each_possible_cpu(cpu) { +		/* +		 * The fallback flush is currently coded for 8-way +		 * associativity. Different associativity is possible, but it +		 * will be treated as 8-way and may not evict the lines as +		 * effectively. +		 * +		 * 128 byte lines are mandatory. +		 */ +		u64 c = l1d_size / 8; + +		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; +		paca[cpu].l1d_flush_congruence = c; +		paca[cpu].l1d_flush_sets = c / 128; +	} +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ +	if (types & L1D_FLUSH_FALLBACK) { +		pr_info("rfi-flush: Using fallback displacement flush\n"); +		init_fallback_flush(); +	} + +	if (types & L1D_FLUSH_ORI) +		pr_info("rfi-flush: Using ori type flush\n"); + +	if (types & L1D_FLUSH_MTTRIG) +		pr_info("rfi-flush: Using mttrig type flush\n"); + +	enabled_flush_types = types; + +	if (!no_rfi_flush) +		rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0494e1566ee2..307843d23682 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -132,6 +132,15 @@ SECTIONS  	/* Read-only data */  	RO_DATA(PAGE_SIZE) +#ifdef CONFIG_PPC64 +	. 
= ALIGN(8); +	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { +		__start___rfi_flush_fixup = .; +		*(__rfi_flush_fixup) +		__stop___rfi_flush_fixup = .; +	} +#endif +  	EXCEPTION_TABLE(0)  	NOTES :kernel :notes diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 29ebe2fd5867..a93d719edc90 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,  		gpte->may_read = true;  		gpte->may_write = true;  		gpte->page_size = MMU_PAGE_4K; +		gpte->wimg = HPTE_R_M;  		return 0;  	} diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 235319c2574e..b73dbc9e797d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -65,11 +65,17 @@ struct kvm_resize_hpt {  	u32 order;  	/* These fields protected by kvm->lock */ + +	/* Possible values and their usage: +	 *  <0     an error occurred during allocation, +	 *  -EBUSY allocation is in the progress, +	 *  0      allocation made successfuly. +	 */  	int error; -	bool prepare_done; -	/* Private to the work thread, until prepare_done is true, -	 * then protected by kvm->resize_hpt_sem */ +	/* Private to the work thread, until error != -EBUSY, +	 * then protected by kvm->lock. +	 */  	struct kvm_hpt_info hpt;  }; @@ -159,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)  		 * Reset all the reverse-mapping chains for all memslots  		 */  		kvmppc_rmap_reset(kvm); -		/* Ensure that each vcpu will flush its TLB on next entry. */ -		cpumask_setall(&kvm->arch.need_tlb_flush);  		err = 0;  		goto out;  	} @@ -176,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)  	kvmppc_set_hpt(kvm, &info);  out: +	if (err == 0) +		/* Ensure that each vcpu will flush its TLB on next entry. 
*/ +		cpumask_setall(&kvm->arch.need_tlb_flush); +  	mutex_unlock(&kvm->lock);  	return err;  } @@ -1238,8 +1246,9 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,  	unsigned long vpte, rpte, guest_rpte;  	int ret;  	struct revmap_entry *rev; -	unsigned long apsize, psize, avpn, pteg, hash; +	unsigned long apsize, avpn, pteg, hash;  	unsigned long new_idx, new_pteg, replace_vpte; +	int pshift;  	hptep = (__be64 *)(old->virt + (idx << 4)); @@ -1298,8 +1307,8 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,  		goto out;  	rpte = be64_to_cpu(hptep[1]); -	psize = hpte_base_page_size(vpte, rpte); -	avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); +	pshift = kvmppc_hpte_base_page_shift(vpte, rpte); +	avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);  	pteg = idx / HPTES_PER_GROUP;  	if (vpte & HPTE_V_SECONDARY)  		pteg = ~pteg; @@ -1311,20 +1320,20 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,  		offset = (avpn & 0x1f) << 23;  		vsid = avpn >> 5;  		/* We can find more bits from the pteg value */ -		if (psize < (1ULL << 23)) -			offset |= ((vsid ^ pteg) & old_hash_mask) * psize; +		if (pshift < 23) +			offset |= ((vsid ^ pteg) & old_hash_mask) << pshift; -		hash = vsid ^ (offset / psize); +		hash = vsid ^ (offset >> pshift);  	} else {  		unsigned long offset, vsid;  		/* We only have 40 - 23 bits of seg_off in avpn */  		offset = (avpn & 0x1ffff) << 23;  		vsid = avpn >> 17; -		if (psize < (1ULL << 23)) -			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; +		if (pshift < 23) +			offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift; -		hash = vsid ^ (vsid << 25) ^ (offset / psize); +		hash = vsid ^ (vsid << 25) ^ (offset >> pshift);  	}  	new_pteg = hash & new_hash_mask; @@ -1412,16 +1421,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)  static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)  { -	BUG_ON(kvm->arch.resize_hpt != resize); +	if (WARN_ON(!mutex_is_locked(&kvm->lock))) +		return;  	if (!resize)  		return; -	if (resize->hpt.virt) -		kvmppc_free_hpt(&resize->hpt); +	if (resize->error != -EBUSY) { +		if (resize->hpt.virt) +			kvmppc_free_hpt(&resize->hpt); +		kfree(resize); +	} -	kvm->arch.resize_hpt = NULL; -	kfree(resize); +	if (kvm->arch.resize_hpt == resize) +		kvm->arch.resize_hpt = NULL;  }  static void resize_hpt_prepare_work(struct work_struct *work) @@ -1430,17 +1443,41 @@ static void resize_hpt_prepare_work(struct work_struct *work)  						     struct kvm_resize_hpt,  						     work);  	struct kvm *kvm = resize->kvm; -	int err; +	int err = 0; -	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", -			 resize->order); - -	err = resize_hpt_allocate(resize); +	if (WARN_ON(resize->error != -EBUSY)) +		return;  	mutex_lock(&kvm->lock); +	/* Request is still current? */ +	if (kvm->arch.resize_hpt == resize) { +		/* We may request large allocations here: +		 * do not sleep with kvm->lock held for a while. +		 */ +		mutex_unlock(&kvm->lock); + +		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", +				 resize->order); + +		err = resize_hpt_allocate(resize); + +		/* We have strict assumption about -EBUSY +		 * when preparing for HPT resize. +		 */ +		if (WARN_ON(err == -EBUSY)) +			err = -EINPROGRESS; + +		mutex_lock(&kvm->lock); +		/* It is possible that kvm->arch.resize_hpt != resize +		 * after we grab kvm->lock again. 
+		 */ +	} +  	resize->error = err; -	resize->prepare_done = true; + +	if (kvm->arch.resize_hpt != resize) +		resize_hpt_release(kvm, resize);  	mutex_unlock(&kvm->lock);  } @@ -1465,14 +1502,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,  	if (resize) {  		if (resize->order == shift) { -			/* Suitable resize in progress */ -			if (resize->prepare_done) { -				ret = resize->error; -				if (ret != 0) -					resize_hpt_release(kvm, resize); -			} else { +			/* Suitable resize in progress? */ +			ret = resize->error; +			if (ret == -EBUSY)  				ret = 100; /* estimated time in ms */ -			} +			else if (ret) +				resize_hpt_release(kvm, resize);  			goto out;  		} @@ -1492,6 +1527,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,  		ret = -ENOMEM;  		goto out;  	} + +	resize->error = -EBUSY;  	resize->order = shift;  	resize->kvm = kvm;  	INIT_WORK(&resize->work, resize_hpt_prepare_work); @@ -1546,16 +1583,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,  	if (!resize || (resize->order != shift))  		goto out; -	ret = -EBUSY; -	if (!resize->prepare_done) -		goto out; -  	ret = resize->error; -	if (ret != 0) +	if (ret)  		goto out;  	ret = resize_hpt_rehash(resize); -	if (ret != 0) +	if (ret)  		goto out;  	resize_hpt_pivot(resize); @@ -1801,6 +1834,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,  	ssize_t nb;  	long int err, ret;  	int mmu_ready; +	int pshift;  	if (!access_ok(VERIFY_READ, buf, count))  		return -EFAULT; @@ -1855,6 +1889,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,  			err = -EINVAL;  			if (!(v & HPTE_V_VALID))  				goto out; +			pshift = kvmppc_hpte_base_page_shift(v, r); +			if (pshift <= 0) +				goto out;  			lbuf += 2;  			nb += HPTE_SIZE; @@ -1869,14 +1906,18 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,  				goto out;  			}  			if (!mmu_ready && is_vrma_hpte(v)) { -				unsigned long psize = hpte_base_page_size(v, r); -				unsigned long senc = slb_pgsize_encoding(psize); -				unsigned long lpcr; +				unsigned long senc, lpcr; +				senc = slb_pgsize_encoding(1ul << pshift);  				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |  					(VRMA_VSID << SLB_VSID_SHIFT_1T); -				lpcr = senc << (LPCR_VRMASD_SH - 4); -				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); +				if (!cpu_has_feature(CPU_FTR_ARCH_300)) { +					lpcr = senc << (LPCR_VRMASD_SH - 4); +					kvmppc_update_lpcr(kvm, lpcr, +							   LPCR_VRMASD); +				} else { +					kvmppc_setup_partition_table(kvm); +				}  				mmu_ready = 1;  			}  			++i; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 79ea3d9269db..2d46037ce936 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -120,7 +120,6 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); -static void kvmppc_setup_partition_table(struct kvm *kvm);  static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,  		int *ip) @@ -3574,7 +3573,7 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)  	return;  } -static void kvmppc_setup_partition_table(struct kvm *kvm) +void kvmppc_setup_partition_table(struct kvm *kvm)  {  	unsigned long dw0, dw1; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..9c61f736c75b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)  	mtmsrd	r0,1		/* clear RI in MSR */  	mtsrr0	r5  	mtsrr1	r6 -	RFI +	RFI_TO_KERNEL  kvmppc_call_hv_entry:  BEGIN_FTR_SECTION @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)  	mtmsrd	r6, 1			/* Clear RI in MSR */  	mtsrr0	r8  	mtsrr1	r7 -	RFI +	RFI_TO_KERNEL  	/* Virtual-mode return */  .Lvirt_return: @@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)  	ld	r0, VCPU_GPR(R0)(r4)  	ld	r4, VCPU_GPR(R4)(r4) - -	hrfid +	HRFI_TO_GUEST  	b	.  secondary_too_late: @@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)  	ld	r4, PACAKMSR(r13)  	mtspr	SPRN_SRR0, r3  	mtspr	SPRN_SRR1, r4 -	rfid +	RFI_TO_KERNEL  9:	addi	r3, r1, STACK_FRAME_OVERHEAD  	bl	kvmppc_bad_interrupt  	b	9b diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d0dc8624198f..7deaeeb14b93 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);  #define MSR_USER32 MSR_USER  #define MSR_USER64 MSR_USER  #define HW_PAGE_SIZE PAGE_SIZE +#define HPTE_R_M   _PAGE_COHERENT  #endif  static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) @@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,  		pte.eaddr = eaddr;  		pte.vpage = eaddr >> 12;  		pte.page_size = MMU_PAGE_64K; +		pte.wimg = HPTE_R_M;  	}  	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 42a4b237df5f..34a5adeff084 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@  #define FUNC(name)		name +#define RFI_TO_KERNEL	RFI +#define RFI_TO_GUEST	RFI +  .macro INTERRUPT_TRAMPOLINE intno  .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins:  	GET_SCRATCH0(r13)  	/* And get back into the code */ -	RFI +	RFI_TO_KERNEL  #endif  /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline)  	ori	r5, r5, MSR_EE  	mtsrr0	r7  	mtsrr1	r6 -	RFI +	RFI_TO_KERNEL  #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 2a2b96d53999..93a180ceefad 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on:  	PPC_LL	r9, SVCPU_R9(r3)  	PPC_LL	r3, (SVCPU_R3)(r3) -	RFI +	RFI_TO_GUEST  kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)  	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL  	beqa	BOOK3S_INTERRUPT_DOORBELL -	RFI +	RFI_TO_KERNEL  kvmppc_handler_trampoline_exit_end: diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e032..0d750d274c4e 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)  	/* Return the per-cpu state for state saving/migration */  	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | -	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; +	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT | +	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;  }  int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) @@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)  	/*  	 * Restore P and Q. If the interrupt was pending, we -	 * force both P and Q, which will trigger a resend. 
+	 * force Q and !P, which will trigger a resend.  	 *  	 * That means that a guest that had both an interrupt  	 * pending (queued) and Q set will restore with only @@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)  	 * is perfectly fine as coalescing interrupts that haven't  	 * been presented yet is always allowed.  	 */ -	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) +	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))  		state->old_p = true;  	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)  		state->old_q = true; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b6c53c42ac9..1915e86cef6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1407,7 +1407,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  {  	int r; -	sigset_t sigsaved;  	if (vcpu->mmio_needed) {  		vcpu->mmio_needed = 0; @@ -1448,16 +1447,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)  #endif  	} -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); +	kvm_sigset_activate(vcpu);  	if (run->immediate_exit)  		r = -EINTR;  	else  		r = kvmppc_vcpu_run(run, vcpu); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &sigsaved, NULL); +	kvm_sigset_deactivate(vcpu);  	return r;  } diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 41cf5ae273cf..a95ea007d654 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)  	}  } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ +	unsigned int instrs[3], *dest; +	long *start, *end; +	int i; + +	start = PTRRELOC(&__start___rfi_flush_fixup), +	end = PTRRELOC(&__stop___rfi_flush_fixup); + +	instrs[0] = 0x60000000; /* nop */ +	instrs[1] = 0x60000000; /* nop */ +	instrs[2] = 0x60000000; /* nop */ + +	if (types & L1D_FLUSH_FALLBACK) +		/* b .+16 to fallback flush */ +		instrs[0] = 0x48000010; + +	i = 0; +	if (types & L1D_FLUSH_ORI) { +		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ +		instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ +	} + +	if (types & L1D_FLUSH_MTTRIG) +		instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + +	for (i = 0; start < end; start++, i++) { +		dest = (void *)start + *start; + +		pr_devel("patching dest %lx\n", (unsigned long)dest); + +		patch_instruction(dest, instrs[0]); +		patch_instruction(dest + 1, instrs[1]); +		patch_instruction(dest + 2, instrs[2]); +	} + +	printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ +  void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)  {  	long *start, *end; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4797d08581ce..6e1e39035380 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)  	return __bad_area(regs, address, SEGV_MAPERR);  } +static noinline int bad_access(struct pt_regs *regs, unsigned long address) +{ +	return __bad_area(regs, address, SEGV_ACCERR); +} +  static int do_sigbus(struct pt_regs *regs, unsigned long address,  		     unsigned int fault)  { @@ -490,7 +495,7 @@ retry:  good_area:  	if 
(unlikely(access_error(is_write, is_exec, vma))) -		return bad_area(regs, address); +		return bad_access(regs, address);  	/*  	 * If for any reason at all we couldn't handle the fault, diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 3848af167df9..640cf566e986 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -47,7 +47,8 @@  DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +static inline unsigned long  ___tlbie(unsigned long vpn, int psize, +						int apsize, int ssize)  {  	unsigned long va;  	unsigned int penc; @@ -100,7 +101,15 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)  			     : "memory");  		break;  	} -	trace_tlbie(0, 0, va, 0, 0, 0, 0); +	return va; +} + +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ +	unsigned long rb; + +	rb = ___tlbie(vpn, psize, apsize, ssize); +	trace_tlbie(0, 0, rb, 0, 0, 0, 0);  }  static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) @@ -652,7 +661,7 @@ static void native_hpte_clear(void)  		if (hpte_v & HPTE_V_VALID) {  			hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);  			hptep->v = 0; -			__tlbie(vpn, psize, apsize, ssize); +			___tlbie(vpn, psize, apsize, ssize);  		}  	} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 46d74e81aff1..d183b4801bdb 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -763,7 +763,8 @@ emit_clear:  			func = (u8 *) __bpf_call_base + imm;  			/* Save skb pointer if we need to re-cache skb data */ -			if (bpf_helper_changes_pkt_data(func)) +			if ((ctx->seen & SEEN_SKB) && +			    bpf_helper_changes_pkt_data(func))  				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));  			bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -772,7 +773,8 @@ emit_clear:  			PPC_MR(b2p[BPF_REG_0], 3);  			/* refresh skb cache */ -			if (bpf_helper_changes_pkt_data(func)) { +			if ((ctx->seen & SEEN_SKB) && +			    bpf_helper_changes_pkt_data(func)) {  				/* reload skb pointer to r3 */  				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));  				bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 9e3da168d54c..fce545774d50 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)  	int ret;  	__u64 target; -	if (is_kernel_addr(addr)) -		return branch_target((unsigned int *)addr); +	if (is_kernel_addr(addr)) { +		if (probe_kernel_read(&instr, (void *)addr, sizeof(instr))) +			return 0; + +		return branch_target(&instr); +	}  	/* Userspace: need copy instruction here then translate it */  	pagefault_disable(); @@ -1415,7 +1419,7 @@ static int collect_events(struct perf_event *group, int max_count,  	int n = 0;  	struct perf_event *event; -	if (!is_software_event(group)) { +	if (group->pmu->task_ctx_nr == perf_hw_context) {  		if (n >= max_count)  			return -1;  		ctrs[n] = group; @@ -1423,7 +1427,7 @@ static int collect_events(struct perf_event *group, int max_count,  		events[n++] = group->hw.config;  	}  	list_for_each_entry(event, &group->sibling_list, group_entry) { -		if (!is_software_event(event) && +		if (event->pmu->task_ctx_nr == perf_hw_context &&  		    event->state != PERF_EVENT_STATE_OFF) {  			if (n >= max_count)  				return -1; diff --git 
a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 0ead3cd73caa..be4e7f84f70a 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -310,6 +310,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)  		return 0;  	/* +	 * Check whether nest_imc is registered. We could end up here if the +	 * cpuhotplug callback registration fails. i.e, callback invokes the +	 * offline path for all successfully registered nodes. At this stage, +	 * nest_imc pmu will not be registered and we should return here. +	 * +	 * We return with a zero since this is not an offline failure. And +	 * cpuhp_setup_state() returns the actual failure reason to the caller, +	 * which in turn will call the cleanup routine. +	 */ +	if (!nest_pmus) +		return 0; + +	/*  	 * Now that this cpu is one of the designated,  	 * find a next cpu a) which is online and b) in same chip.  	 */ @@ -1171,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)  		if (nest_pmus == 1) {  			cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);  			kfree(nest_imc_refc); +			kfree(per_nest_pmu_arr);  		}  		if (nest_pmus > 0) @@ -1195,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)  		kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);  	kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);  	kfree(pmu_ptr); -	kfree(per_nest_pmu_arr);  	return;  } @@ -1309,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id  			ret = nest_pmu_cpumask_init();  			if (ret) {  				mutex_unlock(&nest_init_lock); +				kfree(nest_imc_refc); +				kfree(per_nest_pmu_arr);  				goto err_free;  			}  		} diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1edfbc1e40f4..4fb21e17504a 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -37,13 +37,62 @@  #include <asm/kexec.h>  #include <asm/smp.h>  #include <asm/tm.h> +#include <asm/setup.h>  #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ +	struct device_node *np, *fw_features; +	enum l1d_flush_type type; +	int enable; + +	/* Default to fallback in case fw-features are not available */ +	type = L1D_FLUSH_FALLBACK; +	enable = 1; + +	np = of_find_node_by_name(NULL, "ibm,opal"); +	fw_features = of_get_child_by_name(np, "fw-features"); +	of_node_put(np); + +	if (fw_features) { +		np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); +		if (np && of_property_read_bool(np, "enabled")) +			type = L1D_FLUSH_MTTRIG; + +		of_node_put(np); + +		np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); +		if (np && of_property_read_bool(np, "enabled")) +			type = L1D_FLUSH_ORI; + +		of_node_put(np); + +		/* Enable unless firmware says NOT to */ +		enable = 2; +		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); +		if (np && of_property_read_bool(np, "disabled")) +			enable--; + +		of_node_put(np); + +		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); +		if (np && of_property_read_bool(np, "disabled")) +			enable--; + +		of_node_put(np); +		of_node_put(fw_features); +	} + +	setup_rfi_flush(type, enable > 0); +} +  static void __init pnv_setup_arch(void)  {  	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); +	pnv_setup_rfi_flush(); +  	/* Initialize SMP */  	pnv_smp_init(); diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 9dabea6e1443..6244bc849469 100644 --- a/arch/powerpc/platforms/ps3/setup.c 
+++ b/arch/powerpc/platforms/ps3/setup.c @@ -104,6 +104,20 @@ static void __noreturn ps3_halt(void)  	ps3_sys_manager_halt(); /* never returns */  } +static void ps3_panic(char *str) +{ +	DBG("%s:%d %s\n", __func__, __LINE__, str); + +	smp_send_stop(); +	printk("\n"); +	printk("   System does not reboot automatically.\n"); +	printk("   Please press POWER button.\n"); +	printk("\n"); + +	while(1) +		lv1_pause(1); +} +  #if defined(CONFIG_FB_PS3) || defined(CONFIG_FB_PS3_MODULE) || \      defined(CONFIG_PS3_FLASH) || defined(CONFIG_PS3_FLASH_MODULE)  static void __init prealloc(struct ps3_prealloc *p) @@ -255,6 +269,7 @@ define_machine(ps3) {  	.probe				= ps3_probe,  	.setup_arch			= ps3_setup_arch,  	.init_IRQ			= ps3_init_IRQ, +	.panic				= ps3_panic,  	.get_boot_time			= ps3_get_boot_time,  	.set_dabr			= ps3_set_dabr,  	.calibrate_decr			= ps3_calibrate_decr, diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 6e35780c5962..a0b20c03f078 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,  static CLASS_ATTR_RW(dlpar); -static int __init pseries_dlpar_init(void) +int __init dlpar_workqueue_init(void)  { +	if (pseries_hp_wq) +		return 0; +  	pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", -					WQ_UNBOUND, 1); +			WQ_UNBOUND, 1); + +	return pseries_hp_wq ? 0 : -ENOMEM; +} + +static int __init dlpar_sysfs_init(void) +{ +	int rc; + +	rc = dlpar_workqueue_init(); +	if (rc) +		return rc; +  	return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);  } -machine_device_initcall(pseries, pseries_dlpar_init); +machine_device_initcall(pseries, dlpar_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 4470a3194311..1ae1d9f4dbe9 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void)  	return CMO_PageSize;  } +int dlpar_workqueue_init(void); +  #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 4923ffe230cf..81d8614e7379 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void)  	/* Hotplug Events */  	np = of_find_node_by_path("/event-sources/hot-plug-events");  	if (np != NULL) { -		request_event_sources_irqs(np, ras_hotplug_interrupt, +		if (dlpar_workqueue_init() == 0) +			request_event_sources_irqs(np, ras_hotplug_interrupt,  					   "RAS_HOTPLUG");  		of_node_put(np);  	} diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 5f1beb8367ac..ae4f596273b5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void)  	of_pci_check_probe_only();  } +static void pseries_setup_rfi_flush(void) +{ +	struct h_cpu_char_result result; +	enum l1d_flush_type types; +	bool enable; +	long rc; + +	/* Enable by default */ +	enable = true; + +	rc = plpar_get_cpu_characteristics(&result); +	if (rc == H_SUCCESS) { +		types = L1D_FLUSH_NONE; + +		if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) +			types |= L1D_FLUSH_MTTRIG; +		if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) +			types |= L1D_FLUSH_ORI; + +		/* Use fallback if 
nothing set in hcall */ +		if (types == L1D_FLUSH_NONE) +			types = L1D_FLUSH_FALLBACK; + +		if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) +			enable = false; +	} else { +		/* Default to fallback if case hcall is not available */ +		types = L1D_FLUSH_FALLBACK; +	} + +	setup_rfi_flush(types, enable); +} +  static void __init pSeries_setup_arch(void)  {  	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void)  	fwnmi_init(); +	pseries_setup_rfi_flush(); +  	/* By default, only probe PCI (can be overridden by rtas_pci) */  	pci_add_flags(PCI_PROBE_ONLY); @@ -726,6 +761,7 @@ define_machine(pseries) {  	.pcibios_fixup		= pSeries_final_fixup,  	.restart		= rtas_restart,  	.halt			= rtas_halt, +	.panic			= rtas_os_term,  	.get_boot_time		= rtas_get_boot_time,  	.get_rtc_time		= rtas_get_rtc_time,  	.set_rtc_time		= rtas_set_rtc_time, diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 44cbf4c12ea1..df95102e732c 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)  }  static struct lock_class_key fsl_msi_irq_class; +static struct lock_class_key fsl_msi_irq_request_class;  static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,  			       int offset, int irq_index) @@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,  		dev_err(&dev->dev, "No memory for MSI cascade data\n");  		return -ENOMEM;  	} -	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class); +	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class, +			      &fsl_msi_irq_request_class);  	cascade_data->index = offset;  	cascade_data->msi_data = msi;  	cascade_data->virq = virt_msir; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 1b2d8cb49abb..cab24f549e7c 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1590,7 +1590,7 @@ static void print_bug_trap(struct pt_regs *regs)  	printf("kernel BUG at %s:%u!\n",  	       bug->file, bug->line);  #else -	printf("kernel BUG at %p!\n", (void *)bug->bug_addr); +	printf("kernel BUG at %px!\n", (void *)bug->bug_addr);  #endif  #endif /* CONFIG_BUG */  } @@ -2329,7 +2329,7 @@ static void dump_one_paca(int cpu)  	p = &paca[cpu]; -	printf("paca for cpu 0x%x @ %p:\n", cpu, p); +	printf("paca for cpu 0x%x @ %px:\n", cpu, p);  	printf(" %-*s = %s\n", 20, "possible", cpu_possible(cpu) ? "yes" : "no");  	printf(" %-*s = %s\n", 20, "present", cpu_present(cpu) ? "yes" : "no"); @@ -2945,7 +2945,7 @@ static void show_task(struct task_struct *tsk)  		(tsk->exit_state & EXIT_DEAD) ? 'E' :  		(tsk->state & TASK_INTERRUPTIBLE) ? 
'S' : '?'; -	printf("%p %016lx %6d %6d %c %2d %s\n", tsk, +	printf("%px %016lx %6d %6d %c %2d %s\n", tsk,  		tsk->thread.ksp,  		tsk->pid, tsk->parent->pid,  		state, task_thread_info(tsk)->cpu, @@ -2988,7 +2988,7 @@ static void show_pte(unsigned long addr)  	if (setjmp(bus_error_jmp) != 0) {  		catch_memory_errors = 0; -		printf("*** Error dumping pte for task %p\n", tsk); +		printf("*** Error dumping pte for task %px\n", tsk);  		return;  	} @@ -3074,7 +3074,7 @@ static void show_tasks(void)  	if (setjmp(bus_error_jmp) != 0) {  		catch_memory_errors = 0; -		printf("*** Error dumping task %p\n", tsk); +		printf("*** Error dumping task %px\n", tsk);  		return;  	} diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index e69de29bb2d1..47dacf06c679 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -0,0 +1,75 @@ +CONFIG_SMP=y +CONFIG_PCI=y +CONFIG_PCIE_XILINX=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BPF_SYSCALL=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_RAS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +# CONFIG_RCU_TRACE is not set +CONFIG_CRYPTO_USER_API_HASH=y diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 18158be62a2b..970460a0b492 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -40,6 +40,7 @@ generic-y += resource.h  generic-y += scatterlist.h  generic-y += sections.h  generic-y += sembuf.h +generic-y += serial.h  generic-y += setup.h  generic-y += shmbuf.h  generic-y += shmparam.h diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 6cbbb6a68d76..5ad4cb622bed 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -58,17 +58,17 @@  #endif  #if (__SIZEOF_INT__ == 4) -#define INT		__ASM_STR(.word) -#define SZINT		__ASM_STR(4) -#define LGINT		__ASM_STR(2) +#define RISCV_INT		__ASM_STR(.word) +#define RISCV_SZINT		__ASM_STR(4) +#define RISCV_LGINT		__ASM_STR(2)  #else  #error "Unexpected __SIZEOF_INT__"  #endif  #if (__SIZEOF_SHORT__ == 2) -#define SHORT		__ASM_STR(.half) -#define SZSHORT		__ASM_STR(2) -#define LGSHORT		__ASM_STR(1) +#define RISCV_SHORT		__ASM_STR(.half) +#define RISCV_SZSHORT		
__ASM_STR(2) +#define RISCV_LGSHORT		__ASM_STR(1)  #else  #error "Unexpected __SIZEOF_SHORT__"  #endif diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index e2e37c57cbeb..e65d1cd89e28 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -50,30 +50,30 @@ static __always_inline void atomic64_set(atomic64_t *v, long i)   * have the AQ or RL bits set.  These don't return anything, so there's only   * one version to worry about.   */ -#define ATOMIC_OP(op, asm_op, c_op, I, asm_type, c_type, prefix)				\ -static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\ -{												\ -	__asm__ __volatile__ (									\ -		"amo" #asm_op "." #asm_type " zero, %1, %0"					\ -		: "+A" (v->counter)								\ -		: "r" (I)									\ -		: "memory");									\ +#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)				\ +static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)	\ +{											\ +	__asm__ __volatile__ (								\ +		"amo" #asm_op "." #asm_type " zero, %1, %0"				\ +		: "+A" (v->counter)							\ +		: "r" (I)								\ +		: "memory");								\  }  #ifdef CONFIG_GENERIC_ATOMIC64 -#define ATOMIC_OPS(op, asm_op, c_op, I)			\ -        ATOMIC_OP (op, asm_op, c_op, I, w,  int,   ) +#define ATOMIC_OPS(op, asm_op, I)			\ +        ATOMIC_OP (op, asm_op, I, w,  int,   )  #else -#define ATOMIC_OPS(op, asm_op, c_op, I)			\ -        ATOMIC_OP (op, asm_op, c_op, I, w,  int,   )	\ -        ATOMIC_OP (op, asm_op, c_op, I, d, long, 64) +#define ATOMIC_OPS(op, asm_op, I)			\ +        ATOMIC_OP (op, asm_op, I, w,  int,   )	\ +        ATOMIC_OP (op, asm_op, I, d, long, 64)  #endif -ATOMIC_OPS(add, add, +,  i) -ATOMIC_OPS(sub, add, +, -i) -ATOMIC_OPS(and, and, &,  i) -ATOMIC_OPS( or,  or, |,  i) -ATOMIC_OPS(xor, xor, ^,  i) +ATOMIC_OPS(add, add,  i) +ATOMIC_OPS(sub, add, -i) +ATOMIC_OPS(and, and,  i) +ATOMIC_OPS( or,  or,  i) +ATOMIC_OPS(xor, xor,  i)  #undef ATOMIC_OP  #undef ATOMIC_OPS @@ -83,7 +83,7 @@ ATOMIC_OPS(xor, xor, ^,  i)   * There's two flavors of these: the arithmatic ops have both fetch and return   * versions, while the logical ops only have fetch versions.   
*/ -#define ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, prefix)			\ +#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, prefix)				\  static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, atomic##prefix##_t *v)	\  {													\  	register c_type ret;										\ @@ -103,13 +103,13 @@ static __always_inline c_type atomic##prefix##_##op##_return##c_or(c_type i, ato  #ifdef CONFIG_GENERIC_ATOMIC64  #define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\ +        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\          ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )  #else  #define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\ +        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\          ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I, asm_or, c_or, d, long, 64)	\ +        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, d, long, 64)	\          ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)  #endif @@ -126,28 +126,28 @@ ATOMIC_OPS(sub, add, +, -i, .aqrl,         )  #undef ATOMIC_OPS  #ifdef CONFIG_GENERIC_ATOMIC64 -#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\ -        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w,  int,   ) +#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\ +        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )  #else -#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\ -        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )		\ -        ATOMIC_FETCH_OP(op, asm_op, c_op, I, asm_or, c_or, d, long, 64) +#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\ +        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )	\ +        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)  #endif -ATOMIC_OPS(and, and, &,  i,      , _relaxed) -ATOMIC_OPS(and, and, &,  i, .aq  , _acquire) -ATOMIC_OPS(and, and, &,  i, .rl  , _release) -ATOMIC_OPS(and, and, &,  i, .aqrl,         ) +ATOMIC_OPS(and, and, i,      , _relaxed) +ATOMIC_OPS(and, and, i, .aq  , _acquire) +ATOMIC_OPS(and, and, i, .rl  , _release) +ATOMIC_OPS(and, and, i, .aqrl,         ) -ATOMIC_OPS( or,  or, |,  i,      , _relaxed) -ATOMIC_OPS( or,  or, |,  i, .aq  , _acquire) -ATOMIC_OPS( or,  or, |,  i, .rl  , _release) -ATOMIC_OPS( or,  or, |,  i, .aqrl,         ) +ATOMIC_OPS( or,  or, i,      , _relaxed) +ATOMIC_OPS( or,  or, i, .aq  , _acquire) +ATOMIC_OPS( or,  or, i, .rl  , _release) +ATOMIC_OPS( or,  or, i, .aqrl,         ) -ATOMIC_OPS(xor, xor, ^,  i,      , _relaxed) -ATOMIC_OPS(xor, xor, ^,  i, .aq  , _acquire) -ATOMIC_OPS(xor, xor, ^,  i, .rl  , _release) -ATOMIC_OPS(xor, xor, ^,  i, .aqrl,         ) +ATOMIC_OPS(xor, xor, i,      , _relaxed) +ATOMIC_OPS(xor, xor, i, .aq  , _acquire) +ATOMIC_OPS(xor, xor, i, .rl  , _release) +ATOMIC_OPS(xor, xor, i, .aqrl,         )  #undef ATOMIC_OPS @@ -182,13 +182,13 @@ ATOMIC_OPS(add_negative, add,  <, 0)  #undef ATOMIC_OP  #undef ATOMIC_OPS -#define ATOMIC_OP(op, func_op, c_op, I, c_type, prefix)				\ +#define ATOMIC_OP(op, func_op, I, c_type, prefix)				\  static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v)	\  {										\  	atomic##prefix##_##func_op(I, v);					\  } -#define ATOMIC_FETCH_OP(op, func_op, c_op, I, c_type, prefix)				
\ +#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)					\  static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)	\  {											\  	return atomic##prefix##_fetch_##func_op(I, v);					\ @@ -202,16 +202,16 @@ static __always_inline c_type atomic##prefix##_##op##_return(atomic##prefix##_t  #ifdef CONFIG_GENERIC_ATOMIC64  #define ATOMIC_OPS(op, asm_op, c_op, I)						\ -        ATOMIC_OP       (op, asm_op, c_op, I,  int,   )				\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I,  int,   )				\ +        ATOMIC_OP       (op, asm_op,       I,  int,   )				\ +        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\          ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )  #else  #define ATOMIC_OPS(op, asm_op, c_op, I)						\ -        ATOMIC_OP       (op, asm_op, c_op, I,  int,   )				\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I,  int,   )				\ +        ATOMIC_OP       (op, asm_op,       I,  int,   )				\ +        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\          ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )				\ -        ATOMIC_OP       (op, asm_op, c_op, I, long, 64)				\ -        ATOMIC_FETCH_OP (op, asm_op, c_op, I, long, 64)				\ +        ATOMIC_OP       (op, asm_op,       I, long, 64)				\ +        ATOMIC_FETCH_OP (op, asm_op,       I, long, 64)				\          ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)  #endif @@ -300,8 +300,13 @@ static __always_inline long atomic64_inc_not_zero(atomic64_t *v)  /*   * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as - * {cmp,}xchg and the operations that return, so they need a barrier.  We just - * use the other implementations directly. + * {cmp,}xchg and the operations that return, so they need a barrier. + */ +/* + * FIXME: atomic_cmpxchg_{acquire,release,relaxed} are all implemented by + * assigning the same barrier to both the LR and SC operations, but that might + * not make any sense.  We're waiting on a memory model specification to + * determine exactly what the right thing to do is here.   */  #define ATOMIC_OP(c_t, prefix, c_or, size, asm_or)						\  static __always_inline c_t atomic##prefix##_cmpxchg##c_or(atomic##prefix##_t *v, c_t o, c_t n) 	\ diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 183534b7c39b..c0319cbf1eec 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -39,27 +39,23 @@  #define smp_wmb()	RISCV_FENCE(w,w)  /* - * These fences exist to enforce ordering around the relaxed AMOs.  The - * documentation defines that - * " - *     atomic_fetch_add(); - *   is equivalent to: - *     smp_mb__before_atomic(); - *     atomic_fetch_add_relaxed(); - *     smp_mb__after_atomic(); - * " - * So we emit full fences on both sides. - */ -#define __smb_mb__before_atomic()	smp_mb() -#define __smb_mb__after_atomic()	smp_mb() - -/* - * These barriers prevent accesses performed outside a spinlock from being moved - * inside a spinlock.  Since RISC-V sets the aq/rl bits on our spinlock only - * enforce release consistency, we need full fences here. + * This is a very specific barrier: it's currently only used in two places in + * the kernel, both in the scheduler.  See include/linux/spinlock.h for the two + * orderings it guarantees, but the "critical section is RCsc" guarantee + * mandates a barrier on RISC-V.  
The sequence looks like: + * + *    lr.aq lock + *    sc    lock <= LOCKED + *    smp_mb__after_spinlock() + *    // critical section + *    lr    lock + *    sc.rl lock <= UNLOCKED + * + * The AQ/RL pair provides a RCpc critical section, but there's not really any + * way we can take advantage of that here because the ordering is only enforced + * on that one lock.  Thus, we're just doing a full fence.   */ -#define smb_mb__before_spinlock()	smp_mb() -#define smb_mb__after_spinlock()	smp_mb() +#define smp_mb__after_spinlock()	RISCV_FENCE(rw,rw)  #include <asm-generic/barrier.h> diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 7c281ef1d583..f30daf26f08f 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -67,7 +67,7 @@  		: "memory");  #define __test_and_op_bit(op, mod, nr, addr) 			\ -	__test_and_op_bit_ord(op, mod, nr, addr, ) +	__test_and_op_bit_ord(op, mod, nr, addr, .aqrl)  #define __op_bit(op, mod, nr, addr)				\  	__op_bit_ord(op, mod, nr, addr, ) diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index c3e13764a943..bfc7f099ab1f 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -27,8 +27,8 @@  typedef u32 bug_insn_t;  #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS -#define __BUG_ENTRY_ADDR	INT " 1b - 2b" -#define __BUG_ENTRY_FILE	INT " %0 - 2b" +#define __BUG_ENTRY_ADDR	RISCV_INT " 1b - 2b" +#define __BUG_ENTRY_FILE	RISCV_INT " %0 - 2b"  #else  #define __BUG_ENTRY_ADDR	RISCV_PTR " 1b"  #define __BUG_ENTRY_FILE	RISCV_PTR " %0" @@ -38,7 +38,7 @@ typedef u32 bug_insn_t;  #define __BUG_ENTRY			\  	__BUG_ENTRY_ADDR "\n\t"		\  	__BUG_ENTRY_FILE "\n\t"		\ -	SHORT " %1" +	RISCV_SHORT " %1"  #else  #define __BUG_ENTRY			\  	__BUG_ENTRY_ADDR diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 0595585013b0..efd89a88d2d0 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -18,22 +18,44 @@  #undef flush_icache_range  #undef flush_icache_user_range +#undef flush_dcache_page  static inline void local_flush_icache_all(void)  {  	asm volatile ("fence.i" ::: "memory");  } +#define PG_dcache_clean PG_arch_1 + +static inline void flush_dcache_page(struct page *page) +{ +	if (test_bit(PG_dcache_clean, &page->flags)) +		clear_bit(PG_dcache_clean, &page->flags); +} + +/* + * RISC-V doesn't have an instruction to flush parts of the instruction cache, + * so instead we just flush the whole thing. + */ +#define flush_icache_range(start, end) flush_icache_all() +#define flush_icache_user_range(vma, pg, addr, len) flush_icache_all() +  #ifndef CONFIG_SMP -#define flush_icache_range(start, end) local_flush_icache_all() -#define flush_icache_user_range(vma, pg, addr, len) local_flush_icache_all() +#define flush_icache_all() local_flush_icache_all() +#define flush_icache_mm(mm, local) flush_icache_all()  #else /* CONFIG_SMP */ -#define flush_icache_range(start, end) sbi_remote_fence_i(0) -#define flush_icache_user_range(vma, pg, addr, len) sbi_remote_fence_i(0) +#define flush_icache_all() sbi_remote_fence_i(0) +void flush_icache_mm(struct mm_struct *mm, bool local);  #endif /* CONFIG_SMP */ +/* + * Bits in sys_riscv_flush_icache()'s flags argument. 
+ */ +#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL +#define SYS_RISCV_FLUSH_ICACHE_ALL   (SYS_RISCV_FLUSH_ICACHE_LOCAL) +  #endif /* _ASM_RISCV_CACHEFLUSH_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0d64bc9f4f91..3c7a2c97e377 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -17,10 +17,10 @@  #include <linux/const.h>  /* Status register flags */ -#define SR_IE   _AC(0x00000002, UL) /* Interrupt Enable */ -#define SR_PIE  _AC(0x00000020, UL) /* Previous IE */ -#define SR_PS   _AC(0x00000100, UL) /* Previously Supervisor */ -#define SR_SUM  _AC(0x00040000, UL) /* Supervisor may access User Memory */ +#define SR_SIE	_AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_SPIE	_AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_SPP	_AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_SUM	_AC(0x00040000, UL) /* Supervisor may access User Memory */  #define SR_FS           _AC(0x00006000, UL) /* Floating-point Status */  #define SR_FS_OFF       _AC(0x00000000, UL) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index c1f32cfcc79b..b269451e7e85 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -19,7 +19,7 @@  #ifndef _ASM_RISCV_IO_H  #define _ASM_RISCV_IO_H -#ifdef CONFIG_MMU +#include <linux/types.h>  extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); @@ -32,9 +32,7 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);  #define ioremap_wc(addr, size) ioremap((addr), (size))  #define ioremap_wt(addr, size) ioremap((addr), (size)) -extern void iounmap(void __iomem *addr); - -#endif /* CONFIG_MMU */ +extern void iounmap(volatile void __iomem *addr);  /* Generic IO read/write.  These perform native-endian accesses. 
*/  #define __raw_writeb __raw_writeb @@ -250,7 +248,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)  			const ctype *buf = buffer;				\  										\  			do {							\ -				__raw_writeq(*buf++, addr);			\ +				__raw_write ## len(*buf++, addr);		\  			} while (--count);					\  		}								\  		afence;								\ @@ -266,9 +264,9 @@ __io_reads_ins(reads, u32, l, __io_br(), __io_ar())  __io_reads_ins(ins,  u8, b, __io_pbr(), __io_par())  __io_reads_ins(ins, u16, w, __io_pbr(), __io_par())  __io_reads_ins(ins, u32, l, __io_pbr(), __io_par()) -#define insb(addr, buffer, count) __insb((void __iomem *)addr, buffer, count) -#define insw(addr, buffer, count) __insw((void __iomem *)addr, buffer, count) -#define insl(addr, buffer, count) __insl((void __iomem *)addr, buffer, count) +#define insb(addr, buffer, count) __insb((void __iomem *)(long)addr, buffer, count) +#define insw(addr, buffer, count) __insw((void __iomem *)(long)addr, buffer, count) +#define insl(addr, buffer, count) __insl((void __iomem *)(long)addr, buffer, count)  __io_writes_outs(writes,  u8, b, __io_bw(), __io_aw())  __io_writes_outs(writes, u16, w, __io_bw(), __io_aw()) @@ -280,9 +278,9 @@ __io_writes_outs(writes, u32, l, __io_bw(), __io_aw())  __io_writes_outs(outs,  u8, b, __io_pbw(), __io_paw())  __io_writes_outs(outs, u16, w, __io_pbw(), __io_paw())  __io_writes_outs(outs, u32, l, __io_pbw(), __io_paw()) -#define outsb(addr, buffer, count) __outsb((void __iomem *)addr, buffer, count) -#define outsw(addr, buffer, count) __outsw((void __iomem *)addr, buffer, count) -#define outsl(addr, buffer, count) __outsl((void __iomem *)addr, buffer, count) +#define outsb(addr, buffer, count) __outsb((void __iomem *)(long)addr, buffer, count) +#define outsw(addr, buffer, count) __outsw((void __iomem *)(long)addr, buffer, count) +#define outsl(addr, buffer, count) __outsl((void __iomem *)(long)addr, buffer, count)  #ifdef CONFIG_64BIT  __io_reads_ins(reads, u64, q, __io_br(), __io_ar()) diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index 6fdc860d7f84..07a3c6d5706f 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -27,25 +27,25 @@ static inline unsigned long arch_local_save_flags(void)  /* unconditionally enable interrupts */  static inline void arch_local_irq_enable(void)  { -	csr_set(sstatus, SR_IE); +	csr_set(sstatus, SR_SIE);  }  /* unconditionally disable interrupts */  static inline void arch_local_irq_disable(void)  { -	csr_clear(sstatus, SR_IE); +	csr_clear(sstatus, SR_SIE);  }  /* get status and disable interrupts */  static inline unsigned long arch_local_irq_save(void)  { -	return csr_read_clear(sstatus, SR_IE); +	return csr_read_clear(sstatus, SR_SIE);  }  /* test flags */  static inline int arch_irqs_disabled_flags(unsigned long flags)  { -	return !(flags & SR_IE); +	return !(flags & SR_SIE);  }  /* test hardware interrupt enable bit */ @@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void)  /* set interrupt enabled status */  static inline void arch_local_irq_restore(unsigned long flags)  { -	csr_set(sstatus, flags & SR_IE); +	csr_set(sstatus, flags & SR_SIE);  }  #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 66805cba9a27..5df2dccdba12 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -19,6 +19,10 @@  typedef struct {  	void *vdso; +#ifdef CONFIG_SMP +	/* A local icache flush is needed before user execution can resume. 
*/ +	cpumask_t icache_stale_mask; +#endif  } mm_context_t;  #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h index de1fc1631fc4..97424834dce2 100644 --- a/arch/riscv/include/asm/mmu_context.h +++ b/arch/riscv/include/asm/mmu_context.h @@ -1,5 +1,6 @@  /*   * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive   *   *   This program is free software; you can redistribute it and/or   *   modify it under the terms of the GNU General Public License @@ -14,11 +15,13 @@  #ifndef _ASM_RISCV_MMU_CONTEXT_H  #define _ASM_RISCV_MMU_CONTEXT_H +#include <linux/mm_types.h>  #include <asm-generic/mm_hooks.h>  #include <linux/mm.h>  #include <linux/sched.h>  #include <asm/tlbflush.h> +#include <asm/cacheflush.h>  static inline void enter_lazy_tlb(struct mm_struct *mm,  	struct task_struct *task) @@ -46,12 +49,54 @@ static inline void set_pgdir(pgd_t *pgd)  	csr_write(sptbr, virt_to_pfn(pgd) | SPTBR_MODE);  } +/* + * When necessary, performs a deferred icache flush for the given MM context, + * on the local CPU.  RISC-V has no direct mechanism for instruction cache + * shoot downs, so instead we send an IPI that informs the remote harts they + * need to flush their local instruction caches.  To avoid pathologically slow + * behavior in a common case (a bunch of single-hart processes on a many-hart + * machine, ie 'make -j') we avoid the IPIs for harts that are not currently + * executing a MM context and instead schedule a deferred local instruction + * cache flush to be performed before execution resumes on each hart.  This + * actually performs that local instruction cache flush, which implicitly only + * refers to the current hart. + */ +static inline void flush_icache_deferred(struct mm_struct *mm) +{ +#ifdef CONFIG_SMP +	unsigned int cpu = smp_processor_id(); +	cpumask_t *mask = &mm->context.icache_stale_mask; + +	if (cpumask_test_cpu(cpu, mask)) { +		cpumask_clear_cpu(cpu, mask); +		/* +		 * Ensure the remote hart's writes are visible to this hart. +		 * This pairs with a barrier in flush_icache_mm. +		 */ +		smp_mb(); +		local_flush_icache_all(); +	} +#endif +} +  static inline void switch_mm(struct mm_struct *prev,  	struct mm_struct *next, struct task_struct *task)  {  	if (likely(prev != next)) { +		/* +		 * Mark the current MM context as inactive, and the next as +		 * active.  This is at least used by the icache flushing +		 * routines in order to determine who should +		 */ +		unsigned int cpu = smp_processor_id(); + +		cpumask_clear_cpu(cpu, mm_cpumask(prev)); +		cpumask_set_cpu(cpu, mm_cpumask(next)); +  		set_pgdir(next->pgd);  		local_flush_tlb_all(); + +		flush_icache_deferred(next);  	}  } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 3399257780b2..16301966d65b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -20,8 +20,6 @@  #ifndef __ASSEMBLY__ -#ifdef CONFIG_MMU -  /* Page Upper Directory not used in RISC-V */  #include <asm-generic/pgtable-nopud.h>  #include <asm/page.h> @@ -178,28 +176,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long addr)  #define pte_offset_map(dir, addr)	pte_offset_kernel((dir), (addr))  #define pte_unmap(pte)			((void)(pte)) -/* - * Certain architectures need to do special things when PTEs within - * a page table are directly modified.  Thus, the following hook is - * made available. 
- */ -static inline void set_pte(pte_t *ptep, pte_t pteval) -{ -	*ptep = pteval; -} - -static inline void set_pte_at(struct mm_struct *mm, -	unsigned long addr, pte_t *ptep, pte_t pteval) -{ -	set_pte(ptep, pteval); -} - -static inline void pte_clear(struct mm_struct *mm, -	unsigned long addr, pte_t *ptep) -{ -	set_pte_at(mm, addr, ptep, __pte(0)); -} -  static inline int pte_present(pte_t pte)  {  	return (pte_val(pte) & _PAGE_PRESENT); @@ -210,21 +186,22 @@ static inline int pte_none(pte_t pte)  	return (pte_val(pte) == 0);  } -/* static inline int pte_read(pte_t pte) */ -  static inline int pte_write(pte_t pte)  {  	return pte_val(pte) & _PAGE_WRITE;  } +static inline int pte_exec(pte_t pte) +{ +	return pte_val(pte) & _PAGE_EXEC; +} +  static inline int pte_huge(pte_t pte)  {  	return pte_present(pte)  		&& (pte_val(pte) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));  } -/* static inline int pte_exec(pte_t pte) */ -  static inline int pte_dirty(pte_t pte)  {  	return pte_val(pte) & _PAGE_DIRTY; @@ -311,6 +288,33 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)  	return pte_val(pte_a) == pte_val(pte_b);  } +/* + * Certain architectures need to do special things when PTEs within + * a page table are directly modified.  Thus, the following hook is + * made available. + */ +static inline void set_pte(pte_t *ptep, pte_t pteval) +{ +	*ptep = pteval; +} + +void flush_icache_pte(pte_t pte); + +static inline void set_pte_at(struct mm_struct *mm, +	unsigned long addr, pte_t *ptep, pte_t pteval) +{ +	if (pte_present(pteval) && pte_exec(pteval)) +		flush_icache_pte(pteval); + +	set_pte(ptep, pteval); +} + +static inline void pte_clear(struct mm_struct *mm, +	unsigned long addr, pte_t *ptep) +{ +	set_pte_at(mm, addr, ptep, __pte(0)); +} +  #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS  static inline int ptep_set_access_flags(struct vm_area_struct *vma,  					unsigned long address, pte_t *ptep, @@ -407,8 +411,6 @@ static inline void pgtable_cache_init(void)  	/* No page table caches to initialize */  } -#endif /* CONFIG_MMU */ -  #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)  #define VMALLOC_END      (PAGE_OFFSET - 1)  #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 93b8956e25e4..2c5df945d43c 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -66,7 +66,7 @@ struct pt_regs {  #define REG_FMT "%08lx"  #endif -#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0) +#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0)  /* Helpers for working with the instruction pointer */ diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h index 04c71d938afd..2fd27e8ef1fd 100644 --- a/arch/riscv/include/asm/spinlock.h +++ b/arch/riscv/include/asm/spinlock.h @@ -24,7 +24,7 @@  /* FIXME: Replace this with a ticket lock, like MIPS. 
*/ -#define arch_spin_is_locked(x)	((x)->lock != 0) +#define arch_spin_is_locked(x)	(READ_ONCE((x)->lock) != 0)  static inline void arch_spin_unlock(arch_spinlock_t *lock)  { @@ -58,15 +58,6 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)  	}  } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ -	smp_rmb(); -	do { -		cpu_relax(); -	} while (arch_spin_is_locked(lock)); -	smp_acquire__after_ctrl_dep(); -} -  /***********************************************************/  static inline void arch_read_lock(arch_rwlock_t *lock) diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h index 3df4932d8964..2f26989cb864 100644 --- a/arch/riscv/include/asm/timex.h +++ b/arch/riscv/include/asm/timex.h @@ -18,7 +18,7 @@  typedef unsigned long cycles_t; -static inline cycles_t get_cycles(void) +static inline cycles_t get_cycles_inline(void)  {  	cycles_t n; @@ -27,6 +27,7 @@ static inline cycles_t get_cycles(void)  		: "=r" (n));  	return n;  } +#define get_cycles get_cycles_inline  #ifdef CONFIG_64BIT  static inline uint64_t get_cycles64(void) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 5ee4ae370b5e..7b9c24ebdf52 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -15,9 +15,12 @@  #ifndef _ASM_RISCV_TLBFLUSH_H  #define _ASM_RISCV_TLBFLUSH_H -#ifdef CONFIG_MMU +#include <linux/mm_types.h> -/* Flush entire local TLB */ +/* + * Flush entire local TLB.  'sfence.vma' implicitly fences with the instruction + * cache as well, so a 'fence.i' is not necessary. + */  static inline void local_flush_tlb_all(void)  {  	__asm__ __volatile__ ("sfence.vma" : : : "memory"); @@ -59,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,  	flush_tlb_all();  } -#endif /* CONFIG_MMU */ -  #endif /* _ASM_RISCV_TLBFLUSH_H */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 27b90d64814b..14b0b22fb578 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state);   * call.   
*/ -#ifdef CONFIG_MMU  #define __get_user_asm(insn, x, ptr, err)			\  do {								\  	uintptr_t __tmp;					\ @@ -153,13 +152,11 @@ do {								\  	__disable_user_access();				\  	(x) = __x;						\  } while (0) -#endif /* CONFIG_MMU */  #ifdef CONFIG_64BIT  #define __get_user_8(x, ptr, err) \  	__get_user_asm("ld", x, ptr, err)  #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU  #define __get_user_8(x, ptr, err)				\  do {								\  	u32 __user *__ptr = (u32 __user *)(ptr);		\ @@ -193,7 +190,6 @@ do {								\  	(x) = (__typeof__(x))((__typeof__((x)-(x)))(		\  		(((u64)__hi << 32) | __lo)));			\  } while (0) -#endif /* CONFIG_MMU */  #endif /* CONFIG_64BIT */ @@ -267,8 +263,6 @@ do {								\  		((x) = 0, -EFAULT);				\  }) - -#ifdef CONFIG_MMU  #define __put_user_asm(insn, x, ptr, err)			\  do {								\  	uintptr_t __tmp;					\ @@ -292,14 +286,11 @@ do {								\  		: "rJ" (__x), "i" (-EFAULT));			\  	__disable_user_access();				\  } while (0) -#endif /* CONFIG_MMU */ -  #ifdef CONFIG_64BIT  #define __put_user_8(x, ptr, err) \  	__put_user_asm("sd", x, ptr, err)  #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU  #define __put_user_8(x, ptr, err)				\  do {								\  	u32 __user *__ptr = (u32 __user *)(ptr);		\ @@ -329,7 +320,6 @@ do {								\  		: "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT));	\  	__disable_user_access();				\  } while (0) -#endif /* CONFIG_MMU */  #endif /* CONFIG_64BIT */ @@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)   * will set "err" to -EFAULT, while successful accesses return the previous   * value.   */ -#ifdef CONFIG_MMU  #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb)	\  ({								\  	__typeof__(ptr) __ptr = (ptr);				\ @@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)  	(err) = __err;						\  	__ret;							\  }) -#endif /* CONFIG_MMU */  #endif /* _ASM_RISCV_UACCESS_H */ diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 9f250ed007cd..2f704a5c4196 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -14,3 +14,4 @@  #define __ARCH_HAVE_MMU  #define __ARCH_WANT_SYS_CLONE  #include <uapi/asm/unistd.h> +#include <uapi/asm/syscalls.h> diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 602f61257553..541544d64c33 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -38,4 +38,8 @@ struct vdso_data {  	(void __user *)((unsigned long)(base) + __vdso_##name);			\  }) +#ifdef CONFIG_SMP +asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); +#endif +  #endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/Kbuild b/arch/riscv/include/uapi/asm/Kbuild index 5ded96b06352..7e91f4850475 100644 --- a/arch/riscv/include/uapi/asm/Kbuild +++ b/arch/riscv/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += setup.h  generic-y += unistd.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/riscv/include/uapi/asm/syscalls.h b/arch/riscv/include/uapi/asm/syscalls.h new file mode 100644 index 000000000000..818655b0d535 --- /dev/null +++ b/arch/riscv/include/uapi/asm/syscalls.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 SiFive + */ + +#ifndef _ASM__UAPI__SYSCALLS_H +#define _ASM__UAPI__SYSCALLS_H + +/* + * Allows the instruction cache to be flushed from userspace.  
Despite RISC-V + * having a direct 'fence.i' instruction available to userspace (which we + * can't trap!), that's not actually viable when running on Linux because the + * kernel might schedule a process on another hart.  There is no way for + * userspace to handle this without invoking the kernel (as it doesn't know the + * thread->hart mappings), so we've defined a RISC-V specific system call to + * flush the instruction cache. + * + * __NR_riscv_flush_icache is defined to flush the instruction cache over an + * address range, with the flush applying to either all threads or just the + * caller.  We don't currently do anything with the address range, that's just + * in there for forwards compatibility. + */ +#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) +__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) + +#endif diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 20ee86f782a9..7404ec222406 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -196,7 +196,7 @@ handle_syscall:  	addi s2, s2, 0x4  	REG_S s2, PT_SEPC(sp)  	/* System calls run with interrupts enabled */ -	csrs sstatus, SR_IE +	csrs sstatus, SR_SIE  	/* Trace syscalls, but only if requested by the user. */  	REG_L t0, TASK_TI_FLAGS(tp)  	andi t0, t0, _TIF_SYSCALL_TRACE @@ -224,8 +224,8 @@ ret_from_syscall:  ret_from_exception:  	REG_L s0, PT_SSTATUS(sp) -	csrc sstatus, SR_IE -	andi s0, s0, SR_PS +	csrc sstatus, SR_SIE +	andi s0, s0, SR_SPP  	bnez s0, restore_all  resume_userspace: @@ -255,7 +255,7 @@ work_pending:  	bnez s1, work_resched  work_notifysig:  	/* Handle pending signals and notify-resume requests */ -	csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */ +	csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */  	move a0, sp /* pt_regs */  	move a1, s0 /* current_thread_info->flags */  	tail do_notify_resume diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 76af908f87c1..78f670d70133 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -152,6 +152,3 @@ END(_start)  __PAGE_ALIGNED_BSS  	/* Empty zero page */  	.balign PAGE_SIZE -ENTRY(empty_zero_page) -	.fill (empty_zero_page + PAGE_SIZE) - ., 1, 0x00 -END(empty_zero_page) diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 0d90dcc1fbd3..d74d4adf2d54 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs)  void start_thread(struct pt_regs *regs, unsigned long pc,  	unsigned long sp)  { -	regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL; +	regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL;  	regs->sepc = pc;  	regs->sp = sp;  	set_fs(USER_DS); @@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,  		const register unsigned long gp __asm__ ("gp");  		memset(childregs, 0, sizeof(struct pt_regs));  		childregs->gp = gp; -		childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */ +		childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */  		p->thread.ra = (unsigned long)ret_from_kernel_thread;  		p->thread.s[0] = usp; /* fn */ diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c index 23cc81ec9e94..551734248748 100644 --- a/arch/riscv/kernel/riscv_ksyms.c +++ b/arch/riscv/kernel/riscv_ksyms.c @@ -12,4 +12,7 @@  /*   * Assembly functions that may be used (directly or indirectly) by modules   */ +EXPORT_SYMBOL(__clear_user);  
EXPORT_SYMBOL(__copy_user); +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memcpy); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index de7db114c315..cb7b0c63014e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -38,10 +38,6 @@  #include <asm/tlbflush.h>  #include <asm/thread_info.h> -#ifdef CONFIG_HVC_RISCV_SBI -#include <asm/hvc_riscv_sbi.h> -#endif -  #ifdef CONFIG_DUMMY_CONSOLE  struct screen_info screen_info = {  	.orig_video_lines	= 30, @@ -58,7 +54,12 @@ static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;  #endif /* CONFIG_CMDLINE_BOOL */  unsigned long va_pa_offset; +EXPORT_SYMBOL(va_pa_offset);  unsigned long pfn_base; +EXPORT_SYMBOL(pfn_base); + +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page);  /* The lucky hart to first increment this variable will boot the other cores */  atomic_t hart_lottery; @@ -207,13 +208,6 @@ static void __init setup_bootmem(void)  void __init setup_arch(char **cmdline_p)  { -#if defined(CONFIG_HVC_RISCV_SBI) -	if (likely(early_console == NULL)) { -		early_console = &riscv_sbi_early_console_dev; -		register_console(early_console); -	} -#endif -  #ifdef CONFIG_CMDLINE_BOOL  #ifdef CONFIG_CMDLINE_OVERRIDE  	strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index b4a71ec5906f..6d3962435720 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -38,6 +38,13 @@ enum ipi_message_type {  	IPI_MAX  }; + +/* Unsupported */ +int setup_profiling_timer(unsigned int multiplier) +{ +	return -EINVAL; +} +  irqreturn_t handle_ipi(void)  {  	unsigned long *pending_ipis = &ipi_data[smp_processor_id()].bits; @@ -108,3 +115,51 @@ void smp_send_reschedule(int cpu)  {  	send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);  } + +/* + * Performs an icache flush for the given MM context.  RISC-V has no direct + * mechanism for instruction cache shoot downs, so instead we send an IPI that + * informs the remote harts they need to flush their local instruction caches. + * To avoid pathologically slow behavior in a common case (a bunch of + * single-hart processes on a many-hart machine, ie 'make -j') we avoid the + * IPIs for harts that are not currently executing a MM context and instead + * schedule a deferred local instruction cache flush to be performed before + * execution resumes on each hart. + */ +void flush_icache_mm(struct mm_struct *mm, bool local) +{ +	unsigned int cpu; +	cpumask_t others, *mask; + +	preempt_disable(); + +	/* Mark every hart's icache as needing a flush for this MM. */ +	mask = &mm->context.icache_stale_mask; +	cpumask_setall(mask); +	/* Flush this hart's I$ now, and mark it as flushed. */ +	cpu = smp_processor_id(); +	cpumask_clear_cpu(cpu, mask); +	local_flush_icache_all(); + +	/* +	 * Flush the I$ of other harts concurrently executing, and mark them as +	 * flushed. +	 */ +	cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); +	local |= cpumask_empty(&others); +	if (mm != current->active_mm || !local) +		sbi_remote_fence_i(others.bits); +	else { +		/* +		 * It's assumed that at least one strongly ordered operation is +		 * performed on this hart between setting a hart's cpumask bit +		 * and scheduling this MM context on that hart.  Sending an SBI +		 * remote message will do this, but in the case where no +		 * messages are sent we still need to order this hart's writes +		 * with flush_icache_deferred(). 
+		 */ +		smp_mb(); +	} + +	preempt_enable(); +} diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 4351be7d0533..79c78668258e 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -14,8 +14,8 @@   */  #include <linux/syscalls.h> -#include <asm/cmpxchg.h>  #include <asm/unistd.h> +#include <asm/cacheflush.h>  static long riscv_sys_mmap(unsigned long addr, unsigned long len,  			   unsigned long prot, unsigned long flags, @@ -47,3 +47,34 @@ SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len,  	return riscv_sys_mmap(addr, len, prot, flags, fd, offset, 12);  }  #endif /* !CONFIG_64BIT */ + +#ifdef CONFIG_SMP +/* + * Allows the instruction cache to be flushed from userspace.  Despite RISC-V + * having a direct 'fence.i' instruction available to userspace (which we + * can't trap!), that's not actually viable when running on Linux because the + * kernel might schedule a process on another hart.  There is no way for + * userspace to handle this without invoking the kernel (as it doesn't know the + * thread->hart mappings), so we've defined a RISC-V specific system call to + * flush the instruction cache. + * + * sys_riscv_flush_icache() is defined to flush the instruction cache over an + * address range, with the flush applying to either all threads or just the + * caller.  We don't currently do anything with the address range, that's just + * in there for forwards compatibility. + */ +SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, +	uintptr_t, flags) +{ +	struct mm_struct *mm = current->mm; +	bool local = (flags & SYS_RISCV_FLUSH_ICACHE_LOCAL) != 0; + +	/* Check the reserved flags. */ +	if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL)) +		return -EINVAL; + +	flush_icache_mm(mm, local); + +	return 0; +} +#endif diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index 4e30dc5fb593..ade52b903a43 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -15,6 +15,7 @@  #include <linux/linkage.h>  #include <linux/syscalls.h>  #include <asm-generic/syscalls.h> +#include <asm/vdso.h>  #undef __SYSCALL  #define __SYSCALL(nr, call)	[nr] = (call), diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 523d0a8ac8db..324568d33921 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -1,7 +1,12 @@  # Copied from arch/tile/kernel/vdso/Makefile  # Symbols present in the vdso -vdso-syms = rt_sigreturn +vdso-syms  = rt_sigreturn +vdso-syms += gettimeofday +vdso-syms += clock_gettime +vdso-syms += clock_getres +vdso-syms += getcpu +vdso-syms += flush_icache  # Files to link into the vdso  obj-vdso = $(patsubst %, %.o, $(vdso-syms)) diff --git a/arch/riscv/kernel/vdso/clock_getres.S b/arch/riscv/kernel/vdso/clock_getres.S new file mode 100644 index 000000000000..edf7e2339648 --- /dev/null +++ b/arch/riscv/kernel/vdso/clock_getres.S @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. 
+ */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +	.text +/* int __vdso_clock_getres(clockid_t clock_id, struct timespec *res); */ +ENTRY(__vdso_clock_getres) +	.cfi_startproc +	/* For now, just do the syscall. */ +	li a7, __NR_clock_getres +	ecall +	ret +	.cfi_endproc +ENDPROC(__vdso_clock_getres) diff --git a/arch/riscv/kernel/vdso/clock_gettime.S b/arch/riscv/kernel/vdso/clock_gettime.S new file mode 100644 index 000000000000..aac65676c6d5 --- /dev/null +++ b/arch/riscv/kernel/vdso/clock_gettime.S @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. + */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +	.text +/* int __vdso_clock_gettime(clockid_t clock_id, struct timespec *tp); */ +ENTRY(__vdso_clock_gettime) +	.cfi_startproc +	/* For now, just do the syscall. */ +	li a7, __NR_clock_gettime +	ecall +	ret +	.cfi_endproc +ENDPROC(__vdso_clock_gettime) diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S new file mode 100644 index 000000000000..023e4d4aef58 --- /dev/null +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. + */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +	.text +/* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ +ENTRY(__vdso_flush_icache) +	.cfi_startproc +#ifdef CONFIG_SMP +	li a7, __NR_riscv_flush_icache +	ecall +#else +	fence.i +	li a0, 0 +#endif +	ret +	.cfi_endproc +ENDPROC(__vdso_flush_icache) diff --git a/arch/riscv/kernel/vdso/getcpu.S b/arch/riscv/kernel/vdso/getcpu.S new file mode 100644 index 000000000000..cc7e98924484 --- /dev/null +++ b/arch/riscv/kernel/vdso/getcpu.S @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. + */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +	.text +/* int __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused); */ +ENTRY(__vdso_getcpu) +	.cfi_startproc +	/* For now, just do the syscall. 
*/ +	li a7, __NR_getcpu +	ecall +	ret +	.cfi_endproc +ENDPROC(__vdso_getcpu) diff --git a/arch/riscv/kernel/vdso/gettimeofday.S b/arch/riscv/kernel/vdso/gettimeofday.S new file mode 100644 index 000000000000..da85d33e8990 --- /dev/null +++ b/arch/riscv/kernel/vdso/gettimeofday.S @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. + */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +	.text +/* int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); */ +ENTRY(__vdso_gettimeofday) +	.cfi_startproc +	/* For now, just do the syscall. */ +	li a7, __NR_gettimeofday +	ecall +	ret +	.cfi_endproc +ENDPROC(__vdso_gettimeofday) diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 8c9dce95c11d..cd1d47e0724b 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -70,8 +70,11 @@ VERSION  	LINUX_4.15 {  	global:  		__vdso_rt_sigreturn; -		__vdso_cmpxchg32; -		__vdso_cmpxchg64; +		__vdso_gettimeofday; +		__vdso_clock_gettime; +		__vdso_clock_getres; +		__vdso_getcpu; +		__vdso_flush_icache;  	local: *;  	};  } diff --git a/arch/riscv/lib/delay.c b/arch/riscv/lib/delay.c index 1cc4ac3964b4..dce8ae24c6d3 100644 --- a/arch/riscv/lib/delay.c +++ b/arch/riscv/lib/delay.c @@ -84,6 +84,7 @@ void __delay(unsigned long cycles)  	while ((unsigned long)(get_cycles() - t0) < cycles)  		cpu_relax();  } +EXPORT_SYMBOL(__delay);  void udelay(unsigned long usecs)  { diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 81f7d9ce6d88..eb22ab49b3e0 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -2,3 +2,4 @@ obj-y += init.o  obj-y += fault.o  obj-y += extable.o  obj-y += ioremap.o +obj-y += cacheflush.o diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c new file mode 100644 index 000000000000..498c0a0814fe --- /dev/null +++ b/arch/riscv/mm/cacheflush.c @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2017 SiFive + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, + *   but WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *   GNU General Public License for more details. + */ + +#include <asm/pgtable.h> +#include <asm/cacheflush.h> + +void flush_icache_pte(pte_t pte) +{ +	struct page *page = pte_page(pte); + +	if (!test_and_set_bit(PG_dcache_clean, &page->flags)) +		flush_icache_all(); +} diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index df2ca3c65048..0713f3c67ab4 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)  		goto vmalloc_fault;  	/* Enable interrupts if they were enabled in the parent context. 
*/ -	if (likely(regs->sstatus & SR_PIE)) +	if (likely(regs->sstatus & SR_SPIE))  		local_irq_enable();  	/* diff --git a/arch/riscv/mm/ioremap.c b/arch/riscv/mm/ioremap.c index e99194a4077e..70ef2724cdf6 100644 --- a/arch/riscv/mm/ioremap.c +++ b/arch/riscv/mm/ioremap.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(ioremap);   *   * Caller must ensure there is only one unmapping for the same pointer.   */ -void iounmap(void __iomem *addr) +void iounmap(volatile void __iomem *addr)  {  	vunmap((void *)((unsigned long)addr & PAGE_MASK));  } diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index eae2c64cf69d..9fdff3fe1a42 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y				+= kernel/  obj-y				+= mm/  obj-$(CONFIG_KVM)		+= kvm/ diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 6b3f41985f28..de54cfc6109d 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # s390/Makefile  # @@ -6,10 +7,6 @@  # for "archclean" and "archdep" for cleaning up and making dependencies for  # this architecture  # -# This file is subject to the terms and conditions of the GNU General Public -# License.  See the file "COPYING" in the main directory of this archive -# for more details. -#  # Copyright (C) 1994 by Linus Torvalds  # diff --git a/arch/s390/appldata/Makefile b/arch/s390/appldata/Makefile index 99f1cf071304..b06def4a4f2f 100644 --- a/arch/s390/appldata/Makefile +++ b/arch/s390/appldata/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Makefile for the Linux - z/VM Monitor Stream.  # diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index ef3fb1b9201f..cb6e8066b1ad 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Base infrastructure for Linux-z/VM Monitor Stream, Stage 1.   * Exports appldata_register_ops() and appldata_unregister_ops() for the diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index 598df5708501..e68136c3c23a 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Data gathering module for Linux-VM Monitor Stream, Stage 1.   * Collects data related to memory management. diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c index 66037d2622b4..8bc14b0d1def 100644 --- a/arch/s390/appldata/appldata_net_sum.c +++ b/arch/s390/appldata/appldata_net_sum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Data gathering module for Linux-VM Monitor Stream, Stage 1.   * Collects accumulated network statistics (Packets received/transmitted, diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 45b3178200ab..433a994b1a89 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Data gathering module for Linux-VM Monitor Stream, Stage 1.   * Collects misc. OS related data (CPU utilization, running processes). 
diff --git a/arch/s390/boot/compressed/vmlinux.scr b/arch/s390/boot/compressed/vmlinux.scr index f02382ae5c48..42a242597f34 100644 --- a/arch/s390/boot/compressed/vmlinux.scr +++ b/arch/s390/boot/compressed/vmlinux.scr @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  SECTIONS  {    .rodata.compressed : { diff --git a/arch/s390/boot/install.sh b/arch/s390/boot/install.sh index aed3069699bd..bed227f267ae 100644 --- a/arch/s390/boot/install.sh +++ b/arch/s390/boot/install.sh @@ -1,11 +1,8 @@  #!/bin/sh +# SPDX-License-Identifier: GPL-2.0  #  # arch/s390x/boot/install.sh  # -# This file is subject to the terms and conditions of the GNU General Public -# License.  See the file "COPYING" in the main directory of this archive -# for more details. -#  # Copyright (C) 1995 by Linus Torvalds  #  # Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index b48e20dd94e9..d60798737d86 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -11,12 +12,6 @@   *		Harald Freudenberger <freude@de.ibm.com>   *   * Derived from "crypto/aes_generic.c" - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #define KMSG_COMPONENT "aes_s390" diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index 36aefc07d10c..8720e9203ecf 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * s390 arch random implementation.   *   * Copyright IBM Corp. 2017   * Author(s): Harald Freudenberger <freude@de.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   */  #include <linux/kernel.h> diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 992e630c227b..436865926c26 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Crypto-API module for CRC-32 algorithms implemented with the   * z/Architecture Vector Extension Facility. diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index 0d296662bbf0..5346b5a80bb6 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -6,12 +7,6 @@   * Copyright IBM Corp. 2003, 2011   * Author(s): Thomas Spatzier   *	      Jan Glauber (jan.glauber@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - *   */  #include <linux/init.h> diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index 564616d48d8b..3b7f96c9eead 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Cryptographic API.   
* diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index a4e903ed7e21..003932db8d12 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Cryptographic API.   * @@ -7,11 +8,6 @@   *   Copyright IBM Corp. 2017   *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>   *		Harald Freudenberger <freude@de.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   */  #define KMSG_COMPONENT "paes_s390" diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 3e47c4a0f18b..a97a1802cfb4 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright IBM Corp. 2006, 2015   * Author(s): Jan Glauber <jan.glauber@de.ibm.com> diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index 10f200790079..d6f8258b44df 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * Cryptographic API.   * @@ -5,12 +6,6 @@   *   * Copyright IBM Corp. 2007   * Author(s): Jan Glauber (jang@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #ifndef _CRYPTO_ARCH_S390_SHA_H  #define _CRYPTO_ARCH_S390_SHA_H diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index c7de53d8da75..a00c17f761c1 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -16,12 +17,6 @@   *   Copyright (c) Alan Smithee.   *   Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>   *   Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #include <crypto/internal/hash.h>  #include <linux/init.h> diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index 53c277999a28..944aa6b237cd 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -6,12 +7,6 @@   * s390 Version:   *   Copyright IBM Corp. 2005, 2011   *   Author(s): Jan Glauber (jang@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #include <crypto/internal/hash.h>  #include <linux/init.h> diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index 2f4caa1ef123..b17eded532b1 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -5,12 +6,6 @@   *   * Copyright IBM Corp. 
2007   * Author(s): Jan Glauber (jang@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #include <crypto/internal/hash.h>  #include <crypto/sha.h> diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c index c740f77285b2..cf0718d121bc 100644 --- a/arch/s390/crypto/sha_common.c +++ b/arch/s390/crypto/sha_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   * Cryptographic API.   * @@ -5,12 +6,6 @@   *   * Copyright IBM Corp. 2007   * Author(s): Jan Glauber (jang@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - *   */  #include <crypto/internal/hash.h> diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile index 2ee25ba252d6..06f601509ce9 100644 --- a/arch/s390/hypfs/Makefile +++ b/arch/s390/hypfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Makefile for the linux hypfs filesystem routines.  # diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index cf8a2d92467f..43bbe63e2992 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -1,9 +1,9 @@ +// SPDX-License-Identifier: GPL-1.0+  /*   *    Hypervisor filesystem for Linux on s390.   *   *    Copyright IBM Corp. 2006, 2008   *    Author(s): Michael Holzheu <holzheu@de.ibm.com> - *    License: GPL   */  #define KMSG_COMPONENT "hypfs" diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 41c211a4d8b1..048450869328 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  generic-y += asm-offsets.h  generic-y += cacheflush.h  generic-y += clkdev.h diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index a72002056b54..c2cf7bcdef9b 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _ASM_S390_ALTERNATIVE_H  #define _ASM_S390_ALTERNATIVE_H diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index c02f4aba88a6..cfce6835b109 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Adjunct processor (AP) interfaces   *   * Copyright IBM Corp. 2017   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   * Author(s): Tony Krowiak <akrowia@linux.vnet.ibm.com>   *	      Martin Schwidefsky <schwidefsky@de.ibm.com>   *	      Harald Freudenberger <freude@de.ibm.com> diff --git a/arch/s390/include/asm/bugs.h b/arch/s390/include/asm/bugs.h index 0f5bd894f4dc..aa42a179be33 100644 --- a/arch/s390/include/asm/bugs.h +++ b/arch/s390/include/asm/bugs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   *  S390 version   *    Copyright IBM Corp. 
1999 diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 792cda339af1..dd08db491b89 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * CPU-measurement facilities   *   *  Copyright IBM Corp. 2012   *  Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>   *	       Jan Glauber <jang@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #ifndef _ASM_S390_CPU_MF_H  #define _ASM_S390_CPU_MF_H diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 9a3cb3983c01..1a61b1b997f2 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -194,13 +194,14 @@ struct arch_elf_state {  #define CORE_DUMP_USE_REGSET  #define ELF_EXEC_PAGESIZE	PAGE_SIZE -/* - * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ -#define ELF_ET_DYN_BASE		(is_compat_task() ? 0x000400000UL : \ -						    0x100000000UL) +/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical +   use of this is to invoke "./ld.so someprog" to test out a new version of +   the loader.  We need to make sure that it is out of the way of the program +   that it will "exec", and that there is sufficient room for the brk. 64-bit +   tasks are aligned to 4GB. */ +#define ELF_ET_DYN_BASE (is_compat_task() ? \ +				(STACK_TOP / 3 * 2) : \ +				(STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))  /* This yields a mask that user programs can use to figure out what     instruction set this CPU supports. */ diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 921391f2341e..13de80cf741c 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  #ifndef _ASM_S390_KPROBES_H  #define _ASM_S390_KPROBES_H  /*   *  Kernel Probes (KProbes)   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - *   * Copyright IBM Corp. 2002, 2006   *   * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index f3a9b5a445b6..e14f381757f6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * definition for kernel virtual machines on s390   *   * Copyright IBM Corp. 
2008, 2009   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   */ diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 41393052ac57..74eeec9c0a80 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * definition for paravirtual devices on s390   *   * Copyright IBM Corp. 2008   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Christian Borntraeger <borntraeger@de.ibm.com>   */  /* @@ -20,8 +17,6 @@   *   * Copyright IBM Corp. 2007,2008   * Author(s): Christian Borntraeger <borntraeger@de.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2.   */  #ifndef __S390_KVM_PARA_H  #define __S390_KVM_PARA_H diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index 6de5c6cb0061..672f95b12d40 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */  /*   * livepatch.h - s390-specific Kernel Live Patching Core   * @@ -7,13 +8,6 @@   *	      Jiri Slaby   */ -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - */ -  #ifndef ASM_LIVEPATCH_H  #define ASM_LIVEPATCH_H diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index f4a07f788f78..65154eaa3714 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -28,7 +28,7 @@ static inline int init_new_context(struct task_struct *tsk,  #ifdef CONFIG_PGSTE  	mm->context.alloc_pgste = page_table_allocate_pgste ||  		test_thread_flag(TIF_PGSTE) || -		current->mm->context.alloc_pgste; +		(current->mm && current->mm->context.alloc_pgste);  	mm->context.has_pgste = 0;  	mm->context.use_skey = 0;  	mm->context.use_cmma = 0; diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index d6c9d1e0dc2d..b9c0e361748b 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -40,6 +40,7 @@ struct pt_regs;  extern unsigned long perf_instruction_pointer(struct pt_regs *regs);  extern unsigned long perf_misc_flags(struct pt_regs *regs);  #define perf_misc_flags(regs) perf_misc_flags(regs) +#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs  /* Perf pt_regs extension for sample-data-entry indicators */  struct perf_sf_sde_regs { diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index d7fe9838084d..0a6b0286c32e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -709,7 +709,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)  	return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;  } -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write  static inline int pmd_write(pmd_t pmd)  {  	return (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) != 0; diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 
a3788dafc0e1..6f70d81c40f2 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -74,9 +74,14 @@ enum {   */  struct pt_regs   { -	unsigned long args[1]; -	psw_t psw; -	unsigned long gprs[NUM_GPRS]; +	union { +		user_pt_regs user_regs; +		struct { +			unsigned long args[1]; +			psw_t psw; +			unsigned long gprs[NUM_GPRS]; +		}; +	};  	unsigned long orig_gpr2;  	unsigned int int_code;  	unsigned int int_parm; diff --git a/arch/s390/include/asm/segment.h b/arch/s390/include/asm/segment.h index 8bfce3475b1c..97a0582b8d0f 100644 --- a/arch/s390/include/asm/segment.h +++ b/arch/s390/include/asm/segment.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _ASM_SEGMENT_H  #define _ASM_SEGMENT_H diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index ec7b476c1ac5..c61b2cc1a8a8 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -30,21 +30,20 @@ static inline void restore_access_regs(unsigned int *acrs)  	asm volatile("lam 0,15,%0" : : "Q" (*(acrstype *)acrs));  } -#define switch_to(prev,next,last) do {					\ -	if (prev->mm) {							\ -		save_fpu_regs();					\ -		save_access_regs(&prev->thread.acrs[0]);		\ -		save_ri_cb(prev->thread.ri_cb);				\ -		save_gs_cb(prev->thread.gs_cb);				\ -	}								\ +#define switch_to(prev, next, last) do {				\ +	/* save_fpu_regs() sets the CIF_FPU flag, which enforces	\ +	 * a restore of the floating point / vector registers as	\ +	 * soon as the next task returns to user space			\ +	 */								\ +	save_fpu_regs();						\ +	save_access_regs(&prev->thread.acrs[0]);			\ +	save_ri_cb(prev->thread.ri_cb);					\ +	save_gs_cb(prev->thread.gs_cb);					\  	update_cr_regs(next);						\ -	if (next->mm) {							\ -		set_cpu_flag(CIF_FPU);					\ -		restore_access_regs(&next->thread.acrs[0]);		\ -		restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);	\ -		restore_gs_cb(next->thread.gs_cb);			\ -	}								\ -	prev = __switch_to(prev,next);					\ +	restore_access_regs(&next->thread.acrs[0]);			\ +	restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);		\ +	restore_gs_cb(next->thread.gs_cb);				\ +	prev = __switch_to(prev, next);					\  } while (0)  #endif /* __ASM_SWITCH_TO_H */ diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 6bc941be6921..96f9a9151fde 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Access to user system call parameters and results   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #ifndef _ASM_SYSCALL_H diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index a702cb9d4269..25057c118d56 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * definition for store system information stsi   *   * Copyright IBM Corp. 2001, 2008   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. 
- *   *    Author(s): Ulrich Weigand <weigand@de.ibm.com>   *		 Christian Borntraeger <borntraeger@de.ibm.com>   */ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 1807229b292f..cca406fdbe51 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -53,6 +53,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu);  static inline void topology_init_early(void) { }  static inline void topology_schedule_update(void) { }  static inline int topology_cpu_init(struct cpu *cpu) { return 0; } +static inline int topology_cpu_dedicated(int cpu_nr) { return 0; }  static inline void topology_expect_change(void) { }  #endif /* CONFIG_SCHED_TOPOLOGY */ diff --git a/arch/s390/include/asm/vga.h b/arch/s390/include/asm/vga.h index d375526c261f..605dc46bac5e 100644 --- a/arch/s390/include/asm/vga.h +++ b/arch/s390/include/asm/vga.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  #ifndef _ASM_S390_VGA_H  #define _ASM_S390_VGA_H diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index 098f28778a13..92b7c9b3e641 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm diff --git a/arch/s390/include/uapi/asm/bpf_perf_event.h b/arch/s390/include/uapi/asm/bpf_perf_event.h new file mode 100644 index 000000000000..cefe7c7cd4f6 --- /dev/null +++ b/arch/s390/include/uapi/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__ +#define _UAPI__ASM_BPF_PERF_EVENT_H__ + +#include <asm/ptrace.h> + +typedef user_pt_regs bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 9ad172dcd912..38535a57fef8 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -6,10 +6,6 @@   *   * Copyright IBM Corp. 2008   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   */ diff --git a/arch/s390/include/uapi/asm/kvm_para.h b/arch/s390/include/uapi/asm/kvm_para.h index 0dc86b3a7cb0..b9ab584adf43 100644 --- a/arch/s390/include/uapi/asm/kvm_para.h +++ b/arch/s390/include/uapi/asm/kvm_para.h @@ -4,9 +4,5 @@   *   * Copyright IBM Corp. 2008   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Christian Borntraeger <borntraeger@de.ibm.com>   */ diff --git a/arch/s390/include/uapi/asm/kvm_perf.h b/arch/s390/include/uapi/asm/kvm_perf.h index c36c97ffdc6f..84606b8cc49e 100644 --- a/arch/s390/include/uapi/asm/kvm_perf.h +++ b/arch/s390/include/uapi/asm/kvm_perf.h @@ -4,10 +4,6 @@   *   * Copyright 2014 IBM Corp.   * Author(s): Alexander Yarygin <yarygin@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   
*/  #ifndef __LINUX_KVM_PERF_S390_H diff --git a/arch/s390/include/uapi/asm/perf_regs.h b/arch/s390/include/uapi/asm/perf_regs.h index 7c8564f98205..d17dd9e5d516 100644 --- a/arch/s390/include/uapi/asm/perf_regs.h +++ b/arch/s390/include/uapi/asm/perf_regs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */  #ifndef _ASM_S390_PERF_REGS_H  #define _ASM_S390_PERF_REGS_H diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h index 0d23c8ff2900..543dd70e12c8 100644 --- a/arch/s390/include/uapi/asm/ptrace.h +++ b/arch/s390/include/uapi/asm/ptrace.h @@ -162,7 +162,7 @@  #define GPR_SIZE	8  #define CR_SIZE		8 -#define STACK_FRAME_OVERHEAD    160      /* size of minimum stack frame */ +#define STACK_FRAME_OVERHEAD	160	 /* size of minimum stack frame */  #endif /* __s390x__ */ @@ -179,17 +179,16 @@  #define ACR_SIZE	4 -#define PTRACE_OLDSETOPTIONS         21 +#define PTRACE_OLDSETOPTIONS	     21  #ifndef __ASSEMBLY__  #include <linux/stddef.h>  #include <linux/types.h> -typedef union -{ -	float   f; -	double  d; -        __u64   ui; +typedef union { +	float	f; +	double	d; +	__u64	ui;  	struct  	{  		__u32 hi; @@ -197,23 +196,21 @@ typedef union  	} fp;  } freg_t; -typedef struct -{ -	__u32   fpc; +typedef struct { +	__u32	fpc;  	__u32	pad; -	freg_t  fprs[NUM_FPRS];               +	freg_t	fprs[NUM_FPRS];  } s390_fp_regs; -#define FPC_EXCEPTION_MASK      0xF8000000 -#define FPC_FLAGS_MASK          0x00F80000 -#define FPC_DXC_MASK            0x0000FF00 -#define FPC_RM_MASK             0x00000003 +#define FPC_EXCEPTION_MASK	0xF8000000 +#define FPC_FLAGS_MASK		0x00F80000 +#define FPC_DXC_MASK		0x0000FF00 +#define FPC_RM_MASK		0x00000003  /* this typedef defines how a Program Status Word looks like */ -typedef struct  -{ -        unsigned long mask; -        unsigned long addr; +typedef struct { +	unsigned long mask; +	unsigned long addr;  } __attribute__ ((aligned(8))) psw_t;  #ifndef __s390x__ @@ -282,8 +279,7 @@ typedef struct  /*   * The s390_regs structure is used to define the elf_gregset_t.   */ -typedef struct -{ +typedef struct {  	psw_t psw;  	unsigned long gprs[NUM_GPRS];  	unsigned int  acrs[NUM_ACRS]; @@ -291,24 +287,32 @@ typedef struct  } s390_regs;  /* + * The user_pt_regs structure exports the beginning of + * the in-kernel pt_regs structure to user space. + */ +typedef struct { +	unsigned long args[1]; +	psw_t psw; +	unsigned long gprs[NUM_GPRS]; +} user_pt_regs; + +/*   * Now for the user space program event recording (trace) definitions.   * The following structures are used only for the ptrace interface, don't   * touch or even look at it if you don't want to modify the user-space   * ptrace interface. In particular stay away from it for in-kernel PER.   
*/ -typedef struct -{ +typedef struct {  	unsigned long cr[NUM_CR_WORDS];  } per_cr_words;  #define PER_EM_MASK 0xE8000000UL -typedef	struct -{ +typedef struct {  #ifdef __s390x__ -	unsigned                       : 32; +	unsigned		       : 32;  #endif /* __s390x__ */ -	unsigned em_branching          : 1; +	unsigned em_branching	       : 1;  	unsigned em_instruction_fetch  : 1;  	/*  	 * Switching on storage alteration automatically fixes @@ -317,44 +321,41 @@ typedef	struct  	unsigned em_storage_alteration : 1;  	unsigned em_gpr_alt_unused     : 1;  	unsigned em_store_real_address : 1; -	unsigned                       : 3; +	unsigned		       : 3;  	unsigned branch_addr_ctl       : 1; -	unsigned                       : 1; +	unsigned		       : 1;  	unsigned storage_alt_space_ctl : 1; -	unsigned                       : 21; +	unsigned		       : 21;  	unsigned long starting_addr;  	unsigned long ending_addr;  } per_cr_bits; -typedef struct -{ +typedef struct {  	unsigned short perc_atmid;  	unsigned long address;  	unsigned char access_id;  } per_lowcore_words; -typedef struct -{ -	unsigned perc_branching          : 1; +typedef struct { +	unsigned perc_branching		 : 1;  	unsigned perc_instruction_fetch  : 1;  	unsigned perc_storage_alteration : 1; -	unsigned perc_gpr_alt_unused     : 1; +	unsigned perc_gpr_alt_unused	 : 1;  	unsigned perc_store_real_address : 1; -	unsigned                         : 3; -	unsigned atmid_psw_bit_31        : 1; -	unsigned atmid_validity_bit      : 1; -	unsigned atmid_psw_bit_32        : 1; -	unsigned atmid_psw_bit_5         : 1; -	unsigned atmid_psw_bit_16        : 1; -	unsigned atmid_psw_bit_17        : 1; -	unsigned si                      : 2; +	unsigned			 : 3; +	unsigned atmid_psw_bit_31	 : 1; +	unsigned atmid_validity_bit	 : 1; +	unsigned atmid_psw_bit_32	 : 1; +	unsigned atmid_psw_bit_5	 : 1; +	unsigned atmid_psw_bit_16	 : 1; +	unsigned atmid_psw_bit_17	 : 1; +	unsigned si			 : 2;  	unsigned long address; -	unsigned                         : 4; -	unsigned access_id               : 4; +	unsigned			 : 4; +	unsigned access_id		 : 4;  } per_lowcore_bits; -typedef struct -{ +typedef struct {  	union {  		per_cr_words   words;  		per_cr_bits    bits; @@ -364,9 +365,9 @@ typedef struct  	 * the kernel always sets them to zero. To enable single  	 * stepping use ptrace(PTRACE_SINGLESTEP) instead.  	 */ -	unsigned  single_step       : 1; +	unsigned  single_step	    : 1;  	unsigned  instruction_fetch : 1; -	unsigned                    : 30; +	unsigned		    : 30;  	/*  	 * These addresses are copied into cr10 & cr11 if single  	 * stepping is switched off @@ -376,11 +377,10 @@ typedef struct  	union {  		per_lowcore_words words;  		per_lowcore_bits  bits; -	} lowcore;  +	} lowcore;  } per_struct; -typedef struct -{ +typedef struct {  	unsigned int  len;  	unsigned long kernel_addr;  	unsigned long process_addr; @@ -390,12 +390,12 @@ typedef struct   * S/390 specific non posix ptrace requests. I chose unusual values so   * they are unlikely to clash with future ptrace definitions.   
*/ -#define PTRACE_PEEKUSR_AREA           0x5000 -#define PTRACE_POKEUSR_AREA           0x5001 +#define PTRACE_PEEKUSR_AREA	      0x5000 +#define PTRACE_POKEUSR_AREA	      0x5001  #define PTRACE_PEEKTEXT_AREA	      0x5002  #define PTRACE_PEEKDATA_AREA	      0x5003  #define PTRACE_POKETEXT_AREA	      0x5004 -#define PTRACE_POKEDATA_AREA 	      0x5005 +#define PTRACE_POKEDATA_AREA	      0x5005  #define PTRACE_GET_LAST_BREAK	      0x5006  #define PTRACE_PEEK_SYSTEM_CALL       0x5007  #define PTRACE_POKE_SYSTEM_CALL	      0x5008 @@ -413,21 +413,19 @@ typedef struct   * PT_PROT definition is loosely based on hppa bsd definition in   * gdb/hppab-nat.c   */ -#define PTRACE_PROT                       21 +#define PTRACE_PROT			  21 -typedef enum -{ +typedef enum {  	ptprot_set_access_watchpoint,  	ptprot_set_write_watchpoint,  	ptprot_disable_watchpoint  } ptprot_flags; -typedef struct -{ +typedef struct {  	unsigned long lowaddr;  	unsigned long hiaddr;  	ptprot_flags prot; -} ptprot_area;                      +} ptprot_area;  /* Sequence of bytes for breakpoint illegal instruction.  */  #define S390_BREAKPOINT     {0x0,0x1} @@ -439,8 +437,7 @@ typedef struct   * The user_regs_struct defines the way the user registers are   * store on the stack for signal handling.   */ -struct user_regs_struct -{ +struct user_regs_struct {  	psw_t psw;  	unsigned long gprs[NUM_GPRS];  	unsigned int  acrs[NUM_ACRS]; diff --git a/arch/s390/include/uapi/asm/sthyi.h b/arch/s390/include/uapi/asm/sthyi.h index ec113db4eb7e..b1b022316983 100644 --- a/arch/s390/include/uapi/asm/sthyi.h +++ b/arch/s390/include/uapi/asm/sthyi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */  #ifndef _UAPI_ASM_STHYI_H  #define _UAPI_ASM_STHYI_H diff --git a/arch/s390/include/uapi/asm/virtio-ccw.h b/arch/s390/include/uapi/asm/virtio-ccw.h index 967aad390105..2b605f7e8483 100644 --- a/arch/s390/include/uapi/asm/virtio-ccw.h +++ b/arch/s390/include/uapi/asm/virtio-ccw.h @@ -1,13 +1,9 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */  /*   * Definitions for virtio-ccw devices.   *   * Copyright IBM Corp. 2013   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *  Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>   */  #ifndef __KVM_VIRTIO_CCW_H diff --git a/arch/s390/include/uapi/asm/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h index 4caf71714a55..aeaaa030030e 100644 --- a/arch/s390/include/uapi/asm/vmcp.h +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */  /*   * Copyright IBM Corp. 2004, 2005   * Interface implementation for communication with the z/VM control program diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 137ef473584e..d568307321fc 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -9,20 +9,6 @@   *	       Eric Rossman (edrossma@us.ibm.com)   *   *  Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   */  #ifndef __ASM_S390_ZCRYPT_H diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 315986a06cf5..574e77622c04 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/module.h>  #include <asm/alternative.h>  #include <asm/facility.h> diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis  		return retval;  	} +	groups_sort(group_info);  	retval = set_current_groups(group_info);  	put_group_info(group_info); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 58b9e127b615..80e974adb9e8 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1392,7 +1392,7 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,  	else  		except_str = "-";  	caller = (unsigned long) entry->caller; -	rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %p  ", +	rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK  ",  		      area, sec, usec, level, except_str,  		      entry->id.fields.cpuid, (void *)caller);  	return rc; diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 3be829721cf9..b2c68fbf2634 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Disassemble s390 instructions.   * @@ -396,9 +397,14 @@ struct s390_insn *find_insn(unsigned char *code)  	unsigned char opfrag;  	int i; +	/* Search the opcode offset table to find an entry which +	 * matches the beginning of the opcode. If there is no match +	 * the last entry will be used, which is the default entry for +	 * unknown instructions as well as 1-byte opcode instructions. 
+	 */  	for (i = 0; i < ARRAY_SIZE(opcode_offset); i++) {  		entry = &opcode_offset[i]; -		if (entry->opcode == code[0] || entry->opcode == 0) +		if (entry->opcode == code[0])  			break;  	} @@ -543,7 +549,7 @@ void show_code(struct pt_regs *regs)  		start += opsize;  		pr_cont("%s", buffer);  		ptr = buffer; -		ptr += sprintf(ptr, "\n\t  "); +		ptr += sprintf(ptr, "\n          ");  		hops++;  	}  	pr_cont("\n"); diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 2aa545dca4d5..5b23c4f6e50c 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Stack dumping functions   * diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a316cd6999ad..9e5f6cd8e4c2 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -180,18 +180,17 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)   */  ENTRY(__switch_to)  	stmg	%r6,%r15,__SF_GPRS(%r15)	# store gprs of prev task -	lgr	%r1,%r2 -	aghi	%r1,__TASK_thread		# thread_struct of prev task -	lg	%r5,__TASK_stack(%r3)		# start of kernel stack of next -	stg	%r15,__THREAD_ksp(%r1)		# store kernel stack of prev -	lgr	%r1,%r3 -	aghi	%r1,__TASK_thread		# thread_struct of next task +	lghi	%r4,__TASK_stack +	lghi	%r1,__TASK_thread +	lg	%r5,0(%r4,%r3)			# start of kernel stack of next +	stg	%r15,__THREAD_ksp(%r1,%r2)	# store kernel stack of prev  	lgr	%r15,%r5  	aghi	%r15,STACK_INIT			# end of kernel stack of next  	stg	%r3,__LC_CURRENT		# store task struct of next  	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack -	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next -	mvc	__LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next +	lg	%r15,__THREAD_ksp(%r1,%r3)	# load kernel stack of next +	aghi	%r3,__TASK_pid +	mvc	__LC_CURRENT_PID(4,%r0),0(%r3)	# store pid of next  	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task  	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP  	bzr	%r14 diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 310e59e6eb4b..8ecb8726ac47 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *    ipl/reipl/dump support for Linux on s390.   * diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 1a6521af1751..af3722c28fd9 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   *  Kernel Probes (KProbes)   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - *   * Copyright IBM Corp. 2002, 2006   *   * s390 port, used ppc64 as template. 
Mike Grundy <grundym@us.ibm.com> diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index bf9622f0e6b1..452502f9a0d9 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Linux Guest Relocation (LGR) detection   * diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 7b87991416fd..b7abfad4fd7d 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   *  Kernel module help for s390.   * @@ -8,20 +9,6 @@   *   *  based on i386 version   *    Copyright (C) 2001 Rusty Russell. - * - *  This program is free software; you can redistribute it and/or modify - *  it under the terms of the GNU General Public License as published by - *  the Free Software Foundation; either version 2 of the License, or - *  (at your option) any later version. - * - *  This program is distributed in the hope that it will be useful, - *  but WITHOUT ANY WARRANTY; without even the implied warranty of - *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - *  GNU General Public License for more details. - * - *  You should have received a copy of the GNU General Public License - *  along with this program; if not, write to the Free Software - *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA   */  #include <linux/module.h>  #include <linux/elf.h> diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 6ff169253cae..c7a627620e5e 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *   Machine check handler   * diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 746d03423333..cc085e2d2ce9 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance event support for s390x - CPU-measurement Counter Facility   *   *  Copyright IBM Corp. 2012, 2017   *  Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #define KMSG_COMPONENT	"cpum_cf"  #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 227b38bd82c9..1c9ddd7aa5ec 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance event support for the System z CPU-measurement Sampling Facility   *   * Copyright IBM Corp. 2013   * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #define KMSG_COMPONENT	"cpum_sf"  #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 93a386f4a3b5..0d770e513abf 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Performance event support for s390x   *   *  Copyright IBM Corp. 
2012, 2013   *  Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #define KMSG_COMPONENT	"perf"  #define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index f8603ebed669..54e2d634b849 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  #include <linux/perf_event.h>  #include <linux/perf_regs.h>  #include <linux/kernel.h> diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 26c0523c1488..cd3df5514552 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -1651,6 +1651,14 @@ static const struct user_regset s390_compat_regsets[] = {  		.set = s390_gs_cb_set,  	},  	{ +		.core_note_type = NT_S390_GS_BC, +		.n = sizeof(struct gs_cb) / sizeof(__u64), +		.size = sizeof(__u64), +		.align = sizeof(__u64), +		.get = s390_gs_bc_get, +		.set = s390_gs_bc_set, +	}, +	{  		.core_note_type = NT_S390_RI_CB,  		.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),  		.size = sizeof(__u64), diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 090053cf279b..793da97f9a6e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  S390 version   *    Copyright IBM Corp. 1999, 2012 diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index cd4334e80b64..b8c1a85bcf2d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -55,6 +55,7 @@  #include <asm/sigp.h>  #include <asm/idle.h>  #include <asm/nmi.h> +#include <asm/topology.h>  #include "entry.h"  enum { diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index e66687dc6144..460dcfba7d4e 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Stack trace management functions   * diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 12981e197f01..80b862e9c53c 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * store hypervisor information instruction emulation functions.   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   * Copyright IBM Corp. 
2016   * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>   */ diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 308a7b63348b..f7fc63385553 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -370,10 +370,10 @@ SYSCALL(sys_recvmmsg,compat_sys_recvmmsg)  SYSCALL(sys_sendmmsg,compat_sys_sendmmsg)  SYSCALL(sys_socket,sys_socket)  SYSCALL(sys_socketpair,compat_sys_socketpair)		/* 360 */ -SYSCALL(sys_bind,sys_bind) -SYSCALL(sys_connect,sys_connect) +SYSCALL(sys_bind,compat_sys_bind) +SYSCALL(sys_connect,compat_sys_connect)  SYSCALL(sys_listen,sys_listen) -SYSCALL(sys_accept4,sys_accept4) +SYSCALL(sys_accept4,compat_sys_accept4)  SYSCALL(sys_getsockopt,compat_sys_getsockopt)		/* 365 */  SYSCALL(sys_setsockopt,compat_sys_setsockopt)  SYSCALL(sys_getsockname,compat_sys_getsockname) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index be6198193ec2..cf561160ea88 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *    Time of day based timer functions.   * diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index f9b393d4a078..4d5b65e527b5 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *    Copyright IBM Corp. 2007, 2011   *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 39a218703c50..f3a1c7c6824e 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * vdso setup for s390   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <linux/init.h> diff --git a/arch/s390/kernel/vdso32/clock_getres.S b/arch/s390/kernel/vdso32/clock_getres.S index eca3f001f081..f61df5253c23 100644 --- a/arch/s390/kernel/vdso32/clock_getres.S +++ b/arch/s390/kernel/vdso32/clock_getres.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of clock_getres() for 32 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S index a5769b83d90e..2d6ec3abe095 100644 --- a/arch/s390/kernel/vdso32/clock_gettime.S +++ b/arch/s390/kernel/vdso32/clock_gettime.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of clock_gettime() for 32 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   
*/  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso32/gettimeofday.S b/arch/s390/kernel/vdso32/gettimeofday.S index 63b86dceb0bf..aa8bf13a2edb 100644 --- a/arch/s390/kernel/vdso32/gettimeofday.S +++ b/arch/s390/kernel/vdso32/gettimeofday.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of gettimeofday() for 32 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S index c8513deb8c66..faf5213b15df 100644 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ b/arch/s390/kernel/vdso64/clock_getres.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of clock_getres() for 64 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 5d7b56b49458..6046b3bfca46 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of clock_gettime() for 64 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S index b02e62f3bc12..cc9dbc27da6f 100644 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ b/arch/s390/kernel/vdso64/gettimeofday.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Userland implementation of gettimeofday() for 64 bits processes in a   * s390 kernel for use in the vDSO   *   *  Copyright IBM Corp. 2008   *  Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation.   */  #include <asm/vdso.h>  #include <asm/asm-offsets.h> diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S index 79a071e4357e..db19d0680a0a 100644 --- a/arch/s390/kernel/vdso64/note.S +++ b/arch/s390/kernel/vdso64/note.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.   * Here we can supply some information useful to userland. 
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index dd7178fbb4f3..f24395a01918 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *    Virtual cpu timer based timer functions.   * diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 6048b1c6e580..05ee90a5ea08 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -1,10 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0  # Makefile for kernel virtual machines on s390  #  # Copyright IBM Corp. 2008 -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License (version 2 only) -# as published by the Free Software Foundation.  KVM := ../../../virt/kvm  common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index d93a2c0474bf..89aa114a2cba 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * handling diagnose instructions   *   * Copyright IBM Corp. 2008, 2011   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   */ diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index bec42b852246..f4c51756c462 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * access guest memory   *   * Copyright IBM Corp. 2008, 2014   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index bcbd86621d01..b5f3e82006d0 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kvm guest debug support   *   * Copyright IBM Corp. 2014   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>   */  #include <linux/kvm_host.h> diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8fe034beb623..9c7d70715862 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * in-kernel handling for sie intercepts   *   * Copyright IBM Corp. 2008, 2014   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. 
- *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index fa557372d600..024ad8bcc516 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * handling kvm guest interrupts   *   * Copyright IBM Corp. 2008, 2015   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   */ diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h index d98e4159643d..484608c71dd0 100644 --- a/arch/s390/kvm/irq.h +++ b/arch/s390/kvm/irq.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * s390 irqchip routines   *   * Copyright IBM Corp. 2014   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>   */  #ifndef __KVM_IRQ_H diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 98ad8b9e0360..2c93cbbcd15e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0  /* - * hosting zSeries kernel virtual machines + * hosting IBM Z kernel virtual machines (s390x)   * - * Copyright IBM Corp. 2008, 2009 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. + * Copyright IBM Corp. 2008, 2017   *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com> @@ -795,11 +792,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)  	if (kvm->arch.use_cmma) {  		/* -		 * Get the last slot. They should be sorted by base_gfn, so the -		 * last slot is also the one at the end of the address space. -		 * We have verified above that at least one slot is present. +		 * Get the first slot. They are reverse sorted by base_gfn, so +		 * the first slot is also the one at the end of the address +		 * space. We have verified above that at least one slot is +		 * present.  		 
*/ -		ms = slots->memslots + slots->used_slots - 1; +		ms = slots->memslots;  		/* round up so we only use full longs */  		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);  		/* allocate enough bytes to store all the bits */ @@ -3372,7 +3370,6 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  {  	int rc; -	sigset_t sigsaved;  	if (kvm_run->immediate_exit)  		return -EINTR; @@ -3382,8 +3379,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  		return 0;  	} -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); +	kvm_sigset_activate(vcpu);  	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {  		kvm_s390_vcpu_start(vcpu); @@ -3417,8 +3413,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  	disable_cpu_timer_accounting(vcpu);  	store_regs(vcpu, kvm_run); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &sigsaved, NULL); +	kvm_sigset_deactivate(vcpu);  	vcpu->stat.exit_userspace++;  	return rc; @@ -3811,6 +3806,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  			r = -EINVAL;  			break;  		} +		/* do not use irq_state.flags, it will break old QEMUs */  		r = kvm_s390_set_irq_state(vcpu,  					   (void __user *) irq_state.buf,  					   irq_state.len); @@ -3826,6 +3822,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  			r = -EINVAL;  			break;  		} +		/* do not use irq_state.flags, it will break old QEMUs */  		r = kvm_s390_get_irq_state(vcpu,  					   (__u8 __user *)  irq_state.buf,  					   irq_state.len); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 10d65dfbc306..5e46ba429bcb 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * definition for kvm on s390   *   * Copyright IBM Corp. 2008, 2009   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   *               Christian Ehrhardt <ehrhardt@de.ibm.com> diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c954ac49eee4..0714bfa56da0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * handling privileged instructions   *   * Copyright IBM Corp. 2008, 2013   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   */ @@ -235,8 +232,6 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)  		VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");  		return -EAGAIN;  	} -	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) -		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);  	return 0;  } @@ -247,6 +242,9 @@ static int handle_iske(struct kvm_vcpu *vcpu)  	int reg1, reg2;  	int rc; +	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) +		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); +  	rc = try_handle_skey(vcpu);  	if (rc)  		return rc != -EAGAIN ? 
rc : 0; @@ -276,6 +274,9 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)  	int reg1, reg2;  	int rc; +	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) +		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); +  	rc = try_handle_skey(vcpu);  	if (rc)  		return rc != -EAGAIN ? rc : 0; @@ -311,6 +312,9 @@ static int handle_sske(struct kvm_vcpu *vcpu)  	int reg1, reg2;  	int rc; +	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) +		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); +  	rc = try_handle_skey(vcpu);  	if (rc)  		return rc != -EAGAIN ? rc : 0; @@ -1002,7 +1006,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)  		cbrlo[entries] = gfn << PAGE_SHIFT;  	} -	if (orc) { +	if (orc && gfn < ms->bitmap_size) {  		/* increment only if we are really flipping the bit to 1 */  		if (!test_and_set_bit(gfn, ms->pgste_bitmap))  			atomic64_inc(&ms->dirty_pages); diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 9d592ef4104b..c1f5cde2c878 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * handling interprocessor communication   *   * Copyright IBM Corp. 2008, 2013   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): Carsten Otte <cotte@de.ibm.com>   *               Christian Borntraeger <borntraeger@de.ibm.com>   *               Christian Ehrhardt <ehrhardt@de.ibm.com> diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a311938b63b3..5d6ae0326d9e 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * kvm nested virtualization support for s390x   *   * Copyright IBM Corp. 2016   * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - *   *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>   */  #include <linux/vmalloc.h> diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index cae5a1e16cbd..c4f8039a35e8 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -89,11 +89,11 @@ EXPORT_SYMBOL(enable_sacf_uaccess);  void disable_sacf_uaccess(mm_segment_t old_fs)  { +	current->thread.mm_segment = old_fs;  	if (old_fs == USER_DS && test_facility(27)) {  		__ctl_load(S390_lowcore.user_asce, 1, 1);  		clear_cpu_flag(CIF_ASCE_PRIMARY);  	} -	current->thread.mm_segment = old_fs;  }  EXPORT_SYMBOL(disable_sacf_uaccess); diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 3d017171ff8f..6cf024eb2085 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  Collaborative memory management interface.   * diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b2c140193b0a..05d459b638f5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  KVM guest address space mapping code   * diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 5bea139517a2..831bdcf407bb 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -1,24 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+  /*   *  flexible mmap layout support   *   * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.   * All Rights Reserved.   
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA - * - *   * Started by Ingo Molnar <mingo@elte.hu>   */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 434a9564917b..cb364153c43c 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -83,8 +83,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)  	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */  	VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); -	if (end >= TASK_SIZE_MAX) -		return -ENOMEM;  	rc = 0;  	notify = 0;  	while (mm->context.asce_limit < end) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ae677f814bc0..4f2b65d01a70 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *    Copyright IBM Corp. 2007, 2011   *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index 90568c33ddb0..e0d5f245e42b 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Arch-specific network modules  # diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e81c16838b90..9557d8b516df 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -55,8 +55,7 @@ struct bpf_jit {  #define SEEN_LITERAL	8	/* code uses literals */  #define SEEN_FUNC	16	/* calls C functions */  #define SEEN_TAIL_CALL	32	/* code uses tail calls */ -#define SEEN_SKB_CHANGE	64	/* code changes skb data */ -#define SEEN_REG_AX	128	/* code uses constant blinding */ +#define SEEN_REG_AX	64	/* code uses constant blinding */  #define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)  /* @@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)  			EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,  				      REG_15, 152);  	} -	if (jit->seen & SEEN_SKB) +	if (jit->seen & SEEN_SKB) {  		emit_load_skb_data_hlen(jit); -	if (jit->seen & SEEN_SKB_CHANGE)  		/* stg %b1,ST_OFF_SKBP(%r0,%r15) */  		EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,  			      STK_OFF_SKBP); +	}  }  /* @@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i  		EMIT2(0x0d00, REG_14, REG_W1);  		/* lgr %b0,%r2: load return value into %b0 */  		EMIT4(0xb9040000, BPF_REG_0, REG_2); -		if (bpf_helper_changes_pkt_data((void *)func)) { -			jit->seen |= SEEN_SKB_CHANGE; +		if ((jit->seen & SEEN_SKB) && +		    bpf_helper_changes_pkt_data((void *)func)) {  			/* lg %b1,ST_OFF_SKBP(%r15) */  			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,  				      REG_15, STK_OFF_SKBP); diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile index f94ecaffa71b..66c2dff74895 100644 --- a/arch/s390/numa/Makefile +++ b/arch/s390/numa/Makefile @@ -1,3 
+1,4 @@ +# SPDX-License-Identifier: GPL-2.0  obj-y			+= numa.o  obj-y			+= toptree.o  obj-$(CONFIG_NUMA_EMU)	+= mode_emu.o diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 805d8b29193a..22d0871291ee 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0  #  # Makefile for the s390 PCI subsystem.  # diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 0fe649c0d542..4902fed221c0 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright IBM Corp. 2012   * diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index c2f786f0ea06..b482e95b6249 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   *  Copyright IBM Corp. 2012,2015   * diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 0d300ee00f4e..2d15d84c20ed 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * Copyright IBM Corp. 2012   * @@ -180,6 +181,9 @@ out_unlock:  static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,  			   size_t size, int flags)  { +	unsigned long irqflags; +	int ret; +  	/*  	 * With zdev->tlb_refresh == 0, rpcit is not required to establish new  	 * translations when previously invalid translation-table entries are @@ -195,8 +199,22 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,  			return 0;  	} -	return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, -				  PAGE_ALIGN(size)); +	ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, +				 PAGE_ALIGN(size)); +	if (ret == -ENOMEM && !s390_iommu_strict) { +		/* enable the hypervisor to free some resources */ +		if (zpci_refresh_global(zdev)) +			goto out; + +		spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); +		bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, +			      zdev->lazy_bitmap, zdev->iommu_pages); +		bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); +		spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); +		ret = 0; +	} +out: +	return ret;  }  static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 81b840bc6e4e..f069929e8211 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0  /*   * s390 specific pci instructions   * @@ -88,6 +89,9 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)  	if (cc)  		zpci_err_insn(cc, status, addr, range); +	if (cc == 1 && (status == 4 || status == 16)) +		return -ENOMEM; +  	return (cc) ? -EIO : 0;  } diff --git a/arch/s390/tools/gen_opcode_table.c b/arch/s390/tools/gen_opcode_table.c index 01d4c5a4bfe9..357d42681cef 100644 --- a/arch/s390/tools/gen_opcode_table.c +++ b/arch/s390/tools/gen_opcode_table.c @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */  /*   * Generate opcode table initializers for the in-kernel disassembler.   
* diff --git a/arch/score/include/uapi/asm/Kbuild b/arch/score/include/uapi/asm/Kbuild index c94ee54210bc..81271d3af47c 100644 --- a/arch/score/include/uapi/asm/Kbuild +++ b/arch/score/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y	+= bpf_perf_event.h  generic-y	+= siginfo.h diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index 77c35350ee77..412326d59e6f 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -9,6 +9,7 @@   */  #include <linux/init.h>  #include <linux/platform_device.h> +#include <linux/sh_eth.h>  #include <mach-se/mach/se.h>  #include <mach-se/mach/mrshpc.h>  #include <asm/machvec.h> @@ -115,13 +116,23 @@ static struct platform_device heartbeat_device = {  #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\  	defined(CONFIG_CPU_SUBTYPE_SH7712)  /* SH771X Ethernet driver */ +static struct sh_eth_plat_data sh_eth_plat = { +	.phy = PHY_ID, +	.phy_interface = PHY_INTERFACE_MODE_MII, +}; +  static struct resource sh_eth0_resources[] = {  	[0] = {  		.start = SH_ETH0_BASE, -		.end = SH_ETH0_BASE + 0x1B8, +		.end = SH_ETH0_BASE + 0x1B8 - 1,  		.flags = IORESOURCE_MEM,  	},  	[1] = { +		.start = SH_TSU_BASE, +		.end = SH_TSU_BASE + 0x200 - 1, +		.flags = IORESOURCE_MEM, +	}, +	[2] = {  		.start = SH_ETH0_IRQ,  		.end = SH_ETH0_IRQ,  		.flags = IORESOURCE_IRQ, @@ -132,7 +143,7 @@ static struct platform_device sh_eth0_device = {  	.name = "sh771x-ether",  	.id = 0,  	.dev = { -		.platform_data = PHY_ID, +		.platform_data = &sh_eth_plat,  	},  	.num_resources = ARRAY_SIZE(sh_eth0_resources),  	.resource = sh_eth0_resources, @@ -141,10 +152,15 @@ static struct platform_device sh_eth0_device = {  static struct resource sh_eth1_resources[] = {  	[0] = {  		.start = SH_ETH1_BASE, -		.end = SH_ETH1_BASE + 0x1B8, +		.end = SH_ETH1_BASE + 0x1B8 - 1,  		.flags = IORESOURCE_MEM,  	},  	[1] = { +		.start = SH_TSU_BASE, +		.end = SH_TSU_BASE + 0x200 - 1, +		.flags = IORESOURCE_MEM, +	}, +	[2] = {  		.start = SH_ETH1_IRQ,  		.end = SH_ETH1_IRQ,  		.flags = IORESOURCE_IRQ, @@ -155,7 +171,7 @@ static struct platform_device sh_eth1_device = {  	.name = "sh771x-ether",  	.id = 1,  	.dev = { -		.platform_data = PHY_ID, +		.platform_data = &sh_eth_plat,  	},  	.num_resources = ARRAY_SIZE(sh_eth1_resources),  	.resource = sh_eth1_resources, diff --git a/arch/sh/include/mach-se/mach/se.h b/arch/sh/include/mach-se/mach/se.h index 4246ef9b07a3..aa83fe1ff0b1 100644 --- a/arch/sh/include/mach-se/mach/se.h +++ b/arch/sh/include/mach-se/mach/se.h @@ -100,6 +100,7 @@  /* Base address */  #define SH_ETH0_BASE 0xA7000000  #define SH_ETH1_BASE 0xA7000400 +#define SH_TSU_BASE  0xA7000800  /* PHY ID */  #if defined(CONFIG_CPU_SUBTYPE_SH7710)  # define PHY_ID 0x00 diff --git a/arch/sh/include/uapi/asm/Kbuild b/arch/sh/include/uapi/asm/Kbuild index e28531333efa..ba4d39cb321d 100644 --- a/arch/sh/include/uapi/asm/Kbuild +++ b/arch/sh/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 5a9e96be1665..9937c5ff94a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -715,7 +715,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)  	return pte_pfn(pte);  } -#define __HAVE_ARCH_PMD_WRITE 
+#define pmd_write pmd_write  static inline unsigned long pmd_write(pmd_t pmd)  {  	pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/sparc/include/uapi/asm/Kbuild b/arch/sparc/include/uapi/asm/Kbuild index 2178c78c7c1a..4680ba246b55 100644 --- a/arch/sparc/include/uapi/asm/Kbuild +++ b/arch/sparc/include/uapi/asm/Kbuild @@ -1,4 +1,5 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += types.h diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 0f0f76b4f6cd..063556fe2cb1 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -19,7 +19,7 @@ lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o  lib-$(CONFIG_SPARC64) += multi3.o  lib-$(CONFIG_SPARC64) += fls.o  lib-$(CONFIG_SPARC64) += fls64.o -obj-$(CONFIG_SPARC64) += NG4fls.o +lib-$(CONFIG_SPARC64) += NG4fls.o  lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o  lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S index e5547b22cd18..0ddbbb031822 100644 --- a/arch/sparc/lib/hweight.S +++ b/arch/sparc/lib/hweight.S @@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)  	.previous  ENTRY(__arch_hweight64) -	sethi	%hi(__sw_hweight16), %g1 -	jmpl	%g1 + %lo(__sw_hweight16), %g0 +	sethi	%hi(__sw_hweight64), %g1 +	jmpl	%g1 + %lo(__sw_hweight64), %g0  	 nop  ENDPROC(__arch_hweight64)  EXPORT_SYMBOL(__arch_hweight64) diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index be3136f142a9..a8103a84b4ac 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,  	if (!printk_ratelimit())  		return; -	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", +	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",  	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,  	       tsk->comm, task_pid_nr(tsk), address,  	       (void *)regs->pc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 815c03d7a765..41363f46797b 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,  	if (!printk_ratelimit())  		return; -	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", +	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",  	       task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG,  	       tsk->comm, task_pid_nr(tsk), address,  	       (void *)regs->tpc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 5765e7e711f7..ff5f9cb3039a 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)  		u8 *func = ((u8 *)__bpf_call_base) + imm;  		ctx->saw_call = true; +		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) +			emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);  		emit_call((u32 *)func, ctx);  		emit_nop(ctx);  		emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); -		if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind) -			load_skb_regs(ctx, bpf2sparc[BPF_REG_6]); +		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) +			load_skb_regs(ctx, L7);  		break;  	} diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 2a26cc4fefc2..adfa21b18488 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -475,7 +475,6 @@ static inline void pmd_clear(pmd_t *pmdp)  #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))  #define pmd_huge_page(pmd)	pte_huge(pmd_pte(pmd))  #define pmd_mkhuge(pmd)		pte_pmd(pte_mkhuge(pmd_pte(pmd))) -#define __HAVE_ARCH_PMD_WRITE  #define pfn_pmd(pfn, pgprot)	pte_pmd(pfn_pte((pfn), (pgprot)))  #define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd)) diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild index 5711de0a1b5e..cc439612bcd5 100644 --- a/arch/tile/include/uapi/asm/Kbuild +++ b/arch/tile/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729..73c57f614c9e 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@  generic-y += barrier.h +generic-y += bpf_perf_event.h  generic-y += bug.h  generic-y += clkdev.h  generic-y += current.h diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index b668e351fd6c..fca34b2177e2 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);  /*   * Needed since we do not use the asm-generic/mm_hooks.h:   */ -static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)  {  	uml_setup_stubs(mm); +	return 0;  }  extern void arch_exit_mmap(struct mm_struct *mm);  static inline void arch_unmap(struct mm_struct *mm, diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 4e6fcb32620f..428644175956 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)  	if (!printk_ratelimit())  		return; -	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", +	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",  		task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG,  		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),  		(void *)UPT_IP(regs), (void *)UPT_SP(regs), diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 59b06b48f27d..5c205a9cb5a6 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -81,9 +81,10 @@ do { \  	} \  } while (0) -static inline void arch_dup_mmap(struct mm_struct *oldmm, -				 struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, +				struct mm_struct *mm)  { +	return 0;  }  static inline void arch_unmap(struct mm_struct *mm, diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild index 759a71411169..8611ef980554 100644 --- a/arch/unicore32/include/uapi/asm/Kbuild +++ b/arch/unicore32/include/uapi/asm/Kbuild @@ -3,6 +3,7 @@ include include/uapi/asm-generic/Kbuild.asm  generic-y += auxvec.h  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 5f25b39f04d4..c4ac6043ebb0 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -298,7 +298,6 @@ void abort(void)  	/* if that doesn't kill us, halt */  	panic("Oops failed to kill thread");  } -EXPORT_SYMBOL(abort);  void __init trap_init(void)  { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8eed3f94bfc7..20da391b5f32 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,7 +55,6 @@ config X86  	select ARCH_HAS_GCOV_PROFILE_ALL  	select ARCH_HAS_KCOV			if X86_64  	select ARCH_HAS_PMEM_API		if X86_64 -	# Causing hangs/crashes, see the commit that added this change for details.  	select ARCH_HAS_REFCOUNT  	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64  	select ARCH_HAS_SET_MEMORY @@ -89,6 +88,7 @@ config X86  	select GENERIC_CLOCKEVENTS_MIN_ADJUST  	select GENERIC_CMOS_UPDATE  	select GENERIC_CPU_AUTOPROBE +	select GENERIC_CPU_VULNERABILITIES  	select GENERIC_EARLY_IOREMAP  	select GENERIC_FIND_FIRST_BIT  	select GENERIC_IOMAP @@ -429,6 +429,19 @@ config GOLDFISH         def_bool y         depends on X86_GOLDFISH +config RETPOLINE +	bool "Avoid speculative indirect branches in kernel" +	default y +	help +	  Compile kernel with the retpoline compiler options to guard against +	  kernel-to-user data leaks by avoiding speculative indirect +	  branches. Requires a compiler with -mindirect-branch=thunk-extern +	  support for full protection. The kernel may run slower. + +	  Without compiler support, at least indirect branches in assembler +	  code are eliminated. Since this includes the syscall entry path, +	  it is not entirely pointless. 
+  config INTEL_RDT  	bool "Intel Resource Director Technology support"  	default n @@ -926,7 +939,8 @@ config MAXSMP  config NR_CPUS  	int "Maximum number of CPUs" if SMP && !MAXSMP  	range 2 8 if SMP && X86_32 && !X86_BIGSMP -	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK +	range 2 64 if SMP && X86_32 && X86_BIGSMP +	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64  	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64  	default "1" if !SMP  	default "8192" if MAXSMP diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..672441c008c7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER  config UNWINDER_GUESS  	bool "Guess unwinder"  	depends on EXPERT +	depends on !STACKDEPOT  	---help---  	  This option enables the "guess" unwinder for unwinding kernel stack  	  traces.  It scans the stack and reports every kernel text address it diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3e73bc255e4e..fad55160dcb9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -230,6 +230,14 @@ KBUILD_CFLAGS += -Wno-sign-compare  #  KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE +    RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) +    ifneq ($(RETPOLINE_CFLAGS),) +        KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE +    endif +endif +  archscripts: scripts_basic  	$(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1e9c322e973a..f25e1530e064 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o  ifdef CONFIG_X86_64  	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o  	vmlinux-objs-y += $(obj)/mem_encrypt.o +	vmlinux-objs-y += $(obj)/pgtable_64.o  endif  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 20919b4f3133..fc313e29fe2c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -305,10 +305,18 @@ ENTRY(startup_64)  	leaq	boot_stack_end(%rbx), %rsp  #ifdef CONFIG_X86_5LEVEL -	/* Check if 5-level paging has already enabled */ -	movq	%cr4, %rax -	testl	$X86_CR4_LA57, %eax -	jnz	lvl5 +	/* +	 * Check if we need to enable 5-level paging. +	 * RSI holds real mode data and need to be preserved across +	 * a function call. +	 */ +	pushq	%rsi +	call	l5_paging_required +	popq	%rsi + +	/* If l5_paging_required() returned zero, we're done here. */ +	cmpq	$0, %rax +	je	lvl5  	/*  	 * At this point we are in long mode with 4-level paging enabled, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..98761a1576ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,6 +169,16 @@ void __puthex(unsigned long value)  	}  } +static bool l5_supported(void) +{ +	/* Check if leaf 7 is supported. */ +	if (native_cpuid_eax(0) < 7) +		return 0; + +	/* Check if la57 is supported. 
*/ +	return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); +} +  #if CONFIG_X86_NEED_RELOCS  static void handle_relocations(void *output, unsigned long output_len,  			       unsigned long virt_addr) @@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,  	console_init();  	debug_putstr("early console in extract_kernel\n"); +	if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { +		error("This linux kernel as configured requires 5-level paging\n" +			"This CPU does not support the required 'cr4.la57' feature\n" +			"Unable to boot - please use a kernel appropriate for your CPU\n"); +	} +  	free_mem_ptr     = heap;	/* Heap */  	free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index d5364ca2e3f9..b5e5e02f8cde 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -23,6 +23,9 @@   */  #undef CONFIG_AMD_MEM_ENCRYPT +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION +  #include "misc.h"  /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..b4469a37e9a1 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,28 @@ +#include <asm/processor.h> + +/* + * __force_order is used by special_insns.h asm code to force instruction + * serialization. + * + * It is not referenced from the code, but GCC < 5 with -fPIE would fail + * due to an undefined symbol. Define it to make these ancient GCCs work. + */ +unsigned long __force_order; + +int l5_paging_required(void) +{ +	/* Check if leaf 7 is supported. */ + +	if (native_cpuid_eax(0) < 7) +		return 0; + +	/* Check if la57 is supported. */ +	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) +		return 0; + +	/* Check if 5-level paging has already been enabled. */ +	if (native_read_cr4() & X86_CR4_LA57) +		return 0; + +	return 1; +} diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 49f4970f693b..6a10d52a4145 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -44,9 +44,9 @@ FDINITRD=$6  # Make sure the files actually exist  verify "$FBZIMAGE" -verify "$MTOOLSRC"  genbzdisk() { +	verify "$MTOOLSRC"  	mformat a:  	syslinux $FIMAGE  	echo "$KCMDLINE" | mcopy - a:syslinux.cfg @@ -57,6 +57,7 @@ genbzdisk() {  }  genfdimage144() { +	verify "$MTOOLSRC"  	dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null  	mformat v:  	syslinux $FIMAGE @@ -68,6 +69,7 @@ genfdimage144() {  }  genfdimage288() { +	verify "$MTOOLSRC"  	dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null  	mformat w:  	syslinux $FIMAGE @@ -78,39 +80,43 @@ genfdimage288() {  	mcopy $FBZIMAGE w:linux  } -genisoimage() { +geniso() {  	tmp_dir=`dirname $FIMAGE`/isoimage  	rm -rf $tmp_dir  	mkdir $tmp_dir -	for i in lib lib64 share end ; do +	for i in lib lib64 share ; do  		for j in syslinux ISOLINUX ; do  			if [ -f /usr/$i/$j/isolinux.bin ] ; then  				isolinux=/usr/$i/$j/isolinux.bin -				cp $isolinux $tmp_dir  			fi  		done  		for j in syslinux syslinux/modules/bios ; do  			if [ -f /usr/$i/$j/ldlinux.c32 ]; then  				ldlinux=/usr/$i/$j/ldlinux.c32 -				cp $ldlinux $tmp_dir  			fi  		done  		if [ -n "$isolinux" -a -n "$ldlinux" ] ; then  			break  		fi -		if [ $i = end -a -z "$isolinux" ] ; then -			echo 'Need an isolinux.bin file, please install syslinux/isolinux.' 
-			exit 1 -		fi  	done +	if [ -z "$isolinux" ] ; then +		echo 'Need an isolinux.bin file, please install syslinux/isolinux.' +		exit 1 +	fi +	if [ -z "$ldlinux" ] ; then +		echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' +		exit 1 +	fi +	cp $isolinux $tmp_dir +	cp $ldlinux $tmp_dir  	cp $FBZIMAGE $tmp_dir/linux  	echo "$KCMDLINE" > $tmp_dir/isolinux.cfg  	if [ -f "$FDINITRD" ] ; then  		cp "$FDINITRD" $tmp_dir/initrd.img  	fi -	mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \ -		-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ -		$tmp_dir +	genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ +		-b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ +		-boot-info-table $tmp_dir  	isohybrid $FIMAGE 2>/dev/null || true  	rm -rf $tmp_dir  } @@ -119,6 +125,6 @@ case $1 in  	bzdisk)     genbzdisk;;  	fdimage144) genfdimage144;;  	fdimage288) genfdimage288;; -	isoimage)   genisoimage;; +	isoimage)   geniso;;  	*)          echo 'Unknown image format'; exit 1;  esac diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 16627fec80b2..3d09e3aca18d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@  #include <linux/linkage.h>  #include <asm/inst.h>  #include <asm/frame.h> +#include <asm/nospec-branch.h>  /*   * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)  	pxor INC, STATE4  	movdqu IV, 0x30(OUTP) -	call *%r11 +	CALL_NOSPEC %r11  	movdqu 0x00(OUTP), INC  	pxor INC, STATE1 @@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)  	_aesni_gf128mul_x_ble()  	movups IV, (IVP) -	call *%r11 +	CALL_NOSPEC %r11  	movdqu 0x40(OUTP), INC  	pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index f7c495e2863c..a14af6eb09cb 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,6 +17,7 @@  #include <linux/linkage.h>  #include <asm/frame.h> +#include <asm/nospec-branch.h>  #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:  	vpxor 14 * 16(%rax), %xmm15, %xmm14;  	vpxor 15 * 16(%rax), %xmm15, %xmm15; -	call *%r9; +	CALL_NOSPEC %r9;  	addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index eee5b3982cfd..b66bbfa62f50 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -12,6 +12,7 @@  #include <linux/linkage.h>  #include <asm/frame.h> +#include <asm/nospec-branch.h>  #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:  	vpxor 14 * 32(%rax), %ymm15, %ymm14;  	vpxor 15 * 32(%rax), %ymm15, %ymm15; -	call *%r9; +	CALL_NOSPEC %r9;  	addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 7a7de27c6f41..d9b734d0c8cc 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@  #include <asm/inst.h>  #include <linux/linkage.h> +#include <asm/nospec-branch.h>  ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block:  	movzxw  (bufp, %rax, 2), len  	lea	crc_array(%rip), bufp  	lea     (bufp, len, 1), bufp -	jmp     *bufp +	JMP_NOSPEC bufp  	################################################################  	
## 2a) PROCESS FULL BLOCKS: diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 399a29d067d6..cb91a64a99e7 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc,  	salsa20_ivsetup(ctx, walk.iv); -	if (likely(walk.nbytes == nbytes)) -	{ -		salsa20_encrypt_bytes(ctx, walk.src.virt.addr, -				      walk.dst.virt.addr, nbytes); -		return blkcipher_walk_done(desc, &walk, 0); -	} -  	while (walk.nbytes >= 64) {  		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,  				      walk.dst.virt.addr, diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3fd8bc560fae..3f48f695d5e6 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,6 +1,11 @@  /* SPDX-License-Identifier: GPL-2.0 */  #include <linux/jump_label.h>  #include <asm/unwind_hints.h> +#include <asm/cpufeatures.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/asm-offsets.h> +#include <asm/processor-flags.h>  /* @@ -187,6 +192,148 @@ For 32-bit we have the following conventions - kernel is built with  #endif  .endm +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two + * halves: + */ +#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT +#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT) +#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT +#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT) +#define PTI_USER_PGTABLE_AND_PCID_MASK  (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK) + +.macro SET_NOFLUSH_BIT	reg:req +	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg +.endm + +.macro ADJUST_KERNEL_CR3 reg:req +	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID +	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ +	andq    $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg +.endm + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req +	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI +	mov	%cr3, \scratch_reg +	ADJUST_KERNEL_CR3 \scratch_reg +	mov	\scratch_reg, %cr3 +.Lend_\@: +.endm + +#define THIS_CPU_user_pcid_flush_mask   \ +	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req +	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI +	mov	%cr3, \scratch_reg + +	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + +	/* +	 * Test if the ASID needs a flush. +	 */ +	movq	\scratch_reg, \scratch_reg2 +	andq	$(0x7FF), \scratch_reg		/* mask ASID */ +	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask +	jnc	.Lnoflush_\@ + +	/* Flush needed, clear the bit */ +	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask +	movq	\scratch_reg2, \scratch_reg +	jmp	.Lwrcr3_pcid_\@ + +.Lnoflush_\@: +	movq	\scratch_reg2, \scratch_reg +	SET_NOFLUSH_BIT \scratch_reg + +.Lwrcr3_pcid_\@: +	/* Flip the ASID to the user version */ +	orq	$(PTI_USER_PCID_MASK), \scratch_reg + +.Lwrcr3_\@: +	/* Flip the PGD to the user version */ +	orq     $(PTI_USER_PGTABLE_MASK), \scratch_reg +	mov	\scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req +	pushq	%rax +	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax +	popq	%rax +.endm + +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req +	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI +	movq	%cr3, \scratch_reg +	movq	\scratch_reg, \save_reg +	/* +	 * Test the user pagetable bit. If set, then the user page tables +	 * are active. 
If clear CR3 already has the kernel page table +	 * active. +	 */ +	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg +	jnc	.Ldone_\@ + +	ADJUST_KERNEL_CR3 \scratch_reg +	movq	\scratch_reg, %cr3 + +.Ldone_\@: +.endm + +.macro RESTORE_CR3 scratch_reg:req save_reg:req +	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + +	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + +	/* +	 * KERNEL pages can always resume with NOFLUSH as we do +	 * explicit flushes. +	 */ +	bt	$PTI_USER_PGTABLE_BIT, \save_reg +	jnc	.Lnoflush_\@ + +	/* +	 * Check if there's a pending flush for the user ASID we're +	 * about to set. +	 */ +	movq	\save_reg, \scratch_reg +	andq	$(0x7FF), \scratch_reg +	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask +	jnc	.Lnoflush_\@ + +	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask +	jmp	.Lwrcr3_\@ + +.Lnoflush_\@: +	SET_NOFLUSH_BIT \save_reg + +.Lwrcr3_\@: +	/* +	 * The CR3 write could be avoided when not changing its value, +	 * but would require a CR3 read *and* a scratch register. +	 */ +	movq	\save_reg, %cr3 +.Lend_\@: +.endm + +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req +.endm +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req +.endm +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req +.endm +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req +.endm +.macro RESTORE_CR3 scratch_reg:req save_reg:req +.endm + +#endif +  #endif /* CONFIG_X86_64 */  /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4838037f97f6..a1f28a54f23a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@  #include <asm/asm.h>  #include <asm/smap.h>  #include <asm/frame.h> +#include <asm/nospec-branch.h>  	.section .entry.text, "ax" @@ -290,7 +291,7 @@ ENTRY(ret_from_fork)  	/* kernel thread */  1:	movl	%edi, %eax -	call	*%ebx +	CALL_NOSPEC %ebx  	/*  	 * A kernel thread is allowed to return here after successfully  	 * calling do_execve().  Exit to userspace to complete the execve() @@ -919,7 +920,7 @@ common_exception:  	movl	%ecx, %es  	TRACE_IRQS_OFF  	movl	%esp, %eax			# pt_regs pointer -	call	*%edi +	CALL_NOSPEC %edi  	jmp	ret_from_exception  END(common_exception) @@ -941,9 +942,10 @@ ENTRY(debug)  	movl	%esp, %eax			# pt_regs pointer  	/* Are we currently on the SYSENTER stack? */ -	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) -	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */ -	cmpl	$SIZEOF_SYSENTER_stack, %ecx +	movl	PER_CPU_VAR(cpu_entry_area), %ecx +	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx +	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */ +	cmpl	$SIZEOF_entry_stack, %ecx  	jb	.Ldebug_from_sysenter_stack  	TRACE_IRQS_OFF @@ -984,9 +986,10 @@ ENTRY(nmi)  	movl	%esp, %eax			# pt_regs pointer  	/* Are we currently on the SYSENTER stack? */ -	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) -	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */ -	cmpl	$SIZEOF_SYSENTER_stack, %ecx +	movl	PER_CPU_VAR(cpu_entry_area), %ecx +	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx +	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */ +	cmpl	$SIZEOF_entry_stack, %ecx  	jb	.Lnmi_from_sysenter_stack  	/* Not on SYSENTER stack. 
*/ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f81d50d7ceac..4f8e1d35a97c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -23,7 +23,6 @@  #include <asm/segment.h>  #include <asm/cache.h>  #include <asm/errno.h> -#include "calling.h"  #include <asm/asm-offsets.h>  #include <asm/msr.h>  #include <asm/unistd.h> @@ -38,8 +37,11 @@  #include <asm/pgtable_types.h>  #include <asm/export.h>  #include <asm/frame.h> +#include <asm/nospec-branch.h>  #include <linux/err.h> +#include "calling.h" +  .code64  .section .entry.text, "ax" @@ -140,6 +142,67 @@ END(native_usergs_sysret64)   * with them due to bugs in both AMD and Intel CPUs.   */ +	.pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline.  This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address).  So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline.  We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ +	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \ +			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) +	UNWIND_HINT_EMPTY +	swapgs + +	/* Stash the user RSP. */ +	movq	%rsp, RSP_SCRATCH + +	/* Note: using %rsp as a scratch reg. */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + +	/* Load the top of the task stack into RSP */ +	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + +	/* Start building the simulated IRET frame. */ +	pushq	$__USER_DS			/* pt_regs->ss */ +	pushq	RSP_SCRATCH			/* pt_regs->sp */ +	pushq	%r11				/* pt_regs->flags */ +	pushq	$__USER_CS			/* pt_regs->cs */ +	pushq	%rcx				/* pt_regs->ip */ + +	/* +	 * x86 lacks a near absolute jump, and we can't jump to the real +	 * entry text with a relative jump.  We could push the target +	 * address and then use retq, but this destroys the pipeline on +	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead, +	 * spill RDI and restore it in a second-stage trampoline. +	 */ +	pushq	%rdi +	movq	$entry_SYSCALL_64_stage2, %rdi +	JMP_NOSPEC %rdi +END(entry_SYSCALL_64_trampoline) + +	.popsection + +ENTRY(entry_SYSCALL_64_stage2) +	UNWIND_HINT_EMPTY +	popq	%rdi +	jmp	entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) +  ENTRY(entry_SYSCALL_64)  	UNWIND_HINT_EMPTY  	/* @@ -149,6 +212,10 @@ ENTRY(entry_SYSCALL_64)  	 */  	swapgs +	/* +	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it +	 * is not required to switch CR3. +	 */  	movq	%rsp, PER_CPU_VAR(rsp_scratch)  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -204,7 +271,12 @@ entry_SYSCALL_64_fastpath:  	 * It might end up jumping to the slow path.  If it jumps, RAX  	 * and all argument registers are clobbered.  	 */ +#ifdef CONFIG_RETPOLINE +	movq	sys_call_table(, %rax, 8), %rax +	call	__x86_indirect_thunk_rax +#else  	call	*sys_call_table(, %rax, 8) +#endif  .Lentry_SYSCALL_64_after_fastpath_call:  	movq	%rax, RAX(%rsp) @@ -330,8 +402,25 @@ syscall_return_via_sysret:  	popq	%rsi	/* skip rcx */  	popq	%rdx  	popq	%rsi + +	/* +	 * Now all regs are restored except RSP and RDI. 
+	 * Save old stack pointer and switch to trampoline stack. +	 */ +	movq	%rsp, %rdi +	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + +	pushq	RSP-RDI(%rdi)	/* RSP */ +	pushq	(%rdi)		/* RDI */ + +	/* +	 * We are on the trampoline stack.  All regs except RDI are live. +	 * We can do future final exit work right here. +	 */ +	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi +  	popq	%rdi -	movq	RSP-ORIG_RAX(%rsp), %rsp +	popq	%rsp  	USERGS_SYSRET64  END(entry_SYSCALL_64) @@ -359,7 +448,7 @@ ENTRY(stub_ptregs_64)  	jmp	entry_SYSCALL64_slow_path  1: -	jmp	*%rax				/* Called from C */ +	JMP_NOSPEC %rax				/* Called from C */  END(stub_ptregs_64)  .macro ptregs_stub func @@ -438,7 +527,7 @@ ENTRY(ret_from_fork)  1:  	/* kernel thread */  	movq	%r12, %rdi -	call	*%rbx +	CALL_NOSPEC %rbx  	/*  	 * A kernel thread is allowed to return here after successfully  	 * calling do_execve().  Exit to userspace to complete the execve() @@ -466,12 +555,13 @@ END(irq_entries_start)  .macro DEBUG_ENTRY_ASSERT_IRQS_OFF  #ifdef CONFIG_DEBUG_ENTRY -	pushfq -	testl $X86_EFLAGS_IF, (%rsp) +	pushq %rax +	SAVE_FLAGS(CLBR_RAX) +	testl $X86_EFLAGS_IF, %eax  	jz .Lokay_\@  	ud2  .Lokay_\@: -	addq $8, %rsp +	popq %rax  #endif  .endm @@ -563,6 +653,13 @@ END(irq_entries_start)  /* 0(%rsp): ~(interrupt number) */  	.macro interrupt func  	cld + +	testb	$3, CS-ORIG_RAX(%rsp) +	jz	1f +	SWAPGS +	call	switch_to_thread_stack +1: +  	ALLOC_PT_GPREGS_ON_STACK  	SAVE_C_REGS  	SAVE_EXTRA_REGS @@ -572,12 +669,8 @@ END(irq_entries_start)  	jz	1f  	/* -	 * IRQ from user mode.  Switch to kernel gsbase and inform context -	 * tracking that we're in kernel mode. -	 */ -	SWAPGS - -	/* +	 * IRQ from user mode. +	 *  	 * We need to tell lockdep that IRQs are off.  We can't do this until  	 * we fix gsbase, and we should do it before enter_from_user_mode  	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent, @@ -630,10 +723,43 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)  	ud2  1:  #endif -	SWAPGS  	POP_EXTRA_REGS -	POP_C_REGS -	addq	$8, %rsp	/* skip regs->orig_ax */ +	popq	%r11 +	popq	%r10 +	popq	%r9 +	popq	%r8 +	popq	%rax +	popq	%rcx +	popq	%rdx +	popq	%rsi + +	/* +	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. +	 * Save old stack pointer and switch to trampoline stack. +	 */ +	movq	%rsp, %rdi +	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + +	/* Copy the IRET frame to the trampoline stack. */ +	pushq	6*8(%rdi)	/* SS */ +	pushq	5*8(%rdi)	/* RSP */ +	pushq	4*8(%rdi)	/* EFLAGS */ +	pushq	3*8(%rdi)	/* CS */ +	pushq	2*8(%rdi)	/* RIP */ + +	/* Push user RDI on the trampoline stack. */ +	pushq	(%rdi) + +	/* +	 * We are on the trampoline stack.  All regs except RDI are live. +	 * We can do future final exit work right here. +	 */ + +	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + +	/* Restore RDI. */ +	popq	%rdi +	SWAPGS  	INTERRUPT_RETURN @@ -713,7 +839,9 @@ native_irq_return_ldt:  	 */  	pushq	%rdi				/* Stash user RDI */ -	SWAPGS +	SWAPGS					/* to kernel GS */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */ +  	movq	PER_CPU_VAR(espfix_waddr), %rdi  	movq	%rax, (0*8)(%rdi)		/* user RAX */  	movq	(1*8)(%rsp), %rax		/* user RIP */ @@ -729,7 +857,6 @@ native_irq_return_ldt:  	/* Now RAX == RSP. */  	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */ -	popq	%rdi				/* Restore user RDI */  	/*  	 * espfix_stack[31:16] == 0.  The page tables are set up such that @@ -740,7 +867,11 @@ native_irq_return_ldt:  	 * still points to an RO alias of the ESPFIX stack.  	 
*/  	orq	PER_CPU_VAR(espfix_stack), %rax -	SWAPGS + +	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi +	SWAPGS					/* to user GS */ +	popq	%rdi				/* Restore user RDI */ +  	movq	%rax, %rsp  	UNWIND_HINT_IRET_REGS offset=8 @@ -829,7 +960,35 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt  /*   * Exception entry points.   */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + +/* + * Switch to the thread stack.  This is called with the IRET frame and + * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) + */ +ENTRY(switch_to_thread_stack) +	UNWIND_HINT_FUNC + +	pushq	%rdi +	/* Need to switch before accessing the thread stack. */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi +	movq	%rsp, %rdi +	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp +	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + +	pushq	7*8(%rdi)		/* regs->ss */ +	pushq	6*8(%rdi)		/* regs->rsp */ +	pushq	5*8(%rdi)		/* regs->eflags */ +	pushq	4*8(%rdi)		/* regs->cs */ +	pushq	3*8(%rdi)		/* regs->ip */ +	pushq	2*8(%rdi)		/* regs->orig_ax */ +	pushq	8(%rdi)			/* return address */ +	UNWIND_HINT_FUNC + +	movq	(%rdi), %rdi +	ret +END(switch_to_thread_stack)  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1  ENTRY(\sym) @@ -848,11 +1007,12 @@ ENTRY(\sym)  	ALLOC_PT_GPREGS_ON_STACK -	.if \paranoid -	.if \paranoid == 1 +	.if \paranoid < 2  	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */ -	jnz	1f +	jnz	.Lfrom_usermode_switch_stack_\@  	.endif + +	.if \paranoid  	call	paranoid_entry  	.else  	call	error_entry @@ -894,20 +1054,15 @@ ENTRY(\sym)  	jmp	error_exit  	.endif -	.if \paranoid == 1 +	.if \paranoid < 2  	/* -	 * Paranoid entry from userspace.  Switch stacks and treat it +	 * Entry from userspace.  Switch stacks and treat it  	 * as a normal entry.  This means that paranoid handlers  	 * run in real process context if user_mode(regs).  	 */ -1: +.Lfrom_usermode_switch_stack_\@:  	call	error_entry - -	movq	%rsp, %rdi			/* pt_regs pointer */ -	call	sync_regs -	movq	%rax, %rsp			/* switch stack */ -  	movq	%rsp, %rdi			/* pt_regs pointer */  	.if \has_error_code @@ -1119,7 +1274,11 @@ ENTRY(paranoid_entry)  	js	1f				/* negative -> in kernel */  	SWAPGS  	xorl	%ebx, %ebx -1:	ret + +1: +	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + +	ret  END(paranoid_entry)  /* @@ -1141,6 +1300,7 @@ ENTRY(paranoid_exit)  	testl	%ebx, %ebx			/* swapgs needed? */  	jnz	.Lparanoid_exit_no_swapgs  	TRACE_IRQS_IRETQ +	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14  	SWAPGS_UNSAFE_STACK  	jmp	.Lparanoid_exit_restore  .Lparanoid_exit_no_swapgs: @@ -1168,8 +1328,18 @@ ENTRY(error_entry)  	 * from user mode due to an IRET fault.  	 */  	SWAPGS +	/* We have user CR3.  Change to kernel CR3. */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax  .Lerror_entry_from_usermode_after_swapgs: +	/* Put us onto the real thread stack. */ +	popq	%r12				/* save return addr in %r12 */ +	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */ +	call	sync_regs +	movq	%rax, %rsp			/* switch stack */ +	ENCODE_FRAME_POINTER +	pushq	%r12 +  	/*  	 * We need to tell lockdep that IRQs are off.  We can't do this until  	 * we fix gsbase, and we should do it before enter_from_user_mode  	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent, @@ -1206,6 +1376,7 @@ ENTRY(error_entry)  	 * .Lgs_change's error handler with kernel gsbase.  	 
*/  SWAPGS +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax  	jmp .Lerror_entry_done  .Lbstep_iret: @@ -1215,10 +1386,11 @@ ENTRY(error_entry)  .Lerror_bad_iret:  	/* -	 * We came from an IRET to user mode, so we have user gsbase. -	 * Switch to kernel gsbase: +	 * We came from an IRET to user mode, so we have user +	 * gsbase and CR3.  Switch to kernel gsbase and CR3:  	 */  	SWAPGS +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax  	/*  	 * Pretend that the exception came from user mode: set up pt_regs @@ -1250,6 +1422,10 @@ END(error_exit)  /*   * Runs on exception stack.  Xen PV does not go through this path at all,   * so we can use real assembly here. + * + * Registers: + *	%r14: Used to save/restore the CR3 of the interrupted context + *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.   */  ENTRY(nmi)  	UNWIND_HINT_IRET_REGS @@ -1313,6 +1489,7 @@ ENTRY(nmi)  	swapgs  	cld +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx  	movq	%rsp, %rdx  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp  	UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -1565,6 +1742,8 @@ end_repeat_nmi:  	movq	$-1, %rsi  	call	do_nmi +	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 +  	testl	%ebx, %ebx			/* swapgs needed? */  	jnz	nmi_restore  nmi_swapgs: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 568e130d932c..98d5358e4041 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -48,7 +48,11 @@   */  ENTRY(entry_SYSENTER_compat)  	/* Interrupts are off on entry. */ -	SWAPGS_UNSAFE_STACK +	SWAPGS + +	/* We are about to clobber %rsp anyway, clobbering here is OK */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp +  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp  	/* @@ -186,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)  	/* Interrupts are off on entry. */  	swapgs -	/* Stash user ESP and switch to the kernel stack. */ +	/* Stash user ESP */  	movl	%esp, %r8d + +	/* Use %rsp as scratch reg. User ESP is stashed in r8 */ +	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + +	/* Switch to the kernel stack */  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp  	/* Construct struct pt_regs on stack */ @@ -256,10 +265,22 @@ sysret32_from_system_call:  	 * when the system call started, which is already known to user  	 * code.  We zero R8-R10 to avoid info leaks.           */ +	movq	RSP-ORIG_RAX(%rsp), %rsp + +	/* +	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored +	 * on the process stack which is not mapped to userspace and +	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3 +	 * switch until after the last reference to the process +	 * stack. +	 * +	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber. 
+	 */ +	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 +  	xorq	%r8, %r8  	xorq	%r9, %r9  	xorq	%r10, %r10 -	movq	RSP-ORIG_RAX(%rsp), %rsp  	swapgs  	sysretl  END(entry_SYSCALL_compat) @@ -306,8 +327,11 @@ ENTRY(entry_INT80_compat)  	 */  	movl	%eax, %eax -	/* Construct struct pt_regs on stack (iret frame is already on stack) */  	pushq	%rax			/* pt_regs->orig_ax */ + +	/* switch to thread stack expects orig_ax to be pushed */ +	call	switch_to_thread_stack +  	pushq	%rdi			/* pt_regs->di */  	pushq	%rsi			/* pt_regs->si */  	pushq	%rdx			/* pt_regs->dx */ diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 11b13c4b43d5..f19856d95c60 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -324,5 +324,5 @@ notrace time_t __vdso_time(time_t *t)  		*t = result;  	return result;  } -int time(time_t *t) +time_t time(time_t *t)  	__attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index f279ba2643dc..577fa8adb785 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -37,6 +37,7 @@  #include <asm/unistd.h>  #include <asm/fixmap.h>  #include <asm/traps.h> +#include <asm/paravirt.h>  #define CREATE_TRACE_POINTS  #include "vsyscall_trace.h" @@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	WARN_ON_ONCE(address != regs->ip); +	/* This should be unreachable in NATIVE mode. */ +	if (WARN_ON(vsyscall_mode == NATIVE)) +		return false; +  	if (vsyscall_mode == NONE) {  		warn_bad_vsyscall(KERN_INFO, regs,  				  "vsyscall attempted with vsyscall=none"); @@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)  	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;  } +/* + * The VSYSCALL page is the only user-accessible page in the kernel address + * range.  Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. + * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present.  If so, we skip calling + * this. + */ +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) +{ +	pgd_t *pgd; +	p4d_t *p4d; +	pud_t *pud; +	pmd_t *pmd; + +	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); +	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); +	p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 +	p4d->p4d |= _PAGE_USER; +#endif +	pud = pud_offset(p4d, VSYSCALL_ADDR); +	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); +	pmd = pmd_offset(pud, VSYSCALL_ADDR); +	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} +  void __init map_vsyscall(void)  {  	extern char __vsyscall_page;  	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); -	if (vsyscall_mode != NONE) +	if (vsyscall_mode != NONE) {  		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,  			     vsyscall_mode == NATIVE  			     ? 
PAGE_KERNEL_VSYSCALL  			     : PAGE_KERNEL_VVAR); +		set_vsyscall_pgtable_user_bits(swapper_pg_dir); +	}  	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=  		     (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 141e07b06216..24ffa1e88cf9 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -582,6 +582,24 @@ static __init int bts_init(void)  	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)  		return -ENODEV; +	if (boot_cpu_has(X86_FEATURE_PTI)) { +		/* +		 * BTS hardware writes through a virtual memory map; we must +		 * either use the kernel physical map, or the user mapping of +		 * the AUX buffer. +		 * +		 * However, since this driver supports per-CPU and per-task inherit, +		 * we cannot use the user mapping since it will not be available +		 * if we're not running the owning process. +		 * +		 * With PTI we can't use the kernel map either, because it's not +		 * there when we run userspace. +		 * +		 * For now, disable this driver when using PTI. +		 */ +		return -ENODEV; +	} +  	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |  				  PERF_PMU_CAP_EXCLUSIVE;  	bts_pmu.task_ctx_nr	= perf_sw_context; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09c26a4f139c..731153a4681e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {  __init int intel_pmu_init(void)  { +	struct attribute **extra_attr = NULL; +	struct attribute **to_free = NULL;  	union cpuid10_edx edx;  	union cpuid10_eax eax;  	union cpuid10_ebx ebx; @@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)  	unsigned int unused;  	struct extra_reg *er;  	int version, i; -	struct attribute **extra_attr = NULL;  	char *name;  	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)  		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?  			
hsw_format_attr : nhm_format_attr;  		extra_attr = merge_attr(extra_attr, skl_format_attr); +		to_free = extra_attr;  		x86_pmu.cpu_events = get_hsw_events_attrs();  		intel_pmu_pebs_data_source_skl(  			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); @@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)  		pr_cont("full-width counters, ");  	} +	kfree(to_free);  	return 0;  } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3674a4b6f8bd..8156e47da7ba 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -3,16 +3,19 @@  #include <linux/types.h>  #include <linux/slab.h> +#include <asm/cpu_entry_area.h>  #include <asm/perf_event.h> +#include <asm/tlbflush.h>  #include <asm/insn.h>  #include "../perf_event.h" +/* Waste a full page so it can be mapped into the cpu_entry_area */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); +  /* The size of a BTS record in bytes: */  #define BTS_RECORD_SIZE		24 -#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)  #define PEBS_FIXUP_SIZE		PAGE_SIZE  /* @@ -279,17 +282,67 @@ void fini_debug_store_on_cpu(int cpu)  static DEFINE_PER_CPU(void *, insn_buffer); -static int alloc_pebs_buffer(int cpu) +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)  { -	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +	unsigned long start = (unsigned long)cea; +	phys_addr_t pa; +	size_t msz = 0; + +	pa = virt_to_phys(addr); + +	preempt_disable(); +	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) +		cea_set_pte(cea, pa, prot); + +	/* +	 * This is a cross-CPU update of the cpu_entry_area, we must shoot down +	 * all TLB entries for it. +	 */ +	flush_tlb_kernel_range(start, start + size); +	preempt_enable(); +} + +static void ds_clear_cea(void *cea, size_t size) +{ +	unsigned long start = (unsigned long)cea; +	size_t msz = 0; + +	preempt_disable(); +	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) +		cea_set_pte(cea, 0, PAGE_NONE); + +	flush_tlb_kernel_range(start, start + size); +	preempt_enable(); +} + +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +{ +	unsigned int order = get_order(size);  	int node = cpu_to_node(cpu); -	int max; -	void *buffer, *ibuffer; +	struct page *page; + +	page = __alloc_pages_node(node, flags | __GFP_ZERO, order); +	return page ? 
page_address(page) : NULL; +} + +static void dsfree_pages(const void *buffer, size_t size) +{ +	if (buffer) +		free_pages((unsigned long)buffer, get_order(size)); +} + +static int alloc_pebs_buffer(int cpu) +{ +	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); +	struct debug_store *ds = hwev->ds; +	size_t bsiz = x86_pmu.pebs_buffer_size; +	int max, node = cpu_to_node(cpu); +	void *buffer, *ibuffer, *cea;  	if (!x86_pmu.pebs)  		return 0; -	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); +	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);  	if (unlikely(!buffer))  		return -ENOMEM; @@ -300,25 +353,27 @@ static int alloc_pebs_buffer(int cpu)  	if (x86_pmu.intel_cap.pebs_format < 2) {  		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);  		if (!ibuffer) { -			kfree(buffer); +			dsfree_pages(buffer, bsiz);  			return -ENOMEM;  		}  		per_cpu(insn_buffer, cpu) = ibuffer;  	} - -	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; - -	ds->pebs_buffer_base = (u64)(unsigned long)buffer; +	hwev->ds_pebs_vaddr = buffer; +	/* Update the cpu entry area mapping */ +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; +	ds->pebs_buffer_base = (unsigned long) cea; +	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);  	ds->pebs_index = ds->pebs_buffer_base; -	ds->pebs_absolute_maximum = ds->pebs_buffer_base + -		max * x86_pmu.pebs_record_size; - +	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); +	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;  	return 0;  }  static void release_pebs_buffer(int cpu)  { -	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); +	struct debug_store *ds = hwev->ds; +	void *cea;  	if (!ds || !x86_pmu.pebs)  		return; @@ -326,73 +381,70 @@ static void release_pebs_buffer(int cpu)  	kfree(per_cpu(insn_buffer, cpu));  	per_cpu(insn_buffer, cpu) = NULL; -	kfree((void *)(unsigned long)ds->pebs_buffer_base); +	/* Clear the fixmap */ +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; +	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);  	ds->pebs_buffer_base = 0; +	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); +	hwev->ds_pebs_vaddr = NULL;  }  static int alloc_bts_buffer(int cpu)  { -	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; -	int node = cpu_to_node(cpu); -	int max, thresh; -	void *buffer; +	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); +	struct debug_store *ds = hwev->ds; +	void *buffer, *cea; +	int max;  	if (!x86_pmu.bts)  		return 0; -	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); +	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);  	if (unlikely(!buffer)) {  		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);  		return -ENOMEM;  	} - -	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; -	thresh = max / 16; - -	ds->bts_buffer_base = (u64)(unsigned long)buffer; +	hwev->ds_bts_vaddr = buffer; +	/* Update the fixmap */ +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; +	ds->bts_buffer_base = (unsigned long) cea; +	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);  	ds->bts_index = ds->bts_buffer_base; -	ds->bts_absolute_maximum = ds->bts_buffer_base + -		max * BTS_RECORD_SIZE; -	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - -		thresh * BTS_RECORD_SIZE; - +	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); +	ds->bts_absolute_maximum = ds->bts_buffer_base + max; +	
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);  	return 0;  }  static void release_bts_buffer(int cpu)  { -	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); +	struct debug_store *ds = hwev->ds; +	void *cea;  	if (!ds || !x86_pmu.bts)  		return; -	kfree((void *)(unsigned long)ds->bts_buffer_base); +	/* Clear the fixmap */ +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; +	ds_clear_cea(cea, BTS_BUFFER_SIZE);  	ds->bts_buffer_base = 0; +	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); +	hwev->ds_bts_vaddr = NULL;  }  static int alloc_ds_buffer(int cpu)  { -	int node = cpu_to_node(cpu); -	struct debug_store *ds; - -	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); -	if (unlikely(!ds)) -		return -ENOMEM; +	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; +	memset(ds, 0, sizeof(*ds));  	per_cpu(cpu_hw_events, cpu).ds = ds; -  	return 0;  }  static void release_ds_buffer(int cpu)  { -	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - -	if (!ds) -		return; -  	per_cpu(cpu_hw_events, cpu).ds = NULL; -	kfree(ds);  }  void release_ds_buffers(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f7aaadf9331f..8e4ea143ed96 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,8 @@  #include <linux/perf_event.h> +#include <asm/intel_ds.h> +  /* To enable MSR tracing please use the generic trace points. */  /* @@ -77,8 +79,6 @@ struct amd_nb {  	struct event_constraint event_constraints[X86_PMC_IDX_MAX];  }; -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS		8  #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)  /* @@ -95,23 +95,6 @@ struct amd_nb {  	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \  	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { -	u64	bts_buffer_base; -	u64	bts_index; -	u64	bts_absolute_maximum; -	u64	bts_interrupt_threshold; -	u64	pebs_buffer_base; -	u64	pebs_index; -	u64	pebs_absolute_maximum; -	u64	pebs_interrupt_threshold; -	u64	pebs_event_reset[MAX_PEBS_EVENTS]; -}; -  #define PEBS_REGS \  	(PERF_REG_X86_AX | \  	 PERF_REG_X86_BX | \ @@ -216,6 +199,8 @@ struct cpu_hw_events {  	 * Intel DebugStore bits  	 */  	struct debug_store	*ds; +	void			*ds_pebs_vaddr; +	void			*ds_bts_vaddr;  	u64			pebs_enabled;  	int			n_pebs;  	int			n_large_pebs; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index dbfd0854651f..cf5961ca8677 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end)  	".popsection\n"							\  	".pushsection .altinstr_replacement, \"ax\"\n"			\  	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\ -	".popsection" +	".popsection\n"  #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\  	OLDINSTR_2(oldinstr, 1, 2)					\ @@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end)  	".pushsection .altinstr_replacement, \"ax\"\n"			\  	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\  	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\ -	".popsection" +	".popsection\n"  /*   * Alternative instructions for different CPU types or capabilities. 
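Aside, for readers following the debug-store hunks above: the ds.c changes keep the per-CPU PEBS/BTS buffers in ordinary kernel memory but additionally alias them, page by page via cea_set_pte(), at a fixed virtual address inside cpu_entry_area, so the same backing pages stay reachable at an address that remains mapped when PTI switches page tables. As a rough user-space analogy only (not kernel code; memfd_create() and the variable names here are illustrative assumptions), the underlying "one buffer, two virtual addresses" idea can be sketched like this:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t size = 4096;			/* one page, like a single cea_set_pte() slot */
		int fd = memfd_create("alias-demo", 0);	/* anonymous backing store */

		if (fd < 0 || ftruncate(fd, size) < 0)
			return 1;

		/* Two independent virtual mappings of the same physical pages. */
		char *primary = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		char *alias   = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (primary == MAP_FAILED || alias == MAP_FAILED)
			return 1;

		strcpy(primary, "written via the primary mapping");
		printf("seen via the alias: %s\n", alias);	/* same bytes, different address */
		return 0;
	}

The kernel case differs in that the alias sits at a fixed per-CPU address and is installed with cea_set_pte() followed by flush_tlb_kernel_range(), but the aliasing relationship between the two addresses is the same.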
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ff700d81e91e..0927cdc4f946 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -11,7 +11,32 @@  #include <asm/pgtable.h>  #include <asm/special_insns.h>  #include <asm/preempt.h> +#include <asm/asm.h>  #ifndef CONFIG_X86_CMPXCHG64  extern void cmpxchg8b_emu(void);  #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..386a6900e206 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,6 +136,7 @@  #endif  #ifndef __ASSEMBLY__ +#ifndef __BPF__  /*   * This output constraint should be used for any inline asm which has a "call"   * instruction.  Otherwise the asm may be inserted before the frame pointer @@ -145,5 +146,6 @@  register unsigned long current_stack_pointer asm(_ASM_SP);  #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)  #endif +#endif  #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h new file mode 100644 index 000000000000..4a7884b8dca5 --- /dev/null +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ASM_X86_CPU_ENTRY_AREA_H +#define _ASM_X86_CPU_ENTRY_AREA_H + +#include <linux/percpu-defs.h> +#include <asm/processor.h> +#include <asm/intel_ds.h> + +/* + * cpu_entry_area is a percpu region that contains things needed by the CPU + * and early entry/exit code.  Real types aren't used for all fields here + * to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { +	char gdt[PAGE_SIZE]; + +	/* +	 * The GDT is just below entry_stack and thus serves (on x86_64) as +	 * a read-only guard page. +	 */ +	struct entry_stack_page entry_stack_page; + +	/* +	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because +	 * we need task switches to work, and task switches write to the TSS. +	 */ +	struct tss_struct tss; + +	char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 +	/* +	 * Exception stacks used for IST entries. +	 * +	 * In the future, this should have a separate slot for each stack +	 * with guard pages between them. +	 */ +	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif +#ifdef CONFIG_CPU_SUP_INTEL +	/* +	 * Per CPU debug store for Intel performance monitoring. Wastes a +	 * full page at the moment. +	 */ +	struct debug_store cpu_debug_store; +	/* +	 * The actual PEBS/BTS buffers must be mapped to user space. +	 * Reserve enough fixmap PTEs. 
+	 */ +	struct debug_store_buffers cpu_debug_buffers; +#endif +}; + +#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS) + +DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + +extern void setup_cpu_entry_areas(void); +extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); + +#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE +#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) + +#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT) + +#define CPU_ENTRY_AREA_MAP_SIZE			\ +	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + +extern struct cpu_entry_area *get_cpu_entry_area(int cpu); + +static inline struct entry_stack *cpu_entry_stack(int cpu) +{ +	return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +} + +#endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bf6a76202a77..ea9a7dde62e5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);  	set_bit(bit, (unsigned long *)cpu_caps_set);	\  } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) +  #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)  /*   * Static testing of CPU features.  Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..f275447862f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -197,11 +197,14 @@  #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */  #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */  #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */  #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */  #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */  #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */ - +#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */ +#define X86_FEATURE_RETPOLINE		( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD	( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */  #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */  #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */  #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */ @@ -266,6 +269,7 @@  /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */  #define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */  #define X86_FEATURE_IRPERF		(13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR		(13*32+ 2) /* Always save/restore FP error pointers */  /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */  #define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */ @@ -339,5 +343,8 @@  #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */  #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */  #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs 
kernel page table isolation */ +#define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */  #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4011cb03ef08..13c5ee878a47 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -7,6 +7,7 @@  #include <asm/mmu.h>  #include <asm/fixmap.h>  #include <asm/irq_vectors.h> +#include <asm/cpu_entry_area.h>  #include <linux/smp.h>  #include <linux/percpu.h> @@ -20,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in  	desc->type		= (info->read_exec_only ^ 1) << 1;  	desc->type	       |= info->contents << 2; +	/* Set the ACCESS bit so it can be mapped RO */ +	desc->type	       |= 1;  	desc->s			= 1;  	desc->dpl		= 0x3; @@ -60,17 +63,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)  	return this_cpu_ptr(&gdt_page)->gdt;  } -/* Get the fixmap index for a specific processor */ -static inline unsigned int get_cpu_gdt_ro_index(int cpu) -{ -	return FIX_GDT_REMAP_BEGIN + cpu; -} -  /* Provide the fixmap address of the remapped GDT */  static inline struct desc_struct *get_cpu_gdt_ro(int cpu)  { -	unsigned int idx = get_cpu_gdt_ro_index(cpu); -	return (struct desc_struct *)__fix_to_virt(idx); +	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;  }  /* Provide the current read-only GDT */ @@ -185,7 +181,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,  #endif  } -static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)  {  	struct desc_struct *d = get_cpu_gdt_rw(cpu);  	tss_desc tss; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 14d6d5007314..b027633e7300 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,12 @@  # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))  #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define DISABLE_PTI		0 +#else +# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31)) +#endif +  /*   * Make sure to add features to the correct mask   */ @@ -60,7 +66,7 @@  #define DISABLED_MASK4	(DISABLE_PCID)  #define DISABLED_MASK5	0  #define DISABLED_MASK6	0 -#define DISABLED_MASK7	0 +#define DISABLED_MASK7	(DISABLE_PTI)  #define DISABLED_MASK8	0  #define DISABLED_MASK9	(DISABLE_MPX)  #define DISABLED_MASK10	0 diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 0211029076ea..6777480d8a42 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -2,7 +2,7 @@  #ifndef _ASM_X86_ESPFIX_H  #define _ASM_X86_ESPFIX_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64  #include <asm/percpu.h> @@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);  extern void init_espfix_bsp(void);  extern void init_espfix_ap(int cpu); - -#endif /* CONFIG_X86_64 */ +#else +static inline void init_espfix_ap(int cpu) { } +#endif  #endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b0c505fe9a95..64c4a30e0d39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP;  			 PAGE_SIZE)  #endif -  /*   * Here we 
define all the compile-time 'special' virtual   * addresses. The point is to have a constant address at @@ -84,7 +83,6 @@ enum fixed_addresses {  	FIX_IO_APIC_BASE_0,  	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,  #endif -	FIX_RO_IDT,	/* Virtual mapping for read-only IDT */  #ifdef CONFIG_X86_32  	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */  	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -100,9 +98,6 @@ enum fixed_addresses {  #ifdef	CONFIG_X86_INTEL_MID  	FIX_LNW_VRTC,  #endif -	/* Fixmap entries to remap the GDTs, one per processor. */ -	FIX_GDT_REMAP_BEGIN, -	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,  #ifdef CONFIG_ACPI_APEI_GHES  	/* Used for GHES mapping from assorted contexts */ @@ -143,7 +138,7 @@ enum fixed_addresses {  extern void reserve_top_address(unsigned long reserve);  #define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START		(FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)  extern int fixmaps_set; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 1b0a5abcd8ae..96aa6b9884dc 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,16 +20,7 @@  #ifndef _ASM_X86_HYPERVISOR_H  #define _ASM_X86_HYPERVISOR_H -#ifdef CONFIG_HYPERVISOR_GUEST - -#include <asm/kvm_para.h> -#include <asm/x86_init.h> -#include <asm/xen/hypervisor.h> - -/* - * x86 hypervisor information - */ - +/* x86 hypervisor types  */  enum x86_hypervisor_type {  	X86_HYPER_NATIVE = 0,  	X86_HYPER_VMWARE, @@ -39,6 +30,12 @@ enum x86_hypervisor_type {  	X86_HYPER_KVM,  }; +#ifdef CONFIG_HYPERVISOR_GUEST + +#include <asm/kvm_para.h> +#include <asm/x86_init.h> +#include <asm/xen/hypervisor.h> +  struct hypervisor_x86 {  	/* Hypervisor name */  	const char	*name; @@ -58,7 +55,15 @@ struct hypervisor_x86 {  extern enum x86_hypervisor_type x86_hyper_type;  extern void init_hypervisor_platform(void); +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ +	return x86_hyper_type == type; +}  #else  static inline void init_hypervisor_platform(void) { } +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ +	return type == X86_HYPER_NATIVE; +}  #endif /* CONFIG_HYPERVISOR_GUEST */  #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h new file mode 100644 index 000000000000..62a9f4966b42 --- /dev/null +++ b/arch/x86/include/asm/intel_ds.h @@ -0,0 +1,36 @@ +#ifndef _ASM_INTEL_DS_H +#define _ASM_INTEL_DS_H + +#include <linux/percpu-defs.h> + +#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4) + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS		8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. 
+ */ +struct debug_store { +	u64	bts_buffer_base; +	u64	bts_index; +	u64	bts_absolute_maximum; +	u64	bts_interrupt_threshold; +	u64	pebs_buffer_base; +	u64	pebs_index; +	u64	pebs_absolute_maximum; +	u64	pebs_interrupt_threshold; +	u64	pebs_event_reset[MAX_PEBS_EVENTS]; +} __aligned(PAGE_SIZE); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + +struct debug_store_buffers { +	char	bts_buffer[BTS_BUFFER_SIZE]; +	char	pebs_buffer[PEBS_BUFFER_SIZE]; +}; + +#endif diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h new file mode 100644 index 000000000000..989cfa86de85 --- /dev/null +++ b/arch/x86/include/asm/invpcid.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVPCID +#define _ASM_X86_INVPCID + +static inline void __invpcid(unsigned long pcid, unsigned long addr, +			     unsigned long type) +{ +	struct { u64 d[2]; } desc = { { pcid, addr } }; + +	/* +	 * The memory clobber is because the whole point is to invalidate +	 * stale TLB entries and, especially if we're flushing global +	 * mappings, we don't want the compiler to reorder any subsequent +	 * memory accesses before the TLB flush. +	 * +	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and +	 * invpcid (%rcx), %rax in long mode. +	 */ +	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" +		      : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR		0 +#define INVPCID_TYPE_SINGLE_CTXT	1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL	2 +#define INVPCID_TYPE_ALL_NON_GLOBAL	3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, +				     unsigned long addr) +{ +	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ +	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ +	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. 
*/ +static inline void invpcid_flush_all_nonglobals(void) +{ +	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + +#endif /* _ASM_X86_INVPCID */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 139feef467f7..c066ffae222b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,  extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,  			      unsigned int nr_irqs);  extern int mp_irqdomain_activate(struct irq_domain *domain, -				 struct irq_data *irq_data, bool early); +				 struct irq_data *irq_data, bool reserve);  extern void mp_irqdomain_deactivate(struct irq_domain *domain,  				    struct irq_data *irq_data);  extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c8ef23f2c28f..89f08955fff7 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)  	swapgs;					\  	sysretl +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x)		pushfq; popq %rax +#endif  #else  #define INTERRUPT_RETURN		iret  #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index f86a8caa561e..395c9631e000 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);  extern int __must_check __die(const char *, struct pt_regs *, long);  extern void show_stack_regs(struct pt_regs *regs);  extern void __show_regs(struct pt_regs *regs, int all); +extern void show_iret_regs(struct pt_regs *regs);  extern unsigned long oops_begin(void);  extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/include/asm/kmemcheck.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 034caa1a084e..b24b1c8b3979 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -214,8 +214,6 @@ struct x86_emulate_ops {  	void (*halt)(struct x86_emulate_ctxt *ctxt);  	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);  	int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); -	void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ -	void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */  	int (*intercept)(struct x86_emulate_ctxt *ctxt,  			 struct x86_instruction_info *info,  			 enum x86_intercept_stage stage); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bfb99770c34..516798431328 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,7 +536,20 @@ struct kvm_vcpu_arch {  	struct kvm_mmu_memory_cache mmu_page_cache;  	struct kvm_mmu_memory_cache mmu_page_header_cache; +	/* +	 * QEMU userspace and the guest each have their own FPU state. +	 * In vcpu_run, we switch between the user and guest FPU contexts. +	 * While running a VCPU, the VCPU thread will have the guest FPU +	 * context. +	 * +	 * Note that while the PKRU state lives inside the fpu registers, +	 * it is switched out separately at VMENTER and VMEXIT time. 
The +	 * "guest_fpu" state here contains the guest FPU context, with the +	 * host PKRU bits. +	 */ +	struct fpu user_fpu;  	struct fpu guest_fpu; +  	u64 xcr0;  	u64 guest_supported_xcr0;  	u32 guest_xstate_size; @@ -1161,7 +1174,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,  static inline int emulate_instruction(struct kvm_vcpu *vcpu,  			int emulation_type)  { -	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +	return x86_emulate_instruction(vcpu, 0, +			emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0);  }  void kvm_enable_efer_bits(u64); @@ -1434,4 +1448,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)  #define put_smstate(type, buf, offset, val)                      \  	*(type *)((buf) + (offset) - 0x7e00) = val +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, +		unsigned long start, unsigned long end); +  #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9ea26f167497..5ff3e8af2c20 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,6 +3,7 @@  #define _ASM_X86_MMU_H  #include <linux/spinlock.h> +#include <linux/rwsem.h>  #include <linux/mutex.h>  #include <linux/atomic.h> @@ -27,7 +28,8 @@ typedef struct {  	atomic64_t tlb_gen;  #ifdef CONFIG_MODIFY_LDT_SYSCALL -	struct ldt_struct *ldt; +	struct rw_semaphore	ldt_usr_sem; +	struct ldt_struct	*ldt;  #endif  #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6d16d15d09a0..c931b88982a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -50,22 +50,53 @@ struct ldt_struct {  	 * call gates.  On native, we could merge the ldt_struct and LDT  	 * allocations, but it's not worth trying to optimize.  	 */ -	struct desc_struct *entries; -	unsigned int nr_entries; +	struct desc_struct	*entries; +	unsigned int		nr_entries; + +	/* +	 * If PTI is in use, then the entries array is not mapped while we're +	 * in user mode.  The whole array will be aliased at the address +	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate +	 * and map, and enable a new LDT without invalidating the mapping +	 * of an older, still-in-use LDT. +	 * +	 * slot will be -1 if this LDT doesn't have an alias mapping. +	 */ +	int			slot;  }; +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ +#ifdef CONFIG_X86_64 +	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +#else +	BUG(); +#endif +} +  /*   * Used for LDT copy/destruction.   
*/ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +static inline void init_new_context_ldt(struct mm_struct *mm) +{ +	mm->context.ldt = NULL; +	init_rwsem(&mm->context.ldt_usr_sem); +} +int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);  void destroy_context_ldt(struct mm_struct *mm); +void ldt_arch_exit_mmap(struct mm_struct *mm);  #else	/* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context_ldt(struct task_struct *tsk, -				       struct mm_struct *mm) +static inline void init_new_context_ldt(struct mm_struct *mm) { } +static inline int ldt_dup_context(struct mm_struct *oldmm, +				  struct mm_struct *mm)  {  	return 0;  } -static inline void destroy_context_ldt(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) { } +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }  #endif  static inline void load_mm_ldt(struct mm_struct *mm) @@ -90,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)  	 * that we can see.  	 */ -	if (unlikely(ldt)) -		set_ldt(ldt->entries, ldt->nr_entries); -	else +	if (unlikely(ldt)) { +		if (static_cpu_has(X86_FEATURE_PTI)) { +			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { +				/* +				 * Whoops -- either the new LDT isn't mapped +				 * (if slot == -1) or is mapped into a bogus +				 * slot (if slot > 1). +				 */ +				clear_LDT(); +				return; +			} + +			/* +			 * If page table isolation is enabled, ldt->entries +			 * will not be mapped in the userspace pagetables. +			 * Tell the CPU to access the LDT through the alias +			 * at ldt_slot_va(ldt->slot). +			 */ +			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); +		} else { +			set_ldt(ldt->entries, ldt->nr_entries); +		} +	} else {  		clear_LDT(); +	}  #else  	clear_LDT();  #endif @@ -132,18 +184,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);  static inline int init_new_context(struct task_struct *tsk,  				   struct mm_struct *mm)  { +	mutex_init(&mm->context.lock); +  	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);  	atomic64_set(&mm->context.tlb_gen, 0); -	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS  	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {  		/* pkey 0 is the default and always allocated */  		mm->context.pkey_allocation_map = 0x1;  		/* -1 means unallocated or invalid */  		mm->context.execute_only_pkey = -1;  	} -	#endif -	return init_new_context_ldt(tsk, mm); +#endif +	init_new_context_ldt(mm); +	return 0;  }  static inline void destroy_context(struct mm_struct *mm)  { @@ -176,15 +231,16 @@ do {						\  } while (0)  #endif -static inline void arch_dup_mmap(struct mm_struct *oldmm, -				 struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)  {  	paravirt_arch_dup_mmap(oldmm, mm); +	return ldt_dup_context(oldmm, mm);  }  static inline void arch_exit_mmap(struct mm_struct *mm)  {  	paravirt_arch_exit_mmap(mm); +	ldt_arch_exit_mmap(mm);  }  #ifdef CONFIG_X86_64 @@ -282,33 +338,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,  }  /* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID - * bits.  This serves two purposes.  It prevents a nasty situation in - * which PCID-unaware code saves CR3, loads some other value (with PCID - * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if - * the saved ASID was nonzero.  
It also means that any bugs involving - * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger - * deterministically. - */ - -static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) -{ -	if (static_cpu_has(X86_FEATURE_PCID)) { -		VM_WARN_ON_ONCE(asid > 4094); -		return __sme_pa(mm->pgd) | (asid + 1); -	} else { -		VM_WARN_ON_ONCE(asid != 0); -		return __sme_pa(mm->pgd); -	} -} - -static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) -{ -	VM_WARN_ON_ONCE(asid > 4094); -	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; -} - -/*   * This can be used from process context to figure out what the value of   * CR3 is without needing to do a (slow) __read_cr3().   * @@ -317,7 +346,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)   */  static inline unsigned long __get_current_cr3_fast(void)  { -	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), +	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,  		this_cpu_read(cpu_tlbstate.loaded_mm_asid));  	/* For now, be very restrictive about when this can be called. */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 5400add2885b..8bf450b13d9f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@  #include <linux/nmi.h>  #include <asm/io.h>  #include <asm/hyperv.h> +#include <asm/nospec-branch.h>  /*   * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)  		return U64_MAX;  	__asm__ __volatile__("mov %4, %%r8\n" -			     "call *%5" +			     CALL_NOSPEC  			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,  			       "+c" (control), "+d" (input_address) -			     :  "r" (output_address), "m" (hv_hypercall_pg) +			     :  "r" (output_address), +				THUNK_TARGET(hv_hypercall_pg)  			     : "cc", "memory", "r8", "r9", "r10", "r11");  #else  	u32 input_address_hi = upper_32_bits(input_address); @@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)  	if (!hv_hypercall_pg)  		return U64_MAX; -	__asm__ __volatile__("call *%7" +	__asm__ __volatile__(CALL_NOSPEC  			     : "=A" (hv_status),  			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT  			     : "A" (control),  			       "b" (input_address_hi),  			       "D"(output_address_hi), "S"(output_address_lo), -			       "m" (hv_hypercall_pg) +			       THUNK_TARGET(hv_hypercall_pg)  			     : "cc", "memory");  #endif /* !x86_64 */  	return hv_status; @@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)  #ifdef CONFIG_X86_64  	{ -		__asm__ __volatile__("call *%4" +		__asm__ __volatile__(CALL_NOSPEC  				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,  				       "+c" (control), "+d" (input1) -				     : "m" (hv_hypercall_pg) +				     : THUNK_TARGET(hv_hypercall_pg)  				     : "cc", "r8", "r9", "r10", "r11");  	}  #else @@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)  		u32 input1_hi = upper_32_bits(input1);  		u32 input1_lo = lower_32_bits(input1); -		__asm__ __volatile__ ("call *%5" +		__asm__ __volatile__ (CALL_NOSPEC  				      : "=A"(hv_status),  					"+c"(input1_lo),  					ASM_CALL_CONSTRAINT  				      :	"A" (control),  					"b" (input1_hi), -					"m" (hv_hypercall_pg) +					THUNK_TARGET(hv_hypercall_pg)  				      : "cc", "edi", "esi");  	}  #endif diff --git 
a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 34c4922bbc3f..e7b983a35506 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -355,6 +355,9 @@  #define FAM10H_MMIO_CONF_BASE_MASK	0xfffffffULL  #define FAM10H_MMIO_CONF_BASE_SHIFT	20  #define MSR_FAM10H_NODE_ID		0xc001100c +#define MSR_F10H_DECFG			0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT	1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE		BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)  /* K8 MSRs */  #define MSR_K8_TOP_MEM1			0xc001001a diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..402a11c803c3 --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include <asm/alternative.h> +#include <asm/alternative-asm.h> +#include <asm/cpufeatures.h> + +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS		16	/* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp)	\ +	mov	$(nr/2), reg;			\ +771:						\ +	call	772f;				\ +773:	/* speculation trap */			\ +	pause;					\ +	jmp	773b;				\ +772:						\ +	call	774f;				\ +775:	/* speculation trap */			\ +	pause;					\ +	jmp	775b;				\ +774:						\ +	dec	reg;				\ +	jnz	771b;				\ +	add	$(BITS_PER_LONG/8) * nr, sp; + +#ifdef __ASSEMBLY__ + +/* + * This should be used immediately before a retpoline alternative.  It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +.macro ANNOTATE_NOSPEC_ALTERNATIVE +	.Lannotate_\@: +	.pushsection .discard.nospec +	.long .Lannotate_\@ - . +	.popsection +.endm + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req +	call	.Ldo_rop_\@ +.Lspec_trap_\@: +	pause +	jmp	.Lspec_trap_\@ +.Ldo_rop_\@: +	mov	\reg, (%_ASM_SP) +	ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. 
+ */ +.macro RETPOLINE_CALL reg:req +	jmp	.Ldo_call_\@ +.Ldo_retpoline_jmp_\@: +	RETPOLINE_JMP \reg +.Ldo_call_\@: +	call	.Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE +	ANNOTATE_NOSPEC_ALTERNATIVE +	ALTERNATIVE_2 __stringify(jmp *\reg),				\ +		__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE,	\ +		__stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else +	jmp	*\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE +	ANNOTATE_NOSPEC_ALTERNATIVE +	ALTERNATIVE_2 __stringify(call *\reg),				\ +		__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ +		__stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else +	call	*\reg +#endif +.endm + + /* +  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP +  * monstrosity above, manually. +  */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE +	ANNOTATE_NOSPEC_ALTERNATIVE +	ALTERNATIVE "jmp .Lskip_rsb_\@",				\ +		__stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))	\ +		\ftr +.Lskip_rsb_\@: +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#define ANNOTATE_NOSPEC_ALTERNATIVE				\ +	"999:\n\t"						\ +	".pushsection .discard.nospec\n\t"			\ +	".long 999b - .\n\t"					\ +	".popsection\n\t" + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC						\ +	ANNOTATE_NOSPEC_ALTERNATIVE				\ +	ALTERNATIVE(						\ +	"call *%[thunk_target]\n",				\ +	"call __x86_indirect_thunk_%V[thunk_target]\n",		\ +	X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n",	\ +	"       jmp    904f;\n"					\ +	"       .align 16\n"					\ +	"901:	call   903f;\n"					\ +	"902:	pause;\n"					\ +	"       jmp    902b;\n"					\ +	"       .align 16\n"					\ +	"903:	addl   $4, %%esp;\n"				\ +	"       pushl  %[thunk_target];\n"			\ +	"       ret;\n"						\ +	"       .align 16\n"					\ +	"904:	call   901b;\n",				\ +	X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline for C / inline asm */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { +	SPECTRE_V2_NONE, +	SPECTRE_V2_RETPOLINE_MINIMAL, +	SPECTRE_V2_RETPOLINE_MINIMAL_AMD, +	SPECTRE_V2_RETPOLINE_GENERIC, +	SPECTRE_V2_RETPOLINE_AMD, +	SPECTRE_V2_IBRS, +}; + +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. 
+ */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE +	unsigned long loops = RSB_CLEAR_LOOPS / 2; + +	asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE +		      ALTERNATIVE("jmp 910f", +				  __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), +				  X86_FEATURE_RETPOLINE) +		      "910:" +		      : "=&r" (loops), ASM_CALL_CONSTRAINT +		      : "r" (loops) : "memory" ); +#endif +} +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 283efcaac8af..892df375b615 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -927,6 +927,15 @@ extern void default_banner(void);  	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\  		  CLBR_NONE,						\  		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(clobbers)                                        \ +	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ +		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \ +		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \ +		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif +  #endif	/* CONFIG_X86_32 */  #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7a5d6695abd3..eb66fa9cd0fc 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -38,6 +38,7 @@ do {						\  #define PCI_NOASSIGN_ROMS	0x80000  #define PCI_ROOT_NO_CRS		0x100000  #define PCI_NOASSIGN_BARS	0x200000 +#define PCI_BIG_ROOT_WINDOW	0x400000  extern unsigned int pci_probe;  extern unsigned long pirq_table_addr; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 4b5e1eafada7..aff42e1da6ee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}   */  extern gfp_t __userpte_alloc_gfp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Instead of one PGD, we acquire two PGDs.  Being order-1, it is + * both 8k in size and 8k-aligned.  That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif +  /*   * Allocate and free page tables.   
*/ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 09f9e1e00e3b..e42b8943cb1a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];  int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);  void ptdump_walk_pgd_level_checkwx(void);  #ifdef CONFIG_DEBUG_WX @@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)  static inline int p4d_bad(p4d_t p4d)  { -	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; + +	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) +		ignore_flags |= _PAGE_NX; + +	return (p4d_flags(p4d) & ~ignore_flags) != 0;  }  #endif  /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)  static inline int pgd_bad(pgd_t pgd)  { -	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +	unsigned long ignore_flags = _PAGE_USER; + +	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) +		ignore_flags |= _PAGE_NX; + +	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;  }  static inline int pgd_none(pgd_t pgd) @@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)   * pgd_offset() returns a (pgd_t *)   * pgd_index() is used get the offset into the pgd page's array of pgd_t's;   */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +/* + * a shortcut to get a pgd_t in a given mm + */ +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))  /*   * a shortcut which implies the use of the kernel's pgd, instead   * of a process's @@ -1061,7 +1076,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,  				  unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write  static inline int pmd_write(pmd_t pmd)  {  	return pmd_flags(pmd) & _PAGE_RW; @@ -1088,6 +1103,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,  	clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);  } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ +	return pud_flags(pud) & _PAGE_RW; +} +  /*   * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);   * @@ -1100,7 +1121,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,   */  static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)  { -       memcpy(dst, src, count * sizeof(pgd_t)); +	memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; +	/* Clone the user space pgd as well */ +	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), +	       count * sizeof(pgd_t)); +#endif  }  #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index f2ca9b28fd68..ce245b0cdfca 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */  #define LAST_PKMAP 1024  #endif -#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))	\ -		    & PMD_MASK) +/* + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c + * to avoid include recursion hell + */ 
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40) + +#define CPU_ENTRY_AREA_BASE				\ +	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) + +#define PKMAP_BASE		\ +	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)  #ifdef CONFIG_HIGHMEM  # define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)  #else -# define VMALLOC_END	(FIXADDR_START - 2 * PAGE_SIZE) +# define VMALLOC_END	(CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)  #endif  #define MODULES_VADDR	VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e9f05331e732..81462e9a34f6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)  #endif  } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and + * the user one is in the last 4k.  To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ +	unsigned long __ptr = (unsigned long)ptr; + +	__ptr |= BIT(bit); +	return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ +	unsigned long __ptr = (unsigned long)ptr; + +	__ptr &= ~BIT(bit); +	return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ +	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ +	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ +	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ +	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Page table pages are page-aligned.  The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ +	unsigned long ptr = (unsigned long)__ptr; + +	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. 
+ */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return pgd; +	return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ +	return pgd; +} +#endif +  static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)  { +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) +	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); +#else  	*p4dp = p4d; +#endif  }  static inline void native_p4d_clear(p4d_t *p4d) @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)  { +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	*pgdp = pti_set_user_pgd(pgdp, pgd); +#else  	*pgdp = pgd; +#endif  }  static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6d5f45dcd4a1..6b8f73dcbc2c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -75,33 +75,52 @@ typedef struct { pteval_t pte; } pte_t;  #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)  #define PGDIR_MASK	(~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#define MAXMEM		_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. + */ +#define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +  #ifdef CONFIG_X86_5LEVEL -#define VMALLOC_SIZE_TB _AC(16384, UL) -#define __VMALLOC_BASE	_AC(0xff92000000000000, UL) -#define __VMEMMAP_BASE	_AC(0xffd4000000000000, UL) +# define VMALLOC_SIZE_TB	_AC(12800, UL) +# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL) +# define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL) +# define LDT_PGD_ENTRY		_AC(-112, UL) +# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)  #else -#define VMALLOC_SIZE_TB	_AC(32, UL) -#define __VMALLOC_BASE	_AC(0xffffc90000000000, UL) -#define __VMEMMAP_BASE	_AC(0xffffea0000000000, UL) +# define VMALLOC_SIZE_TB	_AC(32, UL) +# define __VMALLOC_BASE		_AC(0xffffc90000000000, UL) +# define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL) +# define LDT_PGD_ENTRY		_AC(-3, UL) +# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)  #endif +  #ifdef CONFIG_RANDOMIZE_MEMORY -#define VMALLOC_START	vmalloc_base -#define VMEMMAP_START	vmemmap_base +# define VMALLOC_START		vmalloc_base +# define VMEMMAP_START		vmemmap_base  #else -#define VMALLOC_START	__VMALLOC_BASE -#define VMEMMAP_START	__VMEMMAP_BASE +# define VMALLOC_START		__VMALLOC_BASE +# define VMEMMAP_START		__VMEMMAP_BASE  #endif /* CONFIG_RANDOMIZE_MEMORY */ -#define VMALLOC_END	(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + +#define VMALLOC_END		(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) + +#define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)  /* The module sections ends with the start of the fixmap */ -#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1) -#define MODULES_LEN   (MODULES_END - MODULES_VADDR) -#define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30)) -#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30)) +#define 
MODULES_END		_AC(0xffffffffff000000, UL) +#define MODULES_LEN		(MODULES_END - MODULES_VADDR) + +#define ESPFIX_PGD_ENTRY	_AC(-2, UL) +#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT) + +#define CPU_ENTRY_AREA_PGD	_AC(-4, UL) +#define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT) + +#define EFI_VA_START		( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END		(-68 * (_AC(1, UL) << 30))  #define EARLY_DYNAMIC_PAGE_TABLES	64 diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 43212a43ee69..625a52a5594f 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -38,6 +38,11 @@  #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)  #define CR3_PCID_MASK	0xFFFull  #define CR3_NOFLUSH	BIT_ULL(63) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_PCID_USER_BIT	11 +#endif +  #else  /*   * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cc16fa882e3e..d3a67fba200a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -163,9 +163,9 @@ enum cpuid_regs_idx {  extern struct cpuinfo_x86	boot_cpu_data;  extern struct cpuinfo_x86	new_cpu_data; -extern struct tss_struct	doublefault_tss; -extern __u32			cpu_caps_cleared[NCAPINTS]; -extern __u32			cpu_caps_set[NCAPINTS]; +extern struct x86_hw_tss	doublefault_tss; +extern __u32			cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32			cpu_caps_set[NCAPINTS + NBUGINTS];  #ifdef CONFIG_SMP  DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); @@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)  	write_cr3(__sme_pa(pgdir));  } +/* + * Note that while the legacy 'TSS' name comes from 'Task State Segment', + * on modern x86 CPUs the TSS also holds information important to 64-bit mode, + * unrelated to the task-switch mechanism: + */  #ifdef CONFIG_X86_32  /* This is the TSS defined by the hardware. */  struct x86_hw_tss { @@ -305,7 +310,13 @@ struct x86_hw_tss {  struct x86_hw_tss {  	u32			reserved1;  	u64			sp0; + +	/* +	 * We store cpu_current_top_of_stack in sp1 so it's always accessible. +	 * Linux does not use ring 1, so sp1 is not otherwise needed. +	 */  	u64			sp1; +  	u64			sp2;  	u64			reserved2;  	u64			ist[7]; @@ -323,12 +334,22 @@ struct x86_hw_tss {  #define IO_BITMAP_BITS			65536  #define IO_BITMAP_BYTES			(IO_BITMAP_BITS/8)  #define IO_BITMAP_LONGS			(IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET		offsetof(struct tss_struct, io_bitmap) +#define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))  #define INVALID_IO_BITMAP_OFFSET	0x8000 +struct entry_stack { +	unsigned long		words[64]; +}; + +struct entry_stack_page { +	struct entry_stack stack; +} __aligned(PAGE_SIZE); +  struct tss_struct {  	/* -	 * The hardware state: +	 * The fixed hardware portion.  This must not cross a page boundary +	 * at risk of violating the SDM's advice and potentially triggering +	 * errata.  	 */  	struct x86_hw_tss	x86_tss; @@ -339,18 +360,9 @@ struct tss_struct {  	 * be within the limit.  	 */  	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1]; +} __aligned(PAGE_SIZE); -#ifdef CONFIG_X86_32 -	/* -	 * Space for the temporary SYSENTER stack. 
-	 */ -	unsigned long		SYSENTER_stack_canary; -	unsigned long		SYSENTER_stack[64]; -#endif - -} ____cacheline_aligned; - -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);  /*   * sizeof(unsigned long) coming from an extra "long" at the end @@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);  #ifdef CONFIG_X86_32  DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#else +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1  #endif  /* @@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)  static inline void  native_load_sp0(unsigned long sp0)  { -	this_cpu_write(cpu_tss.x86_tss.sp0, sp0); +	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);  }  static inline void native_swapgs(void) @@ -535,12 +550,12 @@ static inline void native_swapgs(void)  static inline unsigned long current_top_of_stack(void)  { -#ifdef CONFIG_X86_64 -	return this_cpu_read_stable(cpu_tss.x86_tss.sp0); -#else -	/* sp0 on x86_32 is special in and around vm86 mode. */ +	/* +	 *  We can't read directly from tss.sp0: sp0 on x86_32 is special in +	 *  and around vm86 mode and sp0 on x86_64 is special because of the +	 *  entry trampoline. +	 */  	return this_cpu_read_stable(cpu_current_top_of_stack); -#endif  }  static inline bool on_thread_stack(void) @@ -837,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)  #else  /* - * User space process size. 47bits minus one guard page.  The guard - * page is necessary on Intel CPUs: if a SYSCALL instruction is at - * the highest possible canonical userspace address, then that - * syscall will enter the kernel with a non-canonical return - * address, and SYSRET will explode dangerously.  We avoid this - * particular problem by preventing anything from being mapped - * at the maximum canonical address. + * User space process size.  This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen.  This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... 
[stay tuned]   */  #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h new file mode 100644 index 000000000000..0b5ef05b2d2d --- /dev/null +++ b/arch/x86/include/asm/pti.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _ASM_X86_PTI_H +#define _ASM_X86_PTI_H +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern void pti_init(void); +extern void pti_check_boottime_disable(void); +#else +static inline void pti_check_boottime_disable(void) { } +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index b20f9d623f9c..8f09012b92e7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -236,11 +236,23 @@   */  #define EARLY_IDT_HANDLER_SIZE 9 +/* + * xen_early_idt_handler_array is for Xen pv guests: for each entry in + * early_idt_handler_array it contains a prequel in the form of + * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to + * max 8 bytes. + */ +#define XEN_EARLY_IDT_HANDLER_SIZE 8 +  #ifndef __ASSEMBLY__  extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];  extern void early_ignore_irq(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; +#endif +  /*   * Load a segment. Fall back on loading the zero segment if something goes   * wrong.  This variant assumes that loading zero fully clears the segment. diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 8da111b3c342..f73706878772 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,6 +16,7 @@ enum stack_type {  	STACK_TYPE_TASK,  	STACK_TYPE_IRQ,  	STACK_TYPE_SOFTIRQ, +	STACK_TYPE_ENTRY,  	STACK_TYPE_EXCEPTION,  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,  }; @@ -28,6 +29,8 @@ struct stack_info {  bool in_task_stack(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info); +bool in_entry_stack(unsigned long *stack, struct stack_info *info); +  int get_stack_info(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@  /* image of the saved processor state */  struct saved_context { -	u16 es, fs, gs, ss; +	/* +	 * On x86_32, all segment registers, with the possible exception of +	 * gs, are saved at kernel entry in pt_regs. +	 */ +#ifdef CONFIG_X86_32_LAZY_GS +	u16 gs; +#endif  	unsigned long cr0, cr2, cr3, cr4;  	u64 misc_enable;  	bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@   */  struct saved_context {  	struct pt_regs regs; -	u16 ds, es, fs, gs, ss; -	unsigned long gs_base, gs_kernel_base, fs_base; + +	/* +	 * User CS and SS are saved in current_pt_regs().  The rest of the +	 * segment selectors need to be saved and restored here. 
+	 */ +	u16 ds, es, fs, gs; + +	/* +	 * Usermode FSBASE and GSBASE may not match the fs and gs selectors, +	 * so we save them separately.  We save the kernelmode GSBASE to +	 * restore percpu access after resume. +	 */ +	unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; +  	unsigned long cr0, cr2, cr3, cr4, cr8;  	u64 misc_enable;  	bool misc_enable_saved; @@ -30,8 +42,7 @@ struct saved_context {  	u16 gdt_pad; /* Unused */  	struct desc_ptr gdt_desc;  	u16 idt_pad; -	u16 idt_limit; -	unsigned long idt_base; +	struct desc_ptr idt;  	u16 ldt;  	u16 tss;  	unsigned long tr; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9..eb5f7999a893 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,  		      struct tss_struct *tss);  /* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *prev, -				     struct task_struct *next) +static inline void prepare_switch_to(struct task_struct *next)  {  #ifdef CONFIG_VMAP_STACK  	/* @@ -70,7 +69,7 @@ struct fork_frame {  #define switch_to(prev, next, last)					\  do {									\ -	prepare_switch_to(prev, next);					\ +	prepare_switch_to(next);					\  									\  	((last) = __switch_to_asm((prev), (next)));			\  } while (0) @@ -79,10 +78,10 @@ do {									\  static inline void refresh_sysenter_cs(struct thread_struct *thread)  {  	/* Only happens when SEP is enabled, no need to test "SEP"arately: */ -	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) +	if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))  		return; -	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); +	this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);  	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);  }  #endif @@ -90,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)  /* This is used when switching tasks or entering/exiting vm86 mode. */  static inline void update_sp0(struct task_struct *task)  { +	/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */  #ifdef CONFIG_X86_32  	load_sp0(task->thread.sp0);  #else -	load_sp0(task_top_of_stack(task)); +	if (static_cpu_has(X86_FEATURE_XENPV)) +		load_sp0(task_top_of_stack(task));  #endif  } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 70f425947dc5..00223333821a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,  #else /* !__ASSEMBLY__ */  #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)  #endif  #endif diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 509046cfa5ce..d33e4a26dc7e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -9,70 +9,130 @@  #include <asm/cpufeature.h>  #include <asm/special_insns.h>  #include <asm/smp.h> +#include <asm/invpcid.h> +#include <asm/pti.h> +#include <asm/processor-flags.h> -static inline void __invpcid(unsigned long pcid, unsigned long addr, -			     unsigned long type) -{ -	struct { u64 d[2]; } desc = { { pcid, addr } }; +/* + * The x86 feature is called PCID (Process Context IDentifier). 
It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID  - [0, TLB_NR_DYN_ASIDS-1] + *         the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + *         the value we write into the PCID part of CR3; corresponds to the + *         ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + *         for KPTI each mm has two address spaces and thus needs two + *         PCID values, but we can still do with a single ASID denomination + *         for each mm. Corresponds to kPCID + 2048. + * + */ -	/* -	 * The memory clobber is because the whole point is to invalidate -	 * stale TLB entries and, especially if we're flushing global -	 * mappings, we don't want the compiler to reorder any subsequent -	 * memory accesses before the TLB flush. -	 * -	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and -	 * invpcid (%rcx), %rax in long mode. -	 */ -	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" -		      : : "m" (desc), "a" (type), "c" (&desc) : "memory"); -} +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS		12 -#define INVPCID_TYPE_INDIV_ADDR		0 -#define INVPCID_TYPE_SINGLE_CTXT	1 -#define INVPCID_TYPE_ALL_INCL_GLOBAL	2 -#define INVPCID_TYPE_ALL_NON_GLOBAL	3 +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS	1 +#else +# define PTI_CONSUMED_PCID_BITS	0 +#endif -/* Flush all mappings for a given pcid and addr, not including globals. */ -static inline void invpcid_flush_one(unsigned long pcid, -				     unsigned long addr) -{ -	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); -} +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account + * for them being zero-based.  Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) -/* Flush all mappings for a given PCID, not including globals. */ -static inline void invpcid_flush_single_context(unsigned long pcid) +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS	6 + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid)  { -	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	/* +	 * Make sure that the dynamic ASID space does not conflict with the +	 * bit we are using to switch between user and kernel ASIDs. +	 */ +	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + +	/* +	 * The ASID being passed in here should have respected the +	 * MAX_ASID_AVAILABLE and thus never have the switch bit set. +	 */ +	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif +	/* +	 * The dynamically-assigned ASIDs that get passed in are small +	 * (<TLB_NR_DYN_ASIDS).  
They never have the high switch bit set, +	 * so do not bother to clear it. +	 * +	 * If PCID is on, ASID-aware code paths put the ASID+1 into the +	 * PCID bits.  This serves two purposes.  It prevents a nasty +	 * situation in which PCID-unaware code saves CR3, loads some other +	 * value (with PCID == 0), and then restores CR3, thus corrupting +	 * the TLB for ASID 0 if the saved ASID was nonzero.  It also means +	 * that any bugs involving loading a PCID-enabled CR3 with +	 * CR4.PCIDE off will trigger deterministically. +	 */ +	return asid + 1;  } -/* Flush all mappings, including globals, for all PCIDs. */ -static inline void invpcid_flush_all(void) +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid)  { -	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +	u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif +	return ret;  } -/* Flush all mappings for all PCIDs except globals. */ -static inline void invpcid_flush_all_nonglobals(void) +struct pgd_t; +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)  { -	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +	if (static_cpu_has(X86_FEATURE_PCID)) { +		return __sme_pa(pgd) | kern_pcid(asid); +	} else { +		VM_WARN_ON_ONCE(asid != 0); +		return __sme_pa(pgd); +	}  } -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)  { -	u64 new_tlb_gen; - -	/* -	 * Bump the generation count.  This also serves as a full barrier -	 * that synchronizes with switch_mm(): callers are required to order -	 * their read of mm_cpumask after their writes to the paging -	 * structures. -	 */ -	smp_mb__before_atomic(); -	new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); -	smp_mb__after_atomic(); - -	return new_tlb_gen; +	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +	VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); +	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;  }  #ifdef CONFIG_PARAVIRT @@ -99,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)  	return !static_cpu_has(X86_FEATURE_PCID);  } -/* - * 6 because 6 should be plenty and struct tlb_state will fit in - * two cache lines. - */ -#define TLB_NR_DYN_ASIDS 6 -  struct tlb_context {  	u64 ctx_id;  	u64 tlb_gen; @@ -139,6 +193,24 @@ struct tlb_state {  	bool is_lazy;  	/* +	 * If set we changed the page tables in such a way that we +	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs). +	 * This tells us to go invalidate all the non-loaded ctxs[] +	 * on the next context switch. +	 * +	 * The current ctx was kept up-to-date as it ran and does not +	 * need to be invalidated. +	 */ +	bool invalidate_other; + +	/* +	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate +	 * the corresponding user PCID needs a flush next time we +	 * switch to it; see SWITCH_TO_USER_CR3. +	 */ +	unsigned short user_pcid_flush_mask; + +	/*  	 * Access to this CR4 shadow and to H/W CR4 is protected by  	 * disabling interrupts when modifying either one.  	 */ @@ -173,40 +245,43 @@ static inline void cr4_init_shadow(void)  	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());  } +static inline void __cr4_set(unsigned long cr4) +{ +	lockdep_assert_irqs_disabled(); +	this_cpu_write(cpu_tlbstate.cr4, cr4); +	__write_cr4(cr4); +} +  /* Set in this cpu's CR4. 
*/  static inline void cr4_set_bits(unsigned long mask)  { -	unsigned long cr4; +	unsigned long cr4, flags; +	local_irq_save(flags);  	cr4 = this_cpu_read(cpu_tlbstate.cr4); -	if ((cr4 | mask) != cr4) { -		cr4 |= mask; -		this_cpu_write(cpu_tlbstate.cr4, cr4); -		__write_cr4(cr4); -	} +	if ((cr4 | mask) != cr4) +		__cr4_set(cr4 | mask); +	local_irq_restore(flags);  }  /* Clear in this cpu's CR4. */  static inline void cr4_clear_bits(unsigned long mask)  { -	unsigned long cr4; +	unsigned long cr4, flags; +	local_irq_save(flags);  	cr4 = this_cpu_read(cpu_tlbstate.cr4); -	if ((cr4 & ~mask) != cr4) { -		cr4 &= ~mask; -		this_cpu_write(cpu_tlbstate.cr4, cr4); -		__write_cr4(cr4); -	} +	if ((cr4 & ~mask) != cr4) +		__cr4_set(cr4 & ~mask); +	local_irq_restore(flags);  } -static inline void cr4_toggle_bits(unsigned long mask) +static inline void cr4_toggle_bits_irqsoff(unsigned long mask)  {  	unsigned long cr4;  	cr4 = this_cpu_read(cpu_tlbstate.cr4); -	cr4 ^= mask; -	this_cpu_write(cpu_tlbstate.cr4, cr4); -	__write_cr4(cr4); +	__cr4_set(cr4 ^ mask);  }  /* Read the CR4 shadow. */ @@ -216,6 +291,14 @@ static inline unsigned long cr4_read_shadow(void)  }  /* + * Mark all other ASIDs as invalid, preserves the current. + */ +static inline void invalidate_other_asid(void) +{ +	this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/*   * Save some of cr4 feature set we're using (e.g.  Pentium 4MB   * enable and PPro Global page enable), so that any CPU's that boot   * up after us can get the correct flags.  This should only be used @@ -234,37 +317,63 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)  extern void initialize_tlbstate_and_flush(void); -static inline void __native_flush_tlb(void) +/* + * Given an ASID, flush the corresponding user ASID.  We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid)  { +	/* There is no user ASID if address space separation is off */ +	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) +		return; +  	/* -	 * If current->mm == NULL then we borrow a mm which may change during a -	 * task switch and therefore we must not be preempted while we write CR3 -	 * back: +	 * We only have a single ASID if PCID is off and the CR3 +	 * write will have flushed it.  	 */ -	preempt_disable(); -	native_write_cr3(__native_read_cr3()); -	preempt_enable(); +	if (!cpu_feature_enabled(X86_FEATURE_PCID)) +		return; + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	__set_bit(kern_pcid(asid), +		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));  } -static inline void __native_flush_tlb_global_irq_disabled(void) +/* + * flush the entire current user mapping + */ +static inline void __native_flush_tlb(void)  { -	unsigned long cr4; +	/* +	 * Preemption or interrupts must be disabled to protect the access +	 * to the per CPU variable and to prevent being preempted between +	 * read_cr3() and write_cr3(). 
+	 */ +	WARN_ON_ONCE(preemptible()); -	cr4 = this_cpu_read(cpu_tlbstate.cr4); -	/* clear PGE */ -	native_write_cr4(cr4 & ~X86_CR4_PGE); -	/* write old PGE again and flush TLBs */ -	native_write_cr4(cr4); +	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + +	/* If current->mm == NULL then the read_cr3() "borrows" an mm */ +	native_write_cr3(__native_read_cr3());  } +/* + * flush everything + */  static inline void __native_flush_tlb_global(void)  { -	unsigned long flags; +	unsigned long cr4, flags;  	if (static_cpu_has(X86_FEATURE_INVPCID)) {  		/*  		 * Using INVPCID is considerably faster than a pair of writes  		 * to CR4 sandwiched inside an IRQ flag save/restore. +		 * +		 * Note, this works with CR4.PCIDE=0 or 1.  		 */  		invpcid_flush_all();  		return; @@ -277,36 +386,69 @@ static inline void __native_flush_tlb_global(void)  	 */  	raw_local_irq_save(flags); -	__native_flush_tlb_global_irq_disabled(); +	cr4 = this_cpu_read(cpu_tlbstate.cr4); +	/* toggle PGE */ +	native_write_cr4(cr4 ^ X86_CR4_PGE); +	/* write old PGE again and flush TLBs */ +	native_write_cr4(cr4);  	raw_local_irq_restore(flags);  } +/* + * flush one page in the user mapping + */  static inline void __native_flush_tlb_single(unsigned long addr)  { +	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +  	asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	/* +	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. +	 * Just use invalidate_user_asid() in case we are called early. +	 */ +	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) +		invalidate_user_asid(loaded_mm_asid); +	else +		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);  } +/* + * flush everything + */  static inline void __flush_tlb_all(void)  { -	if (boot_cpu_has(X86_FEATURE_PGE)) +	if (boot_cpu_has(X86_FEATURE_PGE)) {  		__flush_tlb_global(); -	else +	} else { +		/* +		 * !PGE -> !PCID (setup_pcid()), thus every flush is total. +		 */  		__flush_tlb(); - -	/* -	 * Note: if we somehow had PCID but not PGE, then this wouldn't work -- -	 * we'd end up flushing kernel translations for the current ASID but -	 * we might fail to flush kernel translations for other cached ASIDs. -	 * -	 * To avoid this issue, we force PCID off if PGE is off. -	 */ +	}  } +/* + * flush one page in the kernel mapping + */  static inline void __flush_tlb_one(unsigned long addr)  {  	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);  	__flush_tlb_single(addr); + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	/* +	 * __flush_tlb_single() will have cleared the TLB entry for this ASID, +	 * but since kernel space is replicated across all, we must also +	 * invalidate all others. +	 */ +	invalidate_other_asid();  }  #define TLB_FLUSH_ALL	-1UL @@ -367,6 +509,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)  void native_flush_tlb_others(const struct cpumask *cpumask,  			     const struct flush_tlb_info *info); +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ +	/* +	 * Bump the generation count.  This also serves as a full barrier +	 * that synchronizes with switch_mm(): callers are required to order +	 * their read of mm_cpumask after their writes to the paging +	 * structures. 
+	 */ +	return atomic64_inc_return(&mm->context.tlb_gen); +} +  static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,  					struct mm_struct *mm)  { diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 84b9ec0c1bc0..22647a642e98 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed,  DECLARE_EVENT_CLASS(vector_activate,  	TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, -		 bool early), +		 bool reserve), -	TP_ARGS(irq, is_managed, can_reserve, early), +	TP_ARGS(irq, is_managed, can_reserve, reserve),  	TP_STRUCT__entry(  		__field(	unsigned int,	irq		)  		__field(	bool,		is_managed	)  		__field(	bool,		can_reserve	) -		__field(	bool,		early		) +		__field(	bool,		reserve		)  	),  	TP_fast_assign(  		__entry->irq		= irq;  		__entry->is_managed	= is_managed;  		__entry->can_reserve	= can_reserve; -		__entry->early		= early; +		__entry->reserve	= reserve;  	), -	TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d", +	TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",  		  __entry->irq, __entry->is_managed, __entry->can_reserve, -		  __entry->early) +		  __entry->reserve)  );  #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name)				\  DEFINE_EVENT_FN(vector_activate, name,					\  	TP_PROTO(unsigned int irq, bool is_managed,			\ -		 bool can_reserve, bool early),				\ -	TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL);	\ +		 bool can_reserve, bool reserve),			\ +	TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL);	\  DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);  DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1fadd310ff68..31051f35cbb7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);  dotraplinkage void do_stack_segment(struct pt_regs *, long);  #ifdef CONFIG_X86_64  dotraplinkage void do_double_fault(struct pt_regs *, long); -asmlinkage struct pt_regs *sync_regs(struct pt_regs *);  #endif  dotraplinkage void do_general_protection(struct pt_regs *, long);  dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9cc6fe1fc6f..1f86e1b0a5cd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -7,6 +7,9 @@  #include <asm/ptrace.h>  #include <asm/stacktrace.h> +#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) +#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) +  struct unwind_state {  	struct stack_info stack_info;  	unsigned long stack_mask; @@ -52,15 +55,28 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,  }  #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +/* + * If 'partial' returns true, only the iret frame registers are valid. 
+ */ +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, +						    bool *partial)  {  	if (unwind_done(state))  		return NULL; +	if (partial) { +#ifdef CONFIG_UNWINDER_ORC +		*partial = !state->full_regs; +#else +		*partial = false; +#endif +	} +  	return state->regs;  }  #else -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, +						    bool *partial)  {  	return NULL;  } diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d9a7c659009c..b986b2ca688a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -7,6 +7,7 @@  #ifdef CONFIG_X86_VSYSCALL_EMULATION  extern void map_vsyscall(void); +extern void set_vsyscall_pgtable_user_bits(pgd_t *root);  /*   * Called on instruction fetch fault in vsyscall page. diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7cb282e9e587..bfd882617613 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@  #include <asm/page.h>  #include <asm/pgtable.h>  #include <asm/smap.h> +#include <asm/nospec-branch.h>  #include <xen/interface/xen.h>  #include <xen/interface/sched.h> @@ -217,9 +218,9 @@ privcmd_call(unsigned call,  	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);  	stac(); -	asm volatile("call *%[call]" +	asm volatile(CALL_NOSPEC  		     : __HYPERCALL_5PARAM -		     : [call] "a" (&hypercall_page[call]) +		     : [thunk_target] "a" (&hypercall_page[call])  		     : __HYPERCALL_CLOBBER5);  	clac(); diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index da1489cb64dc..1e901e421f2d 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@  # UAPI Header export list  include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h  generated-y += unistd_32.h  generated-y += unistd_64.h  generated-y += unistd_x32.h diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 7e1e730396ae..bcba3c643e63 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -78,7 +78,12 @@  #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)  #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */  #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS	12 +#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)  /*   * Intel CPU features in CR4 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index dbaf14d69ebd..4817d743c263 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -344,9 +344,12 @@ done:  static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)  {  	unsigned long flags; +	int i; -	if (instr[0] != 0x90) -		return; +	for (i = 0; i < a->padlen; i++) { +		if (instr[i] != 0x90) +			return; +	}  	local_irq_save(flags);  	add_nops(instr + (a->instrlen - a->padlen), a->padlen); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e272f3ea984..880441f24146 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2626,11 +2626,13 @@ static int __init 
apic_set_verbosity(char *arg)  		apic_verbosity = APIC_DEBUG;  	else if (strcmp("verbose", arg) == 0)  		apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64  	else {  		pr_warning("APIC Verbosity level %s not recognised"  			" use apic=verbose or apic=debug\n", arg);  		return -EINVAL;  	} +#endif  	return 0;  } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690e9b64..25a87028cb3f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {  	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= flat_apic_id_registered, -	.irq_delivery_mode		= dest_LowestPrio, +	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 1, /* logical */  	.disable_esr			= 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4480c9..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {  	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= noop_apic_id_registered, -	.irq_delivery_mode		= dest_LowestPrio, +	.irq_delivery_mode		= dest_Fixed,  	/* logical delivery broadcast to all CPUs: */  	.irq_dest_mode			= 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 201579dc5242..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,  }  int mp_irqdomain_activate(struct irq_domain *domain, -			  struct irq_data *irq_data, bool early) +			  struct irq_data *irq_data, bool reserve)  {  	unsigned long flags; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)  		((apic->irq_dest_mode == 0) ?  			MSI_ADDR_DEST_MODE_PHYSICAL :  			MSI_ADDR_DEST_MODE_LOGICAL) | -		((apic->irq_delivery_mode != dest_LowestPrio) ? -			MSI_ADDR_REDIRECTION_CPU : -			MSI_ADDR_REDIRECTION_LOWPRI) | +		MSI_ADDR_REDIRECTION_CPU |  		MSI_ADDR_DEST_ID(cfg->dest_apicid);  	msg->data =  		MSI_DATA_TRIGGER_EDGE |  		MSI_DATA_LEVEL_ASSERT | -		((apic->irq_delivery_mode != dest_LowestPrio) ? 
-			MSI_DATA_DELIVERY_FIXED : -			MSI_DATA_DELIVERY_LOWPRI) | +		MSI_DATA_DELIVERY_FIXED |  		MSI_DATA_VECTOR(cfg->vector);  } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017de806..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {  	.apic_id_valid			= default_apic_id_valid,  	.apic_id_registered		= default_apic_id_registered, -	.irq_delivery_mode		= dest_LowestPrio, +	.irq_delivery_mode		= dest_Fixed,  	/* logical delivery broadcast to all CPUs: */  	.irq_dest_mode			= 1, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6a823a25eaff..f8b03bb8e725 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)  	irq_matrix_reserve(vector_matrix);  	apicd->can_reserve = true;  	apicd->has_reserved = true; +	irqd_set_can_reserve(irqd);  	trace_vector_reserve(irqd->irq, 0);  	vector_assign_managed_shutdown(irqd);  } @@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)  	int ret;  	ret = assign_irq_vector_any_locked(irqd); -	if (!ret) +	if (!ret) {  		apicd->has_reserved = false; +		/* +		 * Core might have disabled reservation mode after +		 * allocating the irq descriptor. Ideally this should +		 * happen before allocation time, but that would require +		 * completely convoluted ways of transporting that +		 * information. +		 */ +		if (!irqd_can_reserve(irqd)) +			apicd->can_reserve = false; +	}  	return ret;  } @@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd)  }  static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, -			       bool early) +			       bool reserve)  {  	struct apic_chip_data *apicd = apic_chip_data(irqd);  	unsigned long flags;  	int ret = 0;  	trace_vector_activate(irqd->irq, apicd->is_managed, -			      apicd->can_reserve, early); +			      apicd->can_reserve, reserve);  	/* Nothing to do for fixed assigned vectors */  	if (!apicd->can_reserve && !apicd->is_managed)  		return 0;  	raw_spin_lock_irqsave(&vector_lock, flags); -	if (early || irqd_is_managed_and_shutdown(irqd)) +	if (reserve || irqd_is_managed_and_shutdown(irqd))  		vector_assign_managed_shutdown(irqd);  	else if (apicd->is_managed)  		ret = activate_managed(irqd); @@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,  	} else {  		/* Release the vector */  		apicd->can_reserve = true; +		irqd_set_can_reserve(irqd);  		clear_irq_vector(irqd);  		realloc = true;  	} @@ -542,8 +554,8 @@ error:  }  #ifdef CONFIG_GENERIC_IRQ_DEBUGFS -void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, -			   struct irq_data *irqd, int ind) +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, +				  struct irq_data *irqd, int ind)  {  	unsigned int cpu, vector, prev_cpu, prev_vector;  	struct apic_chip_data *apicd; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 622f13ca8a94..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {  	.apic_id_valid			= x2apic_apic_id_valid,  	.apic_id_registered		= x2apic_apic_id_registered, -	.irq_delivery_mode		= dest_LowestPrio, +	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 
1, /* logical */  	.disable_esr			= 0, diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480d..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@  #include <asm/sigframe.h>  #include <asm/bootparam.h>  #include <asm/suspend.h> +#include <asm/tlbflush.h>  #ifdef CONFIG_XEN  #include <xen/interface/xen.h> @@ -93,4 +94,13 @@ void common(void) {  	BLANK();  	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + +	/* TLB state for the entry code */ +	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + +	/* Layout info for cpu_entry_area */ +	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); +	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); +	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); +	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));  } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dedf428b20b6..fa1261eefa16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,13 +47,8 @@ void foo(void)  	BLANK();  	/* Offset from the sysenter stack to tss.sp0 */ -	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - -	       offsetofend(struct tss_struct, SYSENTER_stack)); - -	/* Offset from cpu_tss to SYSENTER_stack */ -	OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); -	/* Size of SYSENTER_stack */ -	DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - +	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));  #ifdef CONFIG_CC_STACKPROTECTOR  	BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9d..bf51e51d808d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void)  #ifdef CONFIG_PARAVIRT  	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);  	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY +	OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif  	BLANK();  #endif @@ -63,6 +66,7 @@ int main(void)  	OFFSET(TSS_ist, tss_struct, x86_tss.ist);  	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); +	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);  	BLANK();  #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..ea831c858195 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c)  	case 0x17: init_amd_zn(c); break;  	} -	/* Enable workaround for FXSAVE leak */ -	if (c->x86 >= 6) +	/* +	 * Enable workaround for FXSAVE leak on CPUs +	 * without a XSaveErPtr feature +	 */ +	if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))  		set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);  	cpu_detect_cache_sizes(c); @@ -826,8 +829,32 @@ static void init_amd(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_K8);  	if (cpu_has(c, X86_FEATURE_XMM2)) { -		/* MFENCE stops RDTSC speculation */ -		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); +		unsigned long long val; +		int ret; + +		/* +		 * A serializing LFENCE has less overhead than MFENCE, so +		 * use it for execution serialization.  On families which +		 * don't have that MSR, LFENCE is already serializing. +		 * msr_set_bit() uses the safe accessors, too, even if the MSR +		 * is not present. 
+		 */ +		msr_set_bit(MSR_F10H_DECFG, +			    MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + +		/* +		 * Verify that the MSR write was successful (could be running +		 * under a hypervisor) and only then assume that LFENCE is +		 * serializing. +		 */ +		ret = rdmsrl_safe(MSR_F10H_DECFG, &val); +		if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { +			/* A serializing LFENCE stops RDTSC speculation */ +			set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +		} else { +			/* MFENCE stops RDTSC speculation */ +			set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); +		}  	}  	/* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..e4dc26185aa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,10 @@   */  #include <linux/init.h>  #include <linux/utsname.h> +#include <linux/cpu.h> + +#include <asm/nospec-branch.h> +#include <asm/cmdline.h>  #include <asm/bugs.h>  #include <asm/processor.h>  #include <asm/processor-flags.h> @@ -20,6 +24,8 @@  #include <asm/pgtable.h>  #include <asm/set_memory.h> +static void __init spectre_v2_select_mitigation(void); +  void __init check_bugs(void)  {  	identify_boot_cpu(); @@ -29,6 +35,9 @@ void __init check_bugs(void)  		print_cpu_info(&boot_cpu_data);  	} +	/* Select the proper spectre mitigation before patching alternatives */ +	spectre_v2_select_mitigation(); +  #ifdef CONFIG_X86_32  	/*  	 * Check whether we are able to run this kernel safely on SMP. @@ -60,3 +69,179 @@ void __init check_bugs(void)  		set_memory_4k((unsigned long)__va(0), 1);  #endif  } + +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { +	SPECTRE_V2_CMD_NONE, +	SPECTRE_V2_CMD_AUTO, +	SPECTRE_V2_CMD_FORCE, +	SPECTRE_V2_CMD_RETPOLINE, +	SPECTRE_V2_CMD_RETPOLINE_GENERIC, +	SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { +	[SPECTRE_V2_NONE]			= "Vulnerable", +	[SPECTRE_V2_RETPOLINE_MINIMAL]		= "Vulnerable: Minimal generic ASM retpoline", +	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline", +	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline", +	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt)     "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ +	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ +	return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ +	int len = strlen(opt); + +	return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ +	char arg[20]; +	int ret; + +	ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, +				  sizeof(arg)); +	if (ret > 0)  { +		if (match_option(arg, ret, "off")) { +			goto disable; +		} else if (match_option(arg, ret, "on")) { +			spec2_print_if_secure("force enabled on command line."); +			return SPECTRE_V2_CMD_FORCE; +		} else if (match_option(arg, ret, "retpoline")) { +			spec2_print_if_insecure("retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE; +		} else if (match_option(arg, ret, "retpoline,amd")) { +			if (boot_cpu_data.x86_vendor != 
X86_VENDOR_AMD) { +				pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); +				return SPECTRE_V2_CMD_AUTO; +			} +			spec2_print_if_insecure("AMD retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE_AMD; +		} else if (match_option(arg, ret, "retpoline,generic")) { +			spec2_print_if_insecure("generic retpoline selected on command line."); +			return SPECTRE_V2_CMD_RETPOLINE_GENERIC; +		} else if (match_option(arg, ret, "auto")) { +			return SPECTRE_V2_CMD_AUTO; +		} +	} + +	if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) +		return SPECTRE_V2_CMD_AUTO; +disable: +	spec2_print_if_insecure("disabled on command line."); +	return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ +	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); +	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + +	/* +	 * If the CPU is not affected and the command line mode is NONE or AUTO +	 * then nothing to do. +	 */ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && +	    (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) +		return; + +	switch (cmd) { +	case SPECTRE_V2_CMD_NONE: +		return; + +	case SPECTRE_V2_CMD_FORCE: +		/* FALLTHRU */ +	case SPECTRE_V2_CMD_AUTO: +		goto retpoline_auto; + +	case SPECTRE_V2_CMD_RETPOLINE_AMD: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_amd; +		break; +	case SPECTRE_V2_CMD_RETPOLINE_GENERIC: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_generic; +		break; +	case SPECTRE_V2_CMD_RETPOLINE: +		if (IS_ENABLED(CONFIG_RETPOLINE)) +			goto retpoline_auto; +		break; +	} +	pr_err("kernel not compiled with retpoline; no mitigation available!"); +	return; + +retpoline_auto: +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { +	retpoline_amd: +		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { +			pr_err("LFENCE not serializing. Switching to generic retpoline\n"); +			goto retpoline_generic; +		} +		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : +					 SPECTRE_V2_RETPOLINE_MINIMAL_AMD; +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +	} else { +	retpoline_generic: +		mode = retp_compiler() ? 
SPECTRE_V2_RETPOLINE_GENERIC : +					 SPECTRE_V2_RETPOLINE_MINIMAL; +		setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +	} + +	spectre_v2_enabled = mode; +	pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, +			  struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		return sprintf(buf, "Not affected\n"); +	if (boot_cpu_has(X86_FEATURE_PTI)) +		return sprintf(buf, "Mitigation: PTI\n"); +	return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, +			    struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) +		return sprintf(buf, "Not affected\n"); +	return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, +			    struct device_attribute *attr, char *buf) +{ +	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +		return sprintf(buf, "Not affected\n"); + +	return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); +} +#endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fa998ca8aa5a..ef29ad001991 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)  	return NULL;		/* Not found */  } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS];  void load_percpu_segment(int cpu)  { @@ -490,28 +490,23 @@ void load_percpu_segment(int cpu)  	load_stack_canary_segment();  } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) -{ -#ifdef CONFIG_X86_64 -	/* On 64-bit systems, we use a read-only fixmap GDT. */ -	pgprot_t prot = PAGE_KERNEL_RO; -#else -	/* -	 * On native 32-bit systems, the GDT cannot be read-only because -	 * our double fault handler uses a task gate, and entering through -	 * a task gate needs to change an available TSS to busy.  If the GDT -	 * is read-only, that will triple fault. -	 * -	 * On Xen PV, the GDT must be read-only because the hypervisor requires -	 * it. -	 */ -	pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? -		PAGE_KERNEL_RO : PAGE_KERNEL; +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);  #endif -	__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); -} +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { +	  [0 ... 
N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ, +	  [DEBUG_STACK - 1]			= DEBUG_STKSZ +}; +#endif  /* Load the original GDT from the per-cpu structure */  void load_direct_gdt(int cpu) @@ -747,7 +742,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)  {  	int i; -	for (i = 0; i < NCAPINTS; i++) { +	for (i = 0; i < NCAPINTS + NBUGINTS; i++) {  		c->x86_capability[i] &= ~cpu_caps_cleared[i];  		c->x86_capability[i] |= cpu_caps_set[i];  	} @@ -927,6 +922,13 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	}  	setup_force_cpu_cap(X86_FEATURE_ALWAYS); + +	if (c->x86_vendor != X86_VENDOR_AMD) +		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + +	setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +	setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +  	fpu__init_system(c);  #ifdef CONFIG_X86_32 @@ -1250,7 +1252,7 @@ void enable_sep_cpu(void)  		return;  	cpu = get_cpu(); -	tss = &per_cpu(cpu_tss, cpu); +	tss = &per_cpu(cpu_tss_rw, cpu);  	/*  	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1259,11 +1261,7 @@ void enable_sep_cpu(void)  	tss->x86_tss.ss1 = __KERNEL_CS;  	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - -	wrmsr(MSR_IA32_SYSENTER_ESP, -	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), -	      0); - +	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);  	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);  	put_cpu(); @@ -1357,25 +1355,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;  DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;  EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { -	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ, -	  [DEBUG_STACK - 1]			= DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -  /* May not be marked __init: used by software suspend */  void syscall_init(void)  { +	extern char _entry_trampoline[]; +	extern char entry_SYSCALL_64_trampoline[]; + +	int cpu = smp_processor_id(); +	unsigned long SYSCALL64_entry_trampoline = +		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + +		(entry_SYSCALL_64_trampoline - _entry_trampoline); +  	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); -	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); +	if (static_cpu_has(X86_FEATURE_PTI)) +		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); +	else +		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);  #ifdef CONFIG_IA32_EMULATION  	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1386,7 +1381,7 @@ void syscall_init(void)  	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).  	 
*/  	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); +	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));  	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);  #else  	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1530,7 +1525,7 @@ void cpu_init(void)  	if (cpu)  		load_ucode_ap(); -	t = &per_cpu(cpu_tss, cpu); +	t = &per_cpu(cpu_tss_rw, cpu);  	oist = &per_cpu(orig_ist, cpu);  #ifdef CONFIG_NUMA @@ -1569,7 +1564,7 @@ void cpu_init(void)  	 * set up and load the per-CPU TSS  	 */  	if (!oist->ist[0]) { -		char *estacks = per_cpu(exception_stacks, cpu); +		char *estacks = get_cpu_entry_area(cpu)->exception_stacks;  		for (v = 0; v < N_EXCEPTION_STACKS; v++) {  			estacks += exception_stack_sizes[v]; @@ -1580,7 +1575,7 @@ void cpu_init(void)  		}  	} -	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;  	/*  	 * <= is required because the CPU will access up to @@ -1596,11 +1591,12 @@ void cpu_init(void)  	enter_lazy_tlb(&init_mm, me);  	/* -	 * Initialize the TSS.  Don't bother initializing sp0, as the initial -	 * task never enters user mode. +	 * Initialize the TSS.  sp0 points to the entry trampoline stack +	 * regardless of what task is running.  	 */ -	set_tss_desc(cpu, t); +	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);  	load_TR_desc(); +	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));  	load_mm_ldt(&init_mm); @@ -1612,7 +1608,6 @@ void cpu_init(void)  	if (is_uv_system())  		uv_cpu_init(); -	setup_fixmap_gdt(cpu);  	load_fixmap_gdt(cpu);  } @@ -1622,7 +1617,7 @@ void cpu_init(void)  {  	int cpu = smp_processor_id();  	struct task_struct *curr = current; -	struct tss_struct *t = &per_cpu(cpu_tss, cpu); +	struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);  	wait_for_master_cpu(cpu); @@ -1657,12 +1652,12 @@ void cpu_init(void)  	 * Initialize the TSS.  Don't bother initializing sp0, as the initial  	 * task never enters user mode.  	 */ -	set_tss_desc(cpu, t); +	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);  	load_TR_desc();  	load_mm_ldt(&init_mm); -	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +	t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;  #ifdef CONFIG_DOUBLEFAULT  	/* Set up doublefault TSS pointer in the GDT */ @@ -1674,7 +1669,6 @@ void cpu_init(void)  	fpu__init_cpu(); -	setup_fixmap_gdt(cpu);  	load_fixmap_gdt(cpu);  }  #endif diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c6daec4bdba5..330b8462d426 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,  #define F14H_MPB_MAX_SIZE 1824  #define F15H_MPB_MAX_SIZE 4096  #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200  	switch (family) {  	case 0x14: @@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,  	case 0x16:  		max_size = F16H_MPB_MAX_SIZE;  		break; +	case 0x17: +		max_size = F17H_MPB_MAX_SIZE; +		break;  	default:  		max_size = F1XH_MPB_MAX_SIZE;  		break; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7dbcb7adf797..d9e460fc7a3b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)  }  #else -/* - * Flush global tlb. 
We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ -	__native_flush_tlb_global_irq_disabled(); -} -  static inline void print_ucode(struct ucode_cpu_info *uci)  {  	struct microcode_intel *mc; @@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)  	if (rev != mc->hdr.rev)  		return -1; -#ifdef CONFIG_X86_64 -	/* Flush global tlb. This is precaution. */ -	flush_tlb_early(); -#endif  	uci->cpu_sig.rev = rev;  	if (early) @@ -923,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { -		pr_err_once("late loading on model 79 is disabled.\n"); +	/* +	 * Late loading on model 79 with microcode revision less than 0x0b000021 +	 * may result in a system hang. This behavior is documented in item +	 * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). +	 */ +	if (c->x86 == 6 && +	    c->x86_model == INTEL_FAM6_BROADWELL_X && +	    c->x86_mask == 0x01 && +	    c->microcode < 0x0b000021) { +		pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); +		pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");  		return true;  	} diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void)  		cpu_relax();  } -struct tss_struct doublefault_tss __cacheline_aligned = { -	.x86_tss = { -		.sp0		= STACK_START, -		.ss0		= __KERNEL_DS, -		.ldt		= 0, -		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET, - -		.ip		= (unsigned long) doublefault_fn, -		/* 0x2 bit is always set */ -		.flags		= X86_EFLAGS_SF | 0x2, -		.sp		= STACK_START, -		.es		= __USER_DS, -		.cs		= __KERNEL_CS, -		.ss		= __KERNEL_DS, -		.ds		= __USER_DS, -		.fs		= __KERNEL_PERCPU, - -		.__cr3		= __pa_nodebug(swapper_pg_dir), -	} +struct x86_hw_tss doublefault_tss __cacheline_aligned = { +	.sp0		= STACK_START, +	.ss0		= __KERNEL_DS, +	.ldt		= 0, +	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET, + +	.ip		= (unsigned long) doublefault_fn, +	/* 0x2 bit is always set */ +	.flags		= X86_EFLAGS_SF | 0x2, +	.sp		= STACK_START, +	.es		= __USER_DS, +	.cs		= __KERNEL_CS, +	.ss		= __KERNEL_DS, +	.ds		= __USER_DS, +	.fs		= __KERNEL_PERCPU, + +	.__cr3		= __pa_nodebug(swapper_pg_dir),  };  /* dummy for do_double_fault() call */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..afbecff161d1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@  #include <linux/nmi.h>  #include <linux/sysfs.h> +#include <asm/cpu_entry_area.h>  #include <asm/stacktrace.h>  #include <asm/unwind.h> @@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,  	return true;  } +bool in_entry_stack(unsigned long *stack, struct stack_info *info) +{ +	struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + +	void *begin = ss; +	void *end = ss + 1; + +	if ((void *)stack < begin || (void *)stack >= end) +		return false; + +	info->type	= STACK_TYPE_ENTRY; +	info->begin	= begin; +	info->end	= end; +	info->next_sp	= NULL; + +	return true; +} +  static void printk_stack_address(unsigned long address, int reliable,  				 char *log_lvl)  { @@ -50,6 +69,39 @@ 
static void printk_stack_address(unsigned long address, int reliable,  	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);  } +void show_iret_regs(struct pt_regs *regs) +{ +	printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); +	printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, +		regs->sp, regs->flags); +} + +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, +				  bool partial) +{ +	/* +	 * These on_stack() checks aren't strictly necessary: the unwind code +	 * has already validated the 'regs' pointer.  The checks are done for +	 * ordering reasons: if the registers are on the next stack, we don't +	 * want to print them out yet.  Otherwise they'll be shown as part of +	 * the wrong stack.  Later, when show_trace_log_lvl() switches to the +	 * next stack, this function will be called again with the same regs so +	 * they can be printed in the right context. +	 */ +	if (!partial && on_stack(info, regs, sizeof(*regs))) { +		__show_regs(regs, 0); + +	} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, +				       IRET_FRAME_SIZE)) { +		/* +		 * When an interrupt or exception occurs in entry code, the +		 * full pt_regs might not have been saved yet.  In that case +		 * just print the iret frame. +		 */ +		show_iret_regs(regs); +	} +} +  void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  			unsigned long *stack, char *log_lvl)  { @@ -57,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  	struct stack_info stack_info = {0};  	unsigned long visit_mask = 0;  	int graph_idx = 0; +	bool partial;  	printk("%sCall Trace:\n", log_lvl);  	unwind_start(&state, task, regs, stack);  	stack = stack ? : get_stack_pointer(task, regs); +	regs = unwind_get_entry_regs(&state, &partial);  	/*  	 * Iterate through the stacks, starting with the current stack pointer. @@ -71,31 +125,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  	 * - task stack  	 * - interrupt stack  	 * - HW exception stacks (double fault, nmi, debug, mce) +	 * - entry stack  	 * -	 * x86-32 can have up to three stacks: +	 * x86-32 can have up to four stacks:  	 * - task stack  	 * - softirq stack  	 * - hardirq stack +	 * - entry stack  	 */ -	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { +	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {  		const char *stack_name; -		/* -		 * If we overflowed the task stack into a guard page, jump back -		 * to the bottom of the usable stack. -		 */ -		if (task_stack_page(task) - (void *)stack < PAGE_SIZE) -			stack = task_stack_page(task); - -		if (get_stack_info(stack, task, &stack_info, &visit_mask)) -			break; +		if (get_stack_info(stack, task, &stack_info, &visit_mask)) { +			/* +			 * We weren't on a valid stack.  It's possible that +			 * we overflowed a valid stack into a guard page. +			 * See if the next page up is valid so that we can +			 * generate some kind of backtrace if this happens. 
+			 */ +			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); +			if (get_stack_info(stack, task, &stack_info, &visit_mask)) +				break; +		}  		stack_name = stack_type_name(stack_info.type);  		if (stack_name)  			printk("%s <%s>\n", log_lvl, stack_name); -		if (regs && on_stack(&stack_info, regs, sizeof(*regs))) -			__show_regs(regs, 0); +		if (regs) +			show_regs_if_on_stack(&stack_info, regs, partial);  		/*  		 * Scan the stack, printing any text addresses we find.  At the @@ -119,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  			/*  			 * Don't print regs->ip again if it was already printed -			 * by __show_regs() below. +			 * by show_regs_if_on_stack().  			 */  			if (regs && stack == ®s->ip)  				goto next; @@ -154,9 +212,9 @@ next:  			unwind_next_frame(&state);  			/* if the frame has entry regs, print them */ -			regs = unwind_get_entry_regs(&state); -			if (regs && on_stack(&stack_info, regs, sizeof(*regs))) -				__show_regs(regs, 0); +			regs = unwind_get_entry_regs(&state, &partial); +			if (regs) +				show_regs_if_on_stack(&stack_info, regs, partial);  		}  		if (stack_name) @@ -252,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err)  	unsigned long sp;  #endif  	printk(KERN_DEFAULT -	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, +	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,  	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",  	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",  	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "", -	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : ""); +	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "", +	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? +	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");  	if (notify_die(DIE_OOPS, str, regs, err,  			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)  	if (type == STACK_TYPE_SOFTIRQ)  		return "SOFTIRQ"; +	if (type == STACK_TYPE_ENTRY) +		return "ENTRY_TRAMPOLINE"; +  	return NULL;  } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,  	if (task != current)  		goto unknown; +	if (in_entry_stack(stack, info)) +		goto recursion_check; +  	if (in_hardirq_stack(stack, info))  		goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)  	if (type == STACK_TYPE_IRQ)  		return "IRQ"; +	if (type == STACK_TYPE_ENTRY) { +		/* +		 * On 64-bit, we have a generic entry stack that we +		 * use for all the kernel entry points, including +		 * SYSENTER. 
+		 */ +		return "ENTRY_TRAMPOLINE"; +	} +  	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)  		return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,  	if (in_irq_stack(stack, info))  		goto recursion_check; +	if (in_entry_stack(stack, info)) +		goto recursion_check; +  	goto unknown;  recursion_check: diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@  #include <asm/segment.h>  #include <asm/export.h>  #include <asm/ftrace.h> +#include <asm/nospec-branch.h>  #ifdef CC_USING_FENTRY  # define function_hook	__fentry__ @@ -197,7 +198,8 @@ ftrace_stub:  	movl	0x4(%ebp), %edx  	subl	$MCOUNT_INSN_SIZE, %eax -	call	*ftrace_trace_function +	movl	ftrace_trace_function, %ecx +	CALL_NOSPEC %ecx  	popl	%edx  	popl	%ecx @@ -241,5 +243,5 @@ return_to_handler:  	movl	%eax, %ecx  	popl	%edx  	popl	%eax -	jmp	*%ecx +	JMP_NOSPEC %ecx  #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..7cb8ba08beb9 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,7 @@  #include <asm/ptrace.h>  #include <asm/ftrace.h>  #include <asm/export.h> - +#include <asm/nospec-branch.h>  	.code64  	.section .entry.text, "ax" @@ -286,8 +286,8 @@ trace:  	 * ip and parent ip are used and the list function is called when  	 * function tracing is enabled.  	 */ -	call   *ftrace_trace_function - +	movq ftrace_trace_function, %r8 +	CALL_NOSPEC %r8  	restore_mcount_regs  	jmp fgraph_trace @@ -329,5 +329,5 @@ GLOBAL(return_to_handler)  	movq 8(%rsp), %rdx  	movq (%rsp), %rax  	addq $24, %rsp -	jmp *%rdi +	JMP_NOSPEC %rdi  #endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)  	.balign	PAGE_SIZE; \  GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned.  We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. 
+ * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL	512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ +	.balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL	0 +#endif +  /* Automate the creation of 1 to 1 mapping pmd entries */  #define PMDS(START, PERM, COUNT)			\  	i = 0 ;						\ @@ -350,13 +371,14 @@ GLOBAL(name)  	.endr  	__INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt)  	.fill	511,8,0  #ifdef CONFIG_X86_5LEVEL  	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC  #else  	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC  #endif +	.fill	PTI_USER_PGD_FILL,8,0  NEXT_PAGE(early_dynamic_pgts)  	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)  	.data  #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt)  	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC  	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0  	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC  	.org    init_top_pgt + PGD_START_KERNEL*8, 0  	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */  	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +	.fill	PTI_USER_PGD_FILL,8,0  NEXT_PAGE(level3_ident_pgt)  	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)  	 */  	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)  #else -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt)  	.fill	512,8,0 +	.fill	PTI_USER_PGD_FILL,8,0  #endif  #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)  	 * because the ->io_bitmap_max value must match the bitmap  	 * contents:  	 */ -	tss = &per_cpu(cpu_tss, get_cpu()); +	tss = &per_cpu(cpu_tss_rw, get_cpu());  	if (turn_on)  		bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 49cfd9fe7589..68e1867cca80 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)  	/* high bit used in ret_from_ code  */  	unsigned vector = ~regs->orig_ax; -	/* -	 * NB: Unlike exception entries, IRQ entries do not reliably -	 * handle context tracking in the low-level entry code.  This is -	 * because syscall entries execute briefly with IRQs on before -	 * updating context tracking state, so we can take an IRQ from -	 * kernel mode with CONTEXT_USER.  The low-level entry code only -	 * updates the context if we came from user mode, so we won't -	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall -	 * code is cleaned up enough that we can cleanly defer enabling -	 * IRQs. -	 */ -  	entering_irq();  	/* entering_irq() tells RCU that we're not quiescent.  Check it. 
*/ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@  #include <linux/mm.h>  #include <asm/apic.h> +#include <asm/nospec-branch.h>  #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);  static void call_on_stack(void *func, void *stack)  {  	asm volatile("xchgl	%%ebx,%%esp	\n" -		     "call	*%%edi		\n" +		     CALL_NOSPEC  		     "movl	%%ebx,%%esp	\n"  		     : "=b" (stack)  		     : "0" (stack), -		       "D"(func) +		       [thunk_target] "D"(func)  		     : "memory", "cc", "edx", "ecx", "eax");  } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)  		call_on_stack(print_stack_overflow, isp);  	asm volatile("xchgl	%%ebx,%%esp	\n" -		     "call	*%%edi		\n" +		     CALL_NOSPEC  		     "movl	%%ebx,%%esp	\n"  		     : "=a" (arg1), "=b" (isp)  		     :  "0" (desc),   "1" (isp), -			"D" (desc->handle_irq) +			[thunk_target] "D" (desc->handle_irq)  		     : "memory", "cc", "ecx");  	return 1;  } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)  	if (regs->sp >= estack_top && regs->sp <= estack_bottom)  		return; -	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", +	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",  		current->comm, curbase, regs->sp,  		irq_stack_top, irq_stack_bottom, -		estack_top, estack_bottom); +		estack_top, estack_bottom, (void *)regs->ip);  	if (sysctl_panic_on_stackoverflow)  		panic("low stack detected by irq handler - check messages\n"); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1c1eae961340..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -5,6 +5,11 @@   * Copyright (C) 2002 Andi Kleen   *   * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + *	contex.ldt_usr_sem + *	  mmap_sem + *	    context.lock   */  #include <linux/errno.h> @@ -19,6 +24,7 @@  #include <linux/uaccess.h>  #include <asm/ldt.h> +#include <asm/tlb.h>  #include <asm/desc.h>  #include <asm/mmu_context.h>  #include <asm/syscalls.h> @@ -42,17 +48,15 @@ static void refresh_ldt_segments(void)  #endif  } -/* context.lock is held for us, so we don't need any locking. */ +/* context.lock is held by the task which issued the smp function call */  static void flush_ldt(void *__mm)  {  	struct mm_struct *mm = __mm; -	mm_context_t *pc;  	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)  		return; -	pc = &mm->context; -	set_ldt(pc->ldt->entries, pc->ldt->nr_entries); +	load_mm_ldt(mm);  	refresh_ldt_segments();  } @@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)  		return NULL;  	} +	/* The new LDT isn't aliased for PTI yet. */ +	new_ldt->slot = -1; +  	new_ldt->nr_entries = num_entries;  	return new_ldt;  } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function.  Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. 
+ * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	bool is_vmalloc, had_top_level_entry; +	unsigned long va; +	spinlock_t *ptl; +	pgd_t *pgd; +	int i; + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return 0; + +	/* +	 * Any given ldt_struct should have map_ldt_struct() called at most +	 * once. +	 */ +	WARN_ON(ldt->slot != -1); + +	/* +	 * Did we already have the top level entry allocated?  We can't +	 * use pgd_none() for this because it doens't do anything on +	 * 4-level page table kernels. +	 */ +	pgd = pgd_offset(mm, LDT_BASE_ADDR); +	had_top_level_entry = (pgd->pgd != 0); + +	is_vmalloc = is_vmalloc_addr(ldt->entries); + +	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { +		unsigned long offset = i << PAGE_SHIFT; +		const void *src = (char *)ldt->entries + offset; +		unsigned long pfn; +		pte_t pte, *ptep; + +		va = (unsigned long)ldt_slot_va(slot) + offset; +		pfn = is_vmalloc ? vmalloc_to_pfn(src) : +			page_to_pfn(virt_to_page(src)); +		/* +		 * Treat the PTI LDT range as a *userspace* range. +		 * get_locked_pte() will allocate all needed pagetables +		 * and account for them in this mm. +		 */ +		ptep = get_locked_pte(mm, va, &ptl); +		if (!ptep) +			return -ENOMEM; +		/* +		 * Map it RO so the easy to find address is not a primary +		 * target via some kernel interface which misses a +		 * permission check. +		 */ +		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); +		set_pte_at(mm, va, ptep, pte); +		pte_unmap_unlock(ptep, ptl); +	} + +	if (mm->context.ldt) { +		/* +		 * We already had an LDT.  The top-level entry should already +		 * have been allocated and synchronized with the usermode +		 * tables. +		 */ +		WARN_ON(!had_top_level_entry); +		if (static_cpu_has(X86_FEATURE_PTI)) +			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); +	} else { +		/* +		 * This is the first time we're mapping an LDT for this process. +		 * Sync the pgd to the usermode tables. +		 */ +		WARN_ON(had_top_level_entry); +		if (static_cpu_has(X86_FEATURE_PTI)) { +			WARN_ON(kernel_to_user_pgdp(pgd)->pgd); +			set_pgd(kernel_to_user_pgdp(pgd), *pgd); +		} +	} + +	va = (unsigned long)ldt_slot_va(slot); +	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + +	ldt->slot = slot; +#endif +	return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	struct mmu_gather tlb; +	unsigned long start = LDT_BASE_ADDR; +	unsigned long end = start + (1UL << PGDIR_SHIFT); + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	tlb_gather_mmu(&tlb, mm, start, end); +	free_pgd_range(&tlb, start, end, start, end); +	tlb_finish_mmu(&tlb, start, end); +#endif +} +  /* After calling this, the LDT is immutable. */  static void finalize_ldt_struct(struct ldt_struct *ldt)  {  	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);  } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, -			struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)  { +	mutex_lock(&mm->context.lock); +  	/* Synchronizes with READ_ONCE in load_mm_ldt. */ -	smp_store_release(¤t_mm->context.ldt, ldt); +	smp_store_release(&mm->context.ldt, ldt); -	/* Activate the LDT for all CPUs using current_mm. 
*/ -	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); +	/* Activate the LDT for all CPUs using currents mm. */ +	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + +	mutex_unlock(&mm->context.lock);  }  static void free_ldt_struct(struct ldt_struct *ldt) @@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)  }  /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed.   */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)  {  	struct ldt_struct *new_ldt; -	struct mm_struct *old_mm;  	int retval = 0; -	mutex_init(&mm->context.lock); -	old_mm = current->mm; -	if (!old_mm) { -		mm->context.ldt = NULL; +	if (!old_mm)  		return 0; -	}  	mutex_lock(&old_mm->context.lock); -	if (!old_mm->context.ldt) { -		mm->context.ldt = NULL; +	if (!old_mm->context.ldt)  		goto out_unlock; -	}  	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);  	if (!new_ldt) { @@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)  	       new_ldt->nr_entries * LDT_ENTRY_SIZE);  	finalize_ldt_struct(new_ldt); +	retval = map_ldt_struct(mm, new_ldt, 0); +	if (retval) { +		free_ldt_pgtables(mm); +		free_ldt_struct(new_ldt); +		goto out_unlock; +	}  	mm->context.ldt = new_ldt;  out_unlock: @@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm)  	mm->context.ldt = NULL;  } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ +	free_ldt_pgtables(mm); +} +  static int read_ldt(void __user *ptr, unsigned long bytecount)  {  	struct mm_struct *mm = current->mm;  	unsigned long entries_size;  	int retval; -	mutex_lock(&mm->context.lock); +	down_read(&mm->context.ldt_usr_sem);  	if (!mm->context.ldt) {  		retval = 0; @@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)  	retval = bytecount;  out_unlock: -	mutex_unlock(&mm->context.lock); +	up_read(&mm->context.ldt_usr_sem);  	return retval;  } @@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)  			ldt.avl = 0;  	} -	mutex_lock(&mm->context.lock); +	if (down_write_killable(&mm->context.ldt_usr_sem)) +		return -EINTR;  	old_ldt       = mm->context.ldt;  	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; @@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)  	new_ldt->entries[ldt_info.entry_number] = ldt;  	finalize_ldt_struct(new_ldt); +	/* +	 * If we are using PTI, map the new LDT into the userspace pagetables. +	 * If there is already an LDT, use the other slot so that other CPUs +	 * will continue to use the old LDT until install_ldt() switches +	 * them over to the new LDT. +	 */ +	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); +	if (error) { +		/* +		 * This only can fail for the first LDT setup. If an LDT is +		 * already installed then the PTE page is already +		 * populated. Mop up a half populated page table. 
+		 */ +		if (!WARN_ON_ONCE(old_ldt)) +			free_ldt_pgtables(mm); +		free_ldt_struct(new_ldt); +		goto out_unlock; +	} +  	install_ldt(mm, new_ldt);  	free_ldt_struct(old_ldt);  	error = 0;  out_unlock: -	mutex_unlock(&mm->context.lock); +	up_write(&mm->context.ldt_usr_sem);  out:  	return error;  } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void)  		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"  		"\tmovl %%eax,%%ds\n"  		"\tmovl %%eax,%%es\n" -		"\tmovl %%eax,%%fs\n" -		"\tmovl %%eax,%%gs\n"  		"\tmovl %%eax,%%ss\n"  		: : : "eax", "memory");  #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)  	 * The gdt & idt are now invalid.  	 * If you want to load them you must set up your own idt & gdt.  	 */ -	set_gdt(phys_to_virt(0), 0);  	idt_invalidate(phys_to_virt(0)); +	set_gdt(phys_to_virt(0), 0);  	/* now call it */  	image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");  DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");  DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");  DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");  DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");  DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,  		PATCH_SITE(pv_mmu_ops, read_cr2);  		PATCH_SITE(pv_mmu_ops, read_cr3);  		PATCH_SITE(pv_mmu_ops, write_cr3); -		PATCH_SITE(pv_mmu_ops, flush_tlb_single);  		PATCH_SITE(pv_cpu_ops, wbinvd);  #if defined(CONFIG_PARAVIRT_SPINLOCKS)  		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 97fb3e5737f5..832a6acd730f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@   * section. Since TSS's are completely CPU-local, we want them   * on exact cacheline boundaries, to eliminate cacheline ping-pong.   */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {  	.x86_tss = {  		/*  		 * .sp0 is only used when entering ring 0 from a lower @@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {  		 * Poison it.  		 */  		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 +		/* +		 * .sp1 is cpu_current_top_of_stack.  The init task never +		 * runs user code, but cpu_current_top_of_stack should still +		 * be well defined before the first context switch. +		 */ +		.sp1 = TOP_OF_INIT_STACK, +#endif +  #ifdef CONFIG_X86_32  		.ss0 = __KERNEL_DS,  		.ss1 = __KERNEL_CS, @@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {  	  */  	.io_bitmap		= { [0 ... 
IO_BITMAP_LONGS] = ~0 },  #endif -#ifdef CONFIG_X86_32 -	.SYSENTER_stack_canary	= STACK_END_MAGIC, -#endif  }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);  DEFINE_PER_CPU(bool, __tss_limit_invalid);  EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)  	struct fpu *fpu = &t->fpu;  	if (bp) { -		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); +		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());  		t->io_bitmap_ptr = NULL;  		clear_thread_flag(TIF_IO_BITMAP); @@ -299,7 +306,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,  	}  	if ((tifp ^ tifn) & _TIF_NOTSC) -		cr4_toggle_bits(X86_CR4_TSD); +		cr4_toggle_bits_irqsoff(X86_CR4_TSD);  	if ((tifp ^ tifn) & _TIF_NOCPUID)  		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	struct fpu *prev_fpu = &prev->fpu;  	struct fpu *next_fpu = &next->fpu;  	int cpu = smp_processor_id(); -	struct tss_struct *tss = &per_cpu(cpu_tss, cpu); +	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);  	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c25..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)  	unsigned int fsindex, gsindex;  	unsigned int ds, cs, es; -	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); -	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, -		regs->sp, regs->flags); +	show_iret_regs(regs); +  	if (regs->orig_ax != -1)  		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);  	else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)  	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",  	       regs->r13, regs->r14, regs->r15); +	if (!all) +		return; +  	asm("movl %%ds,%0" : "=r" (ds));  	asm("movl %%cs,%0" : "=r" (cs));  	asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)  	rdmsrl(MSR_GS_BASE, gs);  	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); -	if (!all) -		return; -  	cr0 = read_cr0();  	cr2 = read_cr2();  	cr3 = __read_cr3(); @@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	struct fpu *prev_fpu = &prev->fpu;  	struct fpu *next_fpu = &next->fpu;  	int cpu = smp_processor_id(); -	struct tss_struct *tss = &per_cpu(cpu_tss, cpu); +	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);  	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&  		     this_cpu_read(irq_count) != -1); @@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 * Switch the PDA and FPU contexts.  	 */  	this_cpu_write(current_task, next_p); +	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));  	/* Reload sp0. 
*/  	update_sp0(next_p); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8af2e8d0c0a1..145810b0edf6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -906,9 +906,6 @@ void __init setup_arch(char **cmdline_p)  		set_bit(EFI_BOOT, &efi.flags);  		set_bit(EFI_64BIT, &efi.flags);  	} - -	if (efi_enabled(EFI_BOOT)) -		efi_memblock_x86_reserve_range();  #endif  	x86_init.oem.arch_setup(); @@ -962,6 +959,8 @@ void __init setup_arch(char **cmdline_p)  	parse_early_param(); +	if (efi_enabled(EFI_BOOT)) +		efi_memblock_x86_reserve_range();  #ifdef CONFIG_MEMORY_HOTPLUG  	/*  	 * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3d01df7d7cf6..ed556d50d7ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages);  static unsigned int logical_packages __read_mostly;  /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1;  /* Flag to indicate if a complete sched domain rebuild is required */  bool x86_topology_update; @@ -126,14 +126,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)  	spin_lock_irqsave(&rtc_lock, flags);  	CMOS_WRITE(0xa, 0xf);  	spin_unlock_irqrestore(&rtc_lock, flags); -	local_flush_tlb(); -	pr_debug("1.\n");  	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =  							start_eip >> 4; -	pr_debug("2.\n");  	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =  							start_eip & 0xf; -	pr_debug("3.\n");  }  static inline void smpboot_restore_warm_reset_vector(void) @@ -141,11 +137,6 @@ static inline void smpboot_restore_warm_reset_vector(void)  	unsigned long flags;  	/* -	 * Install writable page 0 entry to set BIOS data area. -	 */ -	local_flush_tlb(); - -	/*  	 * Paranoid:  Set warm reset code and vector here back  	 * to default values.  	 */ @@ -237,7 +228,7 @@ static void notrace start_secondary(void *unused)  	load_cr3(swapper_pg_dir);  	__flush_tlb_all();  #endif - +	load_current_idt();  	cpu_init();  	x86_cpuinit.early_percpu_clock_init();  	preempt_disable(); @@ -932,12 +923,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,  	initial_code = (unsigned long)start_secondary;  	initial_stack  = idle->thread.sp; -	/* -	 * Enable the espfix hack for this CPU -	*/ -#ifdef CONFIG_X86_ESPFIX64 +	/* Enable the espfix hack for this CPU */  	init_espfix_ap(cpu); -#endif  	/* So we see what's up */  	announce_cpu(cpu, apicid); @@ -1304,7 +1291,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)  	 * Today neither Intel nor AMD support heterogenous systems so  	 * extrapolate the boot cpu's data to all packages.  	 
*/ -	ncpus = cpu_data(0).booted_cores * smp_num_siblings; +	ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();  	__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);  	pr_info("Max logical packages: %u\n", __max_logical_packages); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 77835bc021c7..093f2ea5dd56 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace,  	for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);  	     unwind_next_frame(&state)) { -		regs = unwind_get_entry_regs(&state); +		regs = unwind_get_entry_regs(&state, NULL);  		if (regs) {  			/*  			 * Kernel mode registers on the stack indicate an @@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,  {  	int ret; +	/* +	 * If the task doesn't have a stack (e.g., a zombie), the stack is +	 * "reliably" empty. +	 */  	if (!try_get_task_stack(tsk)) -		return -EINVAL; +		return 0;  	ret = __save_stack_trace_reliable(trace, tsk); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,  		return -1;  	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));  	pte_unmap(pte); + +	/* +	 * PTI poisons low addresses in the kernel page tables in the +	 * name of making them unusable for userspace.  To execute +	 * code at such a low address, the poison must be cleared. +	 * +	 * Note: 'pgd' actually gets set in p4d_alloc() _or_ +	 * pud_alloc() depending on 4/5-level paging. +	 */ +	pgd->pgd &= ~_PAGE_NX; +  	return 0;  } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,  	cpu = get_cpu();  	while (n-- > 0) { -		if (LDT_empty(info) || LDT_zero(info)) { +		if (LDT_empty(info) || LDT_zero(info))  			memset(desc, 0, sizeof(*desc)); -		} else { +		else  			fill_ldt(desc, info); - -			/* -			 * Always set the accessed bit so that the CPU -			 * doesn't try to write to the (read-only) GDT. -			 */ -			desc->type |= 1; -		}  		++info;  		++desc;  	} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 989514c94a55..446c9ef8cfc3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@  #include <asm/traps.h>  #include <asm/desc.h>  #include <asm/fpu/internal.h> +#include <asm/cpu_entry_area.h>  #include <asm/mce.h>  #include <asm/fixmap.h>  #include <asm/mach_traps.h> @@ -348,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)  	/*  	 * If IRET takes a non-IST fault on the espfix64 stack, then we -	 * end up promoting it to a doublefault.  In that case, modify -	 * the stack to make it look like we just entered the #GP -	 * handler from user space, similar to bad_iret. +	 * end up promoting it to a doublefault.  In that case, take +	 * advantage of the fact that we're not using the normal (TSS.sp0) +	 * stack right now.  We can write a fake #GP(0) frame at TSS.sp0 +	 * and then modify our own IRET frame so that, when we return, +	 * we land directly at the #GP(0) vector with the stack already +	 * set up according to its expectations. 
+	 * +	 * The net result is that our #GP handler will think that we +	 * entered from usermode with the bad user context.  	 *  	 * No need for ist_enter here because we don't use RCU.  	 */ -	if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && +	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&  		regs->cs == __KERNEL_CS &&  		regs->ip == (unsigned long)native_irq_return_iret)  	{ -		struct pt_regs *normal_regs = task_pt_regs(current); +		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; -		/* Fake a #GP(0) from userspace. */ -		memmove(&normal_regs->ip, (void *)regs->sp, 5*8); -		normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */ +		/* +		 * regs->sp points to the failing IRET frame on the +		 * ESPFIX64 stack.  Copy it to the entry stack.  This fills +		 * in gpregs->ss through gpregs->ip. +		 * +		 */ +		memmove(&gpregs->ip, (void *)regs->sp, 5*8); +		gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */ + +		/* +		 * Adjust our frame so that we return straight to the #GP +		 * vector with the expected RSP value.  This is safe because +		 * we won't enable interupts or schedule before we invoke +		 * general_protection, so nothing will clobber the stack +		 * frame we just set up. +		 */  		regs->ip = (unsigned long)general_protection; -		regs->sp = (unsigned long)&normal_regs->orig_ax; +		regs->sp = (unsigned long)&gpregs->orig_ax;  		return;  	} @@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)  	 *  	 *   Processors update CR2 whenever a page fault is detected. If a  	 *   second page fault occurs while an earlier page fault is being -	 *   deliv- ered, the faulting linear address of the second fault will +	 *   delivered, the faulting linear address of the second fault will  	 *   overwrite the contents of CR2 (replacing the previous  	 *   address). These updates to CR2 occur even if the page fault  	 *   results in a double fault or occurs during the delivery of a @@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3);  #ifdef CONFIG_X86_64  /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S   */  asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)  { -	struct pt_regs *regs = task_pt_regs(current); -	*regs = *eregs; +	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; +	if (regs != eregs) +		*regs = *eregs;  	return regs;  }  NOKPROBE_SYMBOL(sync_regs); @@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)  	/*  	 * This is called from entry_64.S early in handling a fault  	 * caused by a bad iret to user mode.  To handle the fault -	 * correctly, we want move our stack frame to task_pt_regs -	 * and we want to pretend that the exception came from the -	 * iret target. +	 * correctly, we want to move our stack frame to where it would +	 * be had we entered directly on the entry stack (rather than +	 * just below the IRET frame) and we want to pretend that the +	 * exception came from the IRET target.  	 
*/  	struct bad_iret_stack *new_stack = -		container_of(task_pt_regs(current), -			     struct bad_iret_stack, regs); +		(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;  	/* Copy the IRET target to the new stack. */  	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); @@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)  	debug_stack_usage_dec();  exit: -#if defined(CONFIG_X86_32) -	/* -	 * This is the most likely code path that involves non-trivial use -	 * of the SYSENTER stack.  Check that we haven't overrun it. -	 */ -	WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, -	     "Overran or corrupted SYSENTER stack\n"); -#endif  	ist_exit(regs);  }  NOKPROBE_SYMBOL(do_debug); @@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)  void __init trap_init(void)  { +	/* Init cpu_entry_area before IST entries are set up */ +	setup_cpu_entry_areas(); +  	idt_setup_traps();  	/* @@ -936,8 +952,9 @@ void __init trap_init(void)  	 * "sidt" instruction will not leak the location of the kernel, and  	 * to defend the IDT against arbitrary memory write vulnerabilities.  	 * It will be reloaded in cpu_init() */ -	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); -	idt_descr.address = fix_to_virt(FIX_RO_IDT); +	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), +		    PAGE_KERNEL_RO); +	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;  	/*  	 * Should be a barrier for any external CPU state: diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a..be86a865087a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)  	return NULL;  } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,  			    size_t len)  {  	struct stack_info *info = &state->stack_info; +	void *addr = (void *)_addr; -	/* -	 * If the address isn't on the current stack, switch to the next one. -	 * -	 * We may have to traverse multiple stacks to deal with the possibility -	 * that info->next_sp could point to an empty stack and the address -	 * could be on a subsequent stack. -	 */ -	while (!on_stack(info, (void *)addr, len)) -		if (get_stack_info(info->next_sp, state->task, info, -				   &state->stack_mask)) -			return false; +	if (!on_stack(info, addr, len) && +	    (get_stack_info(addr, state->task, info, &state->stack_mask))) +		return false;  	return true;  } @@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,  	return true;  } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) -  static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, -			     unsigned long *ip, unsigned long *sp, bool full) +			     unsigned long *ip, unsigned long *sp)  { -	size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; -	size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; -	struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - -	if (IS_ENABLED(CONFIG_X86_64)) { -		if (!stack_access_ok(state, addr, regs_size)) -			return false; +	struct pt_regs *regs = (struct pt_regs *)addr; -		*ip = regs->ip; -		*sp = regs->sp; +	/* x86-32 support will be more complicated due to the ®s->sp hack */ +	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); -		return true; -	} - -	if (!stack_access_ok(state, addr, sp_offset)) +	if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))  		return false;  	*ip = regs->ip; +	*sp = regs->sp; +	return true; +} -	if (user_mode(regs)) { -		if (!stack_access_ok(state, addr + sp_offset, -				     REGS_SIZE - SP_OFFSET)) -			return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, +				  unsigned long *ip, unsigned long *sp) +{ +	struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; -		*sp = regs->sp; -	} else -		*sp = (unsigned long)®s->sp; +	if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) +		return false; +	*ip = regs->ip; +	*sp = regs->sp;  	return true;  } @@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)  	unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;  	enum stack_type prev_type = state->stack_info.type;  	struct orc_entry *orc; -	struct pt_regs *ptregs;  	bool indirect = false;  	if (unwind_done(state)) @@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)  		break;  	case ORC_TYPE_REGS: -		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { +		if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {  			orc_warn("can't dereference registers at %p for ip %pB\n",  				 (void *)sp, (void *)orig_ip);  			goto done; @@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)  		break;  	case ORC_TYPE_REGS_IRET: -		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { +		if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {  			orc_warn("can't dereference iret registers at %p for ip %pB\n",  				 (void *)sp, (void *)orig_ip);  			goto done;  		} -		ptregs = container_of((void *)sp, struct pt_regs, ip); -		if ((unsigned long)ptregs >= prev_sp && -		    on_stack(&state->stack_info, ptregs, REGS_SIZE)) { -			state->regs = ptregs; -			state->full_regs = false; -		} else -			state->regs = NULL; - +		state->regs = (void *)sp - IRET_FRAME_OFFSET; +		state->full_regs = false;  		state->signal = true;  		break; @@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,  	}  	if (get_stack_info((unsigned long *)state->sp, state->task, -			   &state->stack_info, &state->stack_mask)) -		return; +			   &state->stack_info, &state->stack_mask)) { +		/* +		 * We weren't on a valid stack.  It's possible that +		 * we overflowed a valid stack into a guard page. +		 * See if the next page up is valid so that we can +		 * generate some kind of backtrace if this happens. +		 */ +		void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); +		if (get_stack_info(next_page, state->task, &state->stack_info, +				   &state->stack_mask)) +			return; +	}  	/*  	 * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies;  		. 
= ALIGN(HPAGE_SIZE);				\  		__end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE); +  #else  #define X64_ALIGN_RODATA_BEGIN  #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END +  #endif  PHDRS { @@ -102,11 +108,22 @@ SECTIONS  		CPUIDLE_TEXT  		LOCK_TEXT  		KPROBES_TEXT +		ALIGN_ENTRY_TEXT_BEGIN  		ENTRY_TEXT  		IRQENTRY_TEXT +		ALIGN_ENTRY_TEXT_END  		SOFTIRQENTRY_TEXT  		*(.fixup)  		*(.gnu.warning) + +#ifdef CONFIG_X86_64 +		. = ALIGN(PAGE_SIZE); +		_entry_trampoline = .; +		*(.entry_trampoline) +		. = ALIGN(PAGE_SIZE); +		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif +  		/* End of text section */  		_etext = .;  	} :text = 0x9090 diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index cdc70a3a6583..c2cea6651279 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -44,7 +44,7 @@ static const struct cpuid_reg reverse_cpuid[] = {  	[CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},  	[CPUID_1_ECX]         = {         1, 0, CPUID_ECX},  	[CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, -	[CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, +	[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},  	[CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},  	[CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},  	[CPUID_F_0_EDX]       = {       0xf, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8079d141792a..b514b2b2845a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1046,7 +1046,6 @@ static void fetch_register_operand(struct operand *op)  static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;  	case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1068,13 +1067,11 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)  #endif  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,  			  int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;  	case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1096,12 +1093,10 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,  #endif  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;  	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1113,12 +1108,10 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  	case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;  	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1130,7 +1123,6 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  	case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1138,9 +1130,7 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & 
(X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	ctxt->ops->get_fpu(ctxt);  	asm volatile("fninit"); -	ctxt->ops->put_fpu(ctxt);  	return X86EMUL_CONTINUE;  } @@ -1151,9 +1141,7 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	ctxt->ops->get_fpu(ctxt);  	asm volatile("fnstcw %0": "+m"(fcw)); -	ctxt->ops->put_fpu(ctxt);  	ctxt->dst.val = fcw; @@ -1167,9 +1155,7 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	ctxt->ops->get_fpu(ctxt);  	asm volatile("fnstsw %0": "+m"(fsw)); -	ctxt->ops->put_fpu(ctxt);  	ctxt->dst.val = fsw; @@ -2404,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)  }  static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, -				     u64 cr0, u64 cr4) +				    u64 cr0, u64 cr3, u64 cr4)  {  	int bad; +	u64 pcid; + +	/* In order to later set CR4.PCIDE, CR3[11:0] must be zero.  */ +	pcid = 0; +	if (cr4 & X86_CR4_PCIDE) { +		pcid = cr3 & 0xfff; +		cr3 &= ~0xfff; +	} + +	bad = ctxt->ops->set_cr(ctxt, 3, cr3); +	if (bad) +		return X86EMUL_UNHANDLEABLE;  	/*  	 * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2425,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,  		bad = ctxt->ops->set_cr(ctxt, 4, cr4);  		if (bad)  			return X86EMUL_UNHANDLEABLE; +		if (pcid) { +			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); +			if (bad) +				return X86EMUL_UNHANDLEABLE; +		} +  	}  	return X86EMUL_CONTINUE; @@ -2435,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)  	struct desc_struct desc;  	struct desc_ptr dt;  	u16 selector; -	u32 val, cr0, cr4; +	u32 val, cr0, cr3, cr4;  	int i;  	cr0 =                      GET_SMSTATE(u32, smbase, 0x7ffc); -	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); +	cr3 =                      GET_SMSTATE(u32, smbase, 0x7ff8);  	ctxt->eflags =             GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;  	ctxt->_eip =               GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2481,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)  	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); -	return rsm_enter_protected_mode(ctxt, cr0, cr4); +	return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);  }  static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  {  	struct desc_struct desc;  	struct desc_ptr dt; -	u64 val, cr0, cr4; +	u64 val, cr0, cr3, cr4;  	u32 base3;  	u16 selector;  	int i, r; @@ -2505,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);  	cr0 =                       GET_SMSTATE(u64, smbase, 0x7f58); -	ctxt->ops->set_cr(ctxt, 3,  GET_SMSTATE(u64, smbase, 0x7f50)); +	cr3 =                       GET_SMSTATE(u64, smbase, 0x7f50);  	cr4 =                       GET_SMSTATE(u64, smbase, 0x7f48);  	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));  	val =                       GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2533,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  	dt.address =                GET_SMSTATE(u64, smbase, 0x7e68);  	ctxt->ops->set_gdt(ctxt, &dt); -	r = rsm_enter_protected_mode(ctxt, cr0, cr4); +	r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);  	if (r != X86EMUL_CONTINUE)  		return r; @@ 
-4001,12 +4005,8 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)  	if (rc != X86EMUL_CONTINUE)  		return rc; -	ctxt->ops->get_fpu(ctxt); -  	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); -	ctxt->ops->put_fpu(ctxt); -  	if (rc != X86EMUL_CONTINUE)  		return rc; @@ -4014,6 +4014,26 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)  		                   fxstate_size(ctxt));  } +/* + * FXRSTOR might restore XMM registers not provided by the guest. Fill + * in the host registers (via FXSAVE) instead, so they won't be modified. + * (preemption has to stay disabled until FXRSTOR). + * + * Use noinline to keep the stack for other functions called by callers small. + */ +static noinline int fxregs_fixup(struct fxregs_state *fx_state, +				 const size_t used_size) +{ +	struct fxregs_state fx_tmp; +	int rc; + +	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp)); +	memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size, +	       __fxstate_size(16) - used_size); + +	return rc; +} +  static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  {  	struct fxregs_state fx_state; @@ -4024,19 +4044,17 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  	if (rc != X86EMUL_CONTINUE)  		return rc; -	ctxt->ops->get_fpu(ctxt); -  	size = fxstate_size(ctxt); +	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); +	if (rc != X86EMUL_CONTINUE) +		return rc; +  	if (size < __fxstate_size(16)) { -		rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); +		rc = fxregs_fixup(&fx_state, size);  		if (rc != X86EMUL_CONTINUE)  			goto out;  	} -	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -	if (rc != X86EMUL_CONTINUE) -		goto out; -  	if (fx_state.mxcsr >> 16) {  		rc = emulate_gp(ctxt, 0);  		goto out; @@ -4046,8 +4064,6 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  		rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));  out: -	ctxt->ops->put_fpu(ctxt); -  	return rc;  } @@ -5000,6 +5016,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)  	bool op_prefix = false;  	bool has_seg_override = false;  	struct opcode opcode; +	u16 dummy; +	struct desc_struct desc;  	ctxt->memop.type = OP_NONE;  	ctxt->memopp = NULL; @@ -5018,6 +5036,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)  	switch (mode) {  	case X86EMUL_MODE_REAL:  	case X86EMUL_MODE_VM86: +		def_op_bytes = def_ad_bytes = 2; +		ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); +		if (desc.d) +			def_op_bytes = def_ad_bytes = 4; +		break;  	case X86EMUL_MODE_PROT16:  		def_op_bytes = def_ad_bytes = 2;  		break; @@ -5290,9 +5313,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)  {  	int rc; -	ctxt->ops->get_fpu(ctxt);  	rc = asm_safe("fwait"); -	ctxt->ops->put_fpu(ctxt);  	if (unlikely(rc != X86EMUL_CONTINUE))  		return emulate_exception(ctxt, MF_VECTOR, 0, false); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index bdff437acbcb..4e822ad363f3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -209,12 +209,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,  	old_irr = ioapic->irr;  	ioapic->irr |= mask; -	if (edge) +	if (edge) {  		ioapic->irr_delivered &= ~mask; -	if ((edge && old_irr == ioapic->irr) || -	    (!edge && entry.fields.remote_irr)) { -		ret = 0; -		goto out; +		if (old_irr == ioapic->irr) { +			ret = 0; +			goto out; +		}  	}  	ret = ioapic_service(ioapic, irq, line_status); @@ -257,8 +257,7 @@ void 
kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)  		    index == RTC_GSI) {  			if (kvm_apic_match_dest(vcpu, NULL, 0,  			             e->fields.dest_id, e->fields.dest_mode) || -			    (e->fields.trig_mode == IOAPIC_EDGE_TRIG && -			     kvm_apic_pending_eoi(vcpu, e->fields.vector))) +			    kvm_apic_pending_eoi(vcpu, e->fields.vector))  				__set_bit(e->fields.vector,  					  ioapic_handled_vectors);  		} @@ -277,6 +276,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  {  	unsigned index;  	bool mask_before, mask_after; +	int old_remote_irr, old_delivery_status;  	union kvm_ioapic_redirect_entry *e;  	switch (ioapic->ioregsel) { @@ -299,14 +299,28 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  			return;  		e = &ioapic->redirtbl[index];  		mask_before = e->fields.mask; +		/* Preserve read-only fields */ +		old_remote_irr = e->fields.remote_irr; +		old_delivery_status = e->fields.delivery_status;  		if (ioapic->ioregsel & 1) {  			e->bits &= 0xffffffff;  			e->bits |= (u64) val << 32;  		} else {  			e->bits &= ~0xffffffffULL;  			e->bits |= (u32) val; -			e->fields.remote_irr = 0;  		} +		e->fields.remote_irr = old_remote_irr; +		e->fields.delivery_status = old_delivery_status; + +		/* +		 * Some OSes (Linux, Xen) assume that Remote IRR bit will +		 * be cleared by IOAPIC hardware when the entry is configured +		 * as edge-triggered. This behavior is used to simulate an +		 * explicit EOI on IOAPICs that don't have the EOI register. +		 */ +		if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) +			e->fields.remote_irr = 0; +  		mask_after = e->fields.mask;  		if (mask_before != mask_after)  			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -324,7 +338,9 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)  	struct kvm_lapic_irq irqe;  	int ret; -	if (entry->fields.mask) +	if (entry->fields.mask || +	    (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG && +	    entry->fields.remote_irr))  		return -1;  	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 943acbf00c69..e2c1fb8d35ce 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -266,9 +266,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)  	recalculate_apic_map(apic->vcpu->kvm);  } +static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) +{ +	return ((id >> 4) << 16) | (1 << (id & 0xf)); +} +  static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)  { -	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); +	u32 ldr = kvm_apic_calc_x2apic_ldr(id);  	WARN_ON_ONCE(id != apic->vcpu->vcpu_id); @@ -2245,6 +2250,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,  {  	if (apic_x2apic_mode(vcpu->arch.apic)) {  		u32 *id = (u32 *)(s->regs + APIC_ID); +		u32 *ldr = (u32 *)(s->regs + APIC_LDR);  		if (vcpu->kvm->arch.x2apic_format) {  			if (*id != vcpu->vcpu_id) @@ -2255,6 +2261,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,  			else  				*id <<= 24;  		} + +		/* In x2APIC mode, the LDR is fixed and based on the id */ +		if (set) +			*ldr = kvm_apic_calc_x2apic_ldr(*id);  	}  	return 0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c6640..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if(make_mmu_pages_available(vcpu) < 0) {  
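/*
 * [Editor's aside -- illustrative sketch only, not part of the quoted patch
 *  above or below.] The lapic.c hunk earlier in this patch factors the x2APIC
 * logical destination register derivation into kvm_apic_calc_x2apic_ldr():
 * ((id >> 4) << 16) | (1 << (id & 0xf)), i.e. the high half carries the
 * cluster number and the low half a one-hot bit within that cluster. A
 * minimal userspace check of that arithmetic (hypothetical test harness,
 * not kernel code):
 */
#include <assert.h>
#include <stdint.h>

static uint32_t calc_x2apic_ldr(uint32_t id)
{
	/* Same arithmetic as the kvm_apic_calc_x2apic_ldr() helper added above. */
	return ((id >> 4) << 16) | (1u << (id & 0xf));
}

int main(void)
{
	assert(calc_x2apic_ldr(0x25) == 0x20020);	/* cluster 2, bit 5 */
	assert(calc_x2apic_ldr(0x00) == 0x00001);	/* cluster 0, bit 0 */
	return 0;
}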
			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, 0, 0,  				vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); @@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)  			spin_lock(&vcpu->kvm->mmu_lock);  			if (make_mmu_pages_available(vcpu) < 0) {  				spin_unlock(&vcpu->kvm->mmu_lock); -				return 1; +				return -ENOSPC;  			}  			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),  					i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); @@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if (make_mmu_pages_available(vcpu) < 0) {  			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,  				vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); @@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if (make_mmu_pages_available(vcpu) < 0) {  			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,  				      0, ACC_ALL); @@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)  bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)  {  	if (unlikely(!lapic_in_kernel(vcpu) || -		     kvm_event_needs_reinjection(vcpu))) +		     kvm_event_needs_reinjection(vcpu) || +		     vcpu->arch.exception.pending))  		return false;  	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) @@ -5465,30 +5466,34 @@ static void mmu_destroy_caches(void)  int kvm_mmu_module_init(void)  { +	int ret = -ENOMEM; +  	kvm_mmu_clear_all_pte_masks();  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",  					    sizeof(struct pte_list_desc),  					    0, SLAB_ACCOUNT, NULL);  	if (!pte_list_desc_cache) -		goto nomem; +		goto out;  	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",  						  sizeof(struct kvm_mmu_page),  						  0, SLAB_ACCOUNT, NULL);  	if (!mmu_page_header_cache) -		goto nomem; +		goto out;  	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) -		goto nomem; +		goto out; -	register_shrinker(&mmu_shrinker); +	ret = register_shrinker(&mmu_shrinker); +	if (ret) +		goto out;  	return 0; -nomem: +out:  	mmu_destroy_caches(); -	return -ENOMEM; +	return ret;  }  /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59e13a79c2e3..f40d0da1f1d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,6 +45,7 @@  #include <asm/debugreg.h>  #include <asm/kvm_para.h>  #include <asm/irq_remapping.h> +#include <asm/nospec-branch.h>  #include <asm/virtext.h>  #include "trace.h" @@ -2197,6 +2198,8 @@ static int ud_interception(struct vcpu_svm *svm)  	int er;  	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); +	if (er == EMULATE_USER_EXIT) +		return 0;  	if (er != EMULATE_DONE)  		kvm_queue_exception(&svm->vcpu, UD_VECTOR);  	return 1; @@ -4977,6 +4980,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  		"mov %%r14, %c[r14](%[svm]) \n\t"  		"mov %%r15, %c[r15](%[svm]) \n\t"  #endif +		/* +		* Clear host registers marked as clobbered to prevent +		* speculative use. 
+		*/ +		"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" +		"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" +		"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" +		"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" +		"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" +#ifdef CONFIG_X86_64 +		"xor %%r8, %%r8 \n\t" +		"xor %%r9, %%r9 \n\t" +		"xor %%r10, %%r10 \n\t" +		"xor %%r11, %%r11 \n\t" +		"xor %%r12, %%r12 \n\t" +		"xor %%r13, %%r13 \n\t" +		"xor %%r14, %%r14 \n\t" +		"xor %%r15, %%r15 \n\t" +#endif  		"pop %%" _ASM_BP  		:  		: [svm]"a"(svm), @@ -5006,6 +5028,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  #endif  		); +	/* Eliminate branch target predictions from guest mode */ +	vmexit_fill_RSB(); +  #ifdef CONFIG_X86_64  	wrmsrl(MSR_GS_BASE, svm->host.gs_base);  #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 714a0673ec3c..c829d89e2e63 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@  #include <asm/apic.h>  #include <asm/irq_remapping.h>  #include <asm/mmu_context.h> +#include <asm/nospec-branch.h>  #include "trace.h"  #include "pmu.h" @@ -899,8 +900,16 @@ static inline short vmcs_field_to_offset(unsigned long field)  {  	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); -	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || -	    vmcs_field_to_offset_table[field] == 0) +	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) +		return -ENOENT; + +	/* +	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a +	 * generic mechanism. +	 */ +	asm("lfence"); + +	if (vmcs_field_to_offset_table[field] == 0)  		return -ENOENT;  	return vmcs_field_to_offset_table[field]; @@ -2300,7 +2309,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  		 * processors.  See 22.2.4.  		 */  		vmcs_writel(HOST_TR_BASE, -			    (unsigned long)this_cpu_ptr(&cpu_tss)); +			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);  		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */  		/* @@ -5600,7 +5609,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);  	} -	vmcs_writel(GUEST_RFLAGS, 0x02); +	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);  	kvm_rip_write(vcpu, 0xfff0);  	vmcs_writel(GUEST_GDTR_BASE, 0); @@ -5915,11 +5924,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)  		return 1;  /* already handled by vmx_vcpu_run() */  	if (is_invalid_opcode(intr_info)) { -		if (is_guest_mode(vcpu)) { -			kvm_queue_exception(vcpu, UD_VECTOR); -			return 1; -		}  		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); +		if (er == EMULATE_USER_EXIT) +			return 0;  		if (er != EMULATE_DONE)  			kvm_queue_exception(vcpu, UD_VECTOR);  		return 1; @@ -6602,7 +6609,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  		if (kvm_test_request(KVM_REQ_EVENT, vcpu))  			return 1; -		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); +		err = emulate_instruction(vcpu, 0);  		if (err == EMULATE_USER_EXIT) {  			++vcpu->stat.mmio_exits; @@ -6750,16 +6757,10 @@ static __init int hardware_setup(void)  			goto out;  	} -	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);  	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);  	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); -	/* -	 * Allow direct access to the PC debug port (it is often used for I/O -	 * delays, but the vmexits simply slow things down). 
-	 */  	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); -	clear_bit(0x80, vmx_io_bitmap_a);  	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); @@ -7414,10 +7415,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)   */  static void free_nested(struct vcpu_vmx *vmx)  { -	if (!vmx->nested.vmxon) +	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)  		return;  	vmx->nested.vmxon = false; +	vmx->nested.smm.vmxon = false;  	free_vpid(vmx->nested.vpid02);  	vmx->nested.posted_intr_nv = -1;  	vmx->nested.current_vmptr = -1ull; @@ -9419,6 +9421,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  		/* Save guest registers, load host registers, keep flags */  		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"  		"pop %0 \n\t" +		"setbe %c[fail](%0)\n\t"  		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"  		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"  		__ASM_SIZE(pop) " %c[rcx](%0) \n\t" @@ -9435,12 +9438,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  		"mov %%r13, %c[r13](%0) \n\t"  		"mov %%r14, %c[r14](%0) \n\t"  		"mov %%r15, %c[r15](%0) \n\t" +		"xor %%r8d,  %%r8d \n\t" +		"xor %%r9d,  %%r9d \n\t" +		"xor %%r10d, %%r10d \n\t" +		"xor %%r11d, %%r11d \n\t" +		"xor %%r12d, %%r12d \n\t" +		"xor %%r13d, %%r13d \n\t" +		"xor %%r14d, %%r14d \n\t" +		"xor %%r15d, %%r15d \n\t"  #endif  		"mov %%cr2, %%" _ASM_AX "   \n\t"  		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t" +		"xor %%eax, %%eax \n\t" +		"xor %%ebx, %%ebx \n\t" +		"xor %%esi, %%esi \n\t" +		"xor %%edi, %%edi \n\t"  		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t" -		"setbe %c[fail](%0) \n\t"  		".pushsection .rodata \n\t"  		".global vmx_return \n\t"  		"vmx_return: " _ASM_PTR " 2b \n\t" @@ -9477,6 +9491,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  #endif  	      ); +	/* Eliminate branch target predictions from guest mode */ +	vmexit_fill_RSB(); +  	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */  	if (debugctlmsr)  		update_debugctlmsr(debugctlmsr); @@ -9800,8 +9817,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)  	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));  	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));  	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU)); -	/* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ -	cr4_fixed1_update(bit(11),            ecx, bit(2)); +	cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));  #undef cr4_fixed1_update  } @@ -10875,6 +10891,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,  			return 1;  	} +	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && +		(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || +		(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) +			return 1; +  	return 0;  } @@ -11099,13 +11120,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	unsigned long exit_qual; - -	if (kvm_event_needs_reinjection(vcpu)) -		return -EBUSY; +	bool block_nested_events = +	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);  	if (vcpu->arch.exception.pending &&  		nested_vmx_check_exception(vcpu, &exit_qual)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);  		vcpu->arch.exception.pending = false; @@ -11114,14 +11134,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&  	    vmx->nested.preemption_timer_expired) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);  		return 0;  	}  	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,  				  NMI_VECTOR | INTR_TYPE_NMI_INTR | @@ -11137,7 +11157,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&  	    nested_exit_on_intr(vcpu)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);  		return 0; @@ -11324,6 +11344,24 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,  	kvm_clear_interrupt_queue(vcpu);  } +static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, +			struct vmcs12 *vmcs12) +{ +	u32 entry_failure_code; + +	nested_ept_uninit_mmu_context(vcpu); + +	/* +	 * Only PDPTE load can fail as the value of cr3 was checked on entry and +	 * couldn't have changed. 
+	 */ +	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) +		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + +	if (!enable_ept) +		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +} +  /*   * A part of what we need to when the nested L2 guest exits and we want to   * run its L1 parent, is to reset L1's guest state to the host state specified @@ -11337,7 +11375,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  				   struct vmcs12 *vmcs12)  {  	struct kvm_segment seg; -	u32 entry_failure_code;  	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)  		vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -11364,17 +11401,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);  	vmx_set_cr4(vcpu, vmcs12->host_cr4); -	nested_ept_uninit_mmu_context(vcpu); - -	/* -	 * Only PDPTE load can fail as the value of cr3 was checked on entry and -	 * couldn't have changed. -	 */ -	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) -		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - -	if (!enable_ept) -		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +	load_vmcs12_mmu_host_state(vcpu, vmcs12);  	if (enable_vpid) {  		/* @@ -11604,6 +11631,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,  	 * accordingly.  	 */  	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + +	load_vmcs12_mmu_host_state(vcpu, vmcs12); +  	/*  	 * The emulated instruction was already skipped in  	 * nested_vmx_run, but the updated RIP was never diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c85aa2e2d1..1cec2c62a0b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -107,6 +107,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);  static bool __read_mostly ignore_msrs = 0;  module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool __read_mostly report_ignored_msrs = true; +module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +  unsigned int min_timer_period_us = 500;  module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -1795,10 +1798,13 @@ u64 get_kvmclock_ns(struct kvm *kvm)  	/* both __this_cpu_read() and rdtsc() should be on the same cpu */  	get_cpu(); -	kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, -			   &hv_clock.tsc_shift, -			   &hv_clock.tsc_to_system_mul); -	ret = __pvclock_read_cycles(&hv_clock, rdtsc()); +	if (__this_cpu_read(cpu_tsc_khz)) { +		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, +				   &hv_clock.tsc_shift, +				   &hv_clock.tsc_to_system_mul); +		ret = __pvclock_read_cycles(&hv_clock, rdtsc()); +	} else +		ret = ktime_get_boot_ns() + ka->kvmclock_offset;  	put_cpu(); @@ -1830,6 +1836,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)  	 */  	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); +	if (guest_hv_clock.version & 1) +		++guest_hv_clock.version;  /* first time write, random junk */ +  	vcpu->hv_clock.version = guest_hv_clock.version + 1;  	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,  				&vcpu->hv_clock, @@ -2322,7 +2331,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  		/* Drop writes to this legacy MSR -- see rdmsr  		 * counterpart for further detail.  		 
*/ -		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); +		if (report_ignored_msrs) +			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", +				msr, data);  		break;  	case MSR_AMD64_OSVW_ID_LENGTH:  		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -2359,8 +2370,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  				    msr, data);  			return 1;  		} else { -			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", -				    msr, data); +			if (report_ignored_msrs) +				vcpu_unimpl(vcpu, +					"ignored wrmsr: 0x%x data 0x%llx\n", +					msr, data);  			break;  		}  	} @@ -2578,7 +2591,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  					       msr_info->index);  			return 1;  		} else { -			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); +			if (report_ignored_msrs) +				vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", +					msr_info->index);  			msr_info->data = 0;  		}  		break; @@ -2922,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)  	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	pagefault_enable();  	kvm_x86_ops->vcpu_put(vcpu); -	kvm_put_guest_fpu(vcpu);  	vcpu->arch.last_host_tsc = rdtsc();  } @@ -4370,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)  					 addr, n, v))  		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))  			break; -		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); +		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);  		handled += n;  		addr += n;  		len -= n; @@ -4629,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)  {  	if (vcpu->mmio_read_completed) {  		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, -			       vcpu->mmio_fragments[0].gpa, *(u64 *)val); +			       vcpu->mmio_fragments[0].gpa, val);  		vcpu->mmio_read_completed = 0;  		return 1;  	} @@ -4651,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,  static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)  { -	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); +	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);  	return vcpu_mmio_write(vcpu, gpa, bytes, val);  }  static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,  			  void *val, int bytes)  { -	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); +	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);  	return X86EMUL_IO_NEEDED;  } @@ -5237,17 +5251,6 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)  	emul_to_vcpu(ctxt)->arch.halt_request = 1;  } -static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) -{ -	preempt_disable(); -	kvm_load_guest_fpu(emul_to_vcpu(ctxt)); -} - -static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) -{ -	preempt_enable(); -} -  static int emulator_intercept(struct x86_emulate_ctxt *ctxt,  			      struct x86_instruction_info *info,  			      enum x86_intercept_stage stage) @@ -5325,8 +5328,6 @@ static const struct x86_emulate_ops emulate_ops = {  	.halt                = emulator_halt,  	.wbinvd              = emulator_wbinvd,  	.fix_hypercall       = emulator_fix_hypercall, -	.get_fpu             = emulator_get_fpu, -	.put_fpu             = emulator_put_fpu,  	.intercept           = emulator_intercept,  	.get_cpuid           = emulator_get_cpuid,  	.set_nmi_mask        = emulator_set_nmi_mask, @@ -5430,7 +5431,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)  		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;  		
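/*
 * [Editor's aside -- illustrative sketch only, not part of the quoted patch
 *  above or below.] The report_ignored_msrs module parameter introduced at
 * the top of this x86.c hunk only controls whether the "ignored wrmsr/rdmsr"
 * messages are printed; when ignore_msrs is set, unknown MSR writes are still
 * dropped and unknown reads still return zero. Roughly, with the KVM plumbing
 * stripped away (hypothetical standalone sketch, names invented here):
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool ignore_msrs = true;		/* mirrors kvm.ignore_msrs */
static bool report_ignored_msrs = true;	/* mirrors kvm.report_ignored_msrs */

/* Returns 1 to inject #GP into the guest, 0 to silently drop the write. */
static int handle_unknown_wrmsr(uint32_t msr, uint64_t data)
{
	if (!ignore_msrs)
		return 1;
	if (report_ignored_msrs)
		printf("ignored wrmsr: 0x%x data 0x%llx\n",
		       msr, (unsigned long long)data);
	return 0;
}

int main(void)
{
	return handle_unknown_wrmsr(0x4b564dff, 0);	/* example MSR index */
}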
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;  		vcpu->run->internal.ndata = 0; -		r = EMULATE_FAIL; +		r = EMULATE_USER_EXIT;  	}  	kvm_queue_exception(vcpu, UD_VECTOR); @@ -5722,6 +5723,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,  			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,  						emulation_type))  				return EMULATE_DONE; +			if (ctxt->have_exception && inject_emulated_exception(vcpu)) +				return EMULATE_DONE;  			if (emulation_type & EMULTYPE_SKIP)  				return EMULATE_FAIL;  			return handle_emulation_failure(vcpu); @@ -6761,6 +6764,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)  	kvm_x86_ops->tlb_flush(vcpu);  } +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, +		unsigned long start, unsigned long end) +{ +	unsigned long apic_address; + +	/* +	 * The physical address of apic access page is stored in the VMCS. +	 * Update it when it becomes invalid. +	 */ +	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); +	if (start <= apic_address && apic_address < end) +		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} +  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)  {  	struct page *page = NULL; @@ -6935,7 +6952,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	preempt_disable();  	kvm_x86_ops->prepare_guest_switch(vcpu); -	kvm_load_guest_fpu(vcpu);  	/*  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt @@ -7248,14 +7264,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  { -	struct fpu *fpu = &current->thread.fpu;  	int r; -	sigset_t sigsaved; -	fpu__initialize(fpu); +	kvm_sigset_activate(vcpu); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); +	kvm_load_guest_fpu(vcpu);  	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {  		if (kvm_run->immediate_exit) { @@ -7297,9 +7310,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  		r = vcpu_run(vcpu);  out: +	kvm_put_guest_fpu(vcpu);  	post_kvm_run_save(vcpu); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &sigsaved, NULL); +	kvm_sigset_deactivate(vcpu);  	return r;  } @@ -7367,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  #endif  	kvm_rip_write(vcpu, regs->rip); -	kvm_set_rflags(vcpu, regs->rflags); +	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);  	vcpu->arch.exception.pending = false; @@ -7481,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,  }  EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) { +		/* +		 * When EFER.LME and CR0.PG are set, the processor is in +		 * 64-bit mode (though maybe in a 32-bit code segment). +		 * CR4.PAE and EFER.LMA must be set. +		 */ +		if (!(sregs->cr4 & X86_CR4_PAE_BIT) +		    || !(sregs->efer & EFER_LMA)) +			return -EINVAL; +	} else { +		/* +		 * Not in 64-bit mode: EFER.LMA is clear and the code +		 * segment cannot be 64-bit. 
+		 */ +		if (sregs->efer & EFER_LMA || sregs->cs.l) +			return -EINVAL; +	} + +	return 0; +} +  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  				  struct kvm_sregs *sregs)  { @@ -7493,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  			(sregs->cr4 & X86_CR4_OSXSAVE))  		return -EINVAL; +	if (kvm_valid_sregs(vcpu, sregs)) +		return -EINVAL; +  	apic_base_msr.data = sregs->apic_base;  	apic_base_msr.host_initiated = true;  	if (kvm_set_apic_base(vcpu, &apic_base_msr)) @@ -7690,32 +7729,25 @@ static void fx_init(struct kvm_vcpu *vcpu)  	vcpu->arch.cr0 |= X86_CR0_ET;  } +/* Swap (qemu) user FPU context for the guest FPU context. */  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)  { -	if (vcpu->guest_fpu_loaded) -		return; - -	/* -	 * Restore all possible states in the guest, -	 * and assume host would use all available bits. -	 * Guest xcr0 would be loaded later. -	 */ -	vcpu->guest_fpu_loaded = 1; -	__kernel_fpu_begin(); +	preempt_disable(); +	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);  	/* PKRU is separately restored in kvm_x86_ops->run.  */  	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,  				~XFEATURE_MASK_PKRU); +	preempt_enable();  	trace_kvm_fpu(1);  } +/* When vcpu_run ends, restore user space FPU context. */  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  { -	if (!vcpu->guest_fpu_loaded) -		return; - -	vcpu->guest_fpu_loaded = 0; +	preempt_disable();  	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); -	__kernel_fpu_end(); +	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); +	preempt_enable();  	++vcpu->stat.fpu_reload;  	trace_kvm_fpu(0);  } @@ -7832,7 +7864,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  		 * To avoid have the INIT path from kvm_apic_has_events() that be  		 * called with loaded FPU and does not let userspace fix the state.  		 */ -		kvm_put_guest_fpu(vcpu); +		if (init_event) +			kvm_put_guest_fpu(vcpu);  		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,  					XFEATURE_MASK_BNDREGS);  		if (mpx_state_buffer) @@ -7841,6 +7874,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  					XFEATURE_MASK_BNDCSR);  		if (mpx_state_buffer)  			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); +		if (init_event) +			kvm_load_guest_fpu(vcpu);  	}  	if (!init_event) { diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 7b181b61170e..f23934bbaf4e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o  lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_RETPOLINE) += retpoline.o  obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4d34bb548b41..46e71a74e612 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -29,7 +29,8 @@  #include <asm/errno.h>  #include <asm/asm.h>  #include <asm/export.h> -				 +#include <asm/nospec-branch.h> +  /*   * computes a partial checksum, e.g. 
for TCP/UDP fragments   */ @@ -156,7 +157,7 @@ ENTRY(csum_partial)  	negl %ebx  	lea 45f(%ebx,%ebx,2), %ebx  	testl %esi, %esi -	jmp *%ebx +	JMP_NOSPEC %ebx  	# Handle 2-byte-aligned regions  20:	addw (%esi), %ax @@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic)  	andl $-32,%edx  	lea 3f(%ebx,%ebx), %ebx  	testl %esi, %esi  -	jmp *%ebx +	JMP_NOSPEC %ebx  1:	addl $64,%esi  	addl $64,%edi   	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..4846eff7e4c8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)  		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);  		/* -		 * Use cpu_tss as a cacheline-aligned, seldomly +		 * Use cpu_tss_rw as a cacheline-aligned, seldomly  		 * accessed per-cpu variable as the monitor target.  		 */ -		__monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); +		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);  		/*  		 * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..cb45c6cb465f --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/stringify.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeatures.h> +#include <asm/alternative-asm.h> +#include <asm/export.h> +#include <asm/nospec-branch.h> + +.macro THUNK reg +	.section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) +	CFI_STARTPROC +	JMP_NOSPEC %\reg +	CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... 
+ */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index c4d55919fac1..e0b85930dd77 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)  fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)  fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)  fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0  EndTable  Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2  7e: vpermt2d/q Vx,Hx,Wx (66),(ev)  7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)  80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66)  82: INVPCID Gy,Mdq (66)  83: vpmultishiftqb Vx,Hx,Wx (66),(ev)  88: vexpandps/d Vpd,Wpd (66),(ev) @@ -970,6 +970,15 @@ GrpTable: Grp9  EndTable  GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1  EndTable  # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e13b8cc6bed..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg  endif  obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ -	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o +	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o  # Make sure __phys_addr has no stackprotector  nostackp := $(call cc-option, -fno-stack-protector) @@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o  obj-$(CONFIG_ACPI_NUMA)		+= srat.o  obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o  obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..b9283cc27622 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ +	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; +	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + +	return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ +	unsigned long va = 
(unsigned long) cea_vaddr; + +	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ +	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) +		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL +	int npages; +	void *cea; + +	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) +		return; + +	cea = &get_cpu_entry_area(cpu)->cpu_debug_store; +	npages = sizeof(struct debug_store) / PAGE_SIZE; +	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); +	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, +			     PAGE_KERNEL); + +	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; +	/* +	 * Force the population of PMDs for not yet allocated per cpu +	 * memory like debug store buffers. +	 */ +	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; +	for (; npages; npages--, cea += PAGE_SIZE) +		cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 +	extern char _entry_trampoline[]; + +	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ +	pgprot_t gdt_prot = PAGE_KERNEL_RO; +	pgprot_t tss_prot = PAGE_KERNEL_RO; +#else +	/* +	 * On native 32-bit systems, the GDT cannot be read-only because +	 * our double fault handler uses a task gate, and entering through +	 * a task gate needs to change an available TSS to busy.  If the +	 * GDT is read-only, that will triple fault.  The TSS cannot be +	 * read-only because the CPU writes to it on task switches. +	 * +	 * On Xen PV, the GDT must be read-only because the hypervisor +	 * requires it. +	 */ +	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? +		PAGE_KERNEL_RO : PAGE_KERNEL; +	pgprot_t tss_prot = PAGE_KERNEL; +#endif + +	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), +		    gdt_prot); + +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, +			     per_cpu_ptr(&entry_stack_storage, cpu), 1, +			     PAGE_KERNEL); + +	/* +	 * The Intel SDM says (Volume 3, 7.2.1): +	 * +	 *  Avoid placing a page boundary in the part of the TSS that the +	 *  processor reads during a task switch (the first 104 bytes). The +	 *  processor may not correctly perform address translations if a +	 *  boundary occurs in this area. During a task switch, the processor +	 *  reads and writes into the first 104 bytes of each TSS (using +	 *  contiguous physical addresses beginning with the physical address +	 *  of the first byte of the TSS). So, after TSS access begins, if +	 *  part of the 104 bytes is not physically contiguous, the processor +	 *  will access incorrect information without generating a page-fault +	 *  exception. +	 * +	 * There are also a lot of errata involving the TSS spanning a page +	 * boundary.  Assert that we're not doing that. 
+	 */ +	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ +		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); +	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, +			     &per_cpu(cpu_tss_rw, cpu), +			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 +	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 +	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); +	BUILD_BUG_ON(sizeof(exception_stacks) != +		     sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, +			     &per_cpu(exception_stacks, cpu), +			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + +	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, +		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif +	percpu_setup_debug_store(cpu); +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 +	unsigned long start, end; + +	BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); +	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + +	start = CPU_ENTRY_AREA_BASE; +	end = start + CPU_ENTRY_AREA_MAP_SIZE; + +	/* Careful here: start + PMD_SIZE might wrap around */ +	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) +		populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ +	unsigned int cpu; + +	setup_cpu_entry_area_ptes(); + +	for_each_possible_cpu(cpu) +		setup_cpu_entry_area(cpu); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@  static int ptdump_show(struct seq_file *m, void *v)  { -	ptdump_walk_pgd_level(m, NULL); +	ptdump_walk_pgd_level_debugfs(m, NULL, false);  	return 0;  } @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {  	.release	= single_release,  }; -static struct dentry *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ +	if (current->mm->pgd) { +		down_read(&current->mm->mmap_sem); +		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); +		up_read(&current->mm->mmap_sem); +	} +	return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ +	return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { +	.owner		= THIS_MODULE, +	.open		= ptdump_open_curknl, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ +	if (current->mm->pgd) { +		down_read(&current->mm->mmap_sem); +		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); +		up_read(&current->mm->mmap_sem); +	} +	return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ +	return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { +	.owner		= THIS_MODULE, +	.open		= ptdump_open_curusr, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl;  static int __init pt_dump_debug_init(void)  { -	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, -				 &ptdump_fops); -	if (!pe) +	dir = debugfs_create_dir("page_tables", NULL); +	if (!dir)  		return 
-ENOMEM; +	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, +				     &ptdump_fops); +	if (!pe_knl) +		goto err; + +	pe_curknl = debugfs_create_file("current_kernel", 0400, +					dir, NULL, &ptdump_curknl_fops); +	if (!pe_curknl) +		goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	pe_curusr = debugfs_create_file("current_user", 0400, +					dir, NULL, &ptdump_curusr_fops); +	if (!pe_curusr) +		goto err; +#endif  	return 0; +err: +	debugfs_remove_recursive(dir); +	return -ENOMEM;  }  static void __exit pt_dump_debug_exit(void)  { -	debugfs_remove_recursive(pe); +	debugfs_remove_recursive(dir);  }  module_init(pt_dump_debug_init); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..2a4849e92831 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,68 +44,97 @@ struct addr_marker {  	unsigned long max_lines;  }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 +  enum address_markers_idx {  	USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64  	KERNEL_SPACE_NR,  	LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) +	LDT_NR, +#endif  	VMALLOC_START_NR,  	VMEMMAP_START_NR,  #ifdef CONFIG_KASAN  	KASAN_SHADOW_START_NR,  	KASAN_SHADOW_END_NR,  #endif -# ifdef CONFIG_X86_ESPFIX64 +	CPU_ENTRY_AREA_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) +	LDT_NR, +#endif +#ifdef CONFIG_X86_ESPFIX64  	ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI +	EFI_END_NR, +#endif  	HIGH_KERNEL_NR,  	MODULES_VADDR_NR,  	MODULES_END_NR, -#else +	FIXADDR_START_NR, +	END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" }, +	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" }, +#ifdef CONFIG_KASAN +	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" }, +	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL +	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" }, +#endif +	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 +	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI +	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" }, +#endif +	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" }, +	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" }, +	[MODULES_END_NR]	= { MODULES_END,	"End Modules" }, +	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { +	USER_SPACE_NR = 0,  	KERNEL_SPACE_NR,  	VMALLOC_START_NR,  	VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM  	PKMAP_BASE_NR, -# endif -	FIXADDR_START_NR,  #endif +	CPU_ENTRY_AREA_NR, +	FIXADDR_START_NR, +	END_OF_SPACE_NR,  }; -/* Address space markers hints */  static struct addr_marker address_markers[] = { -	{ 0, "User Space" }, -#ifdef CONFIG_X86_64 -	{ 0x8000000000000000UL, "Kernel Space" }, -	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN -	{ KASAN_SHADOW_START,	"KASAN shadow" }, -	{ KASAN_SHADOW_END,	"KASAN shadow end" }, +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	
= { PAGE_OFFSET,	"Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" }, +#ifdef CONFIG_HIGHMEM +	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },  #endif -# ifdef CONFIG_X86_ESPFIX64 -	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI -	{ EFI_VA_END,		"EFI Runtime Services" }, -# endif -	{ __START_KERNEL_map,   "High Kernel Mapping" }, -	{ MODULES_VADDR,        "Modules" }, -	{ MODULES_END,          "End Modules" }, -#else -	{ PAGE_OFFSET,          "Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/*VMALLOC_END*/,     "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM -	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" }, -# endif -	{ 0/*FIXADDR_START*/,   "Fixmap Area" }, -#endif -	{ -1, NULL }		/* End of list */ +	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" }, +	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL }  }; +#endif /* !CONFIG_X86_64 */ +  /* Multipliers for offsets within the PTEs */  #define PTE_LEVEL_MULT (PAGE_SIZE)  #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) @@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)  	static const char * const level_name[] =  		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; -	if (!pgprot_val(prot)) { +	if (!(pr & _PAGE_PRESENT)) {  		/* Not present */  		pt_dump_cont_printf(m, dmsg, "                              ");  	} else { @@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)  }  static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, -				       bool checkwx) +				       bool checkwx, bool dmesg)  {  #ifdef CONFIG_X86_64  	pgd_t *start = (pgd_t *) &init_top_pgt; @@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,  	if (pgd) {  		start = pgd; -		st.to_dmesg = true; +		st.to_dmesg = dmesg;  	}  	st.check_wx = checkwx; @@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)  { -	ptdump_walk_pgd_level_core(m, pgd, false); +	ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	if (user && static_cpu_has(X86_FEATURE_PTI)) +		pgd = kernel_to_user_pgdp(pgd); +#endif +	ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +	pgd_t *pgd = (pgd_t *) &init_top_pgt; + +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	pr_info("x86/mm: Checking user space page tables\n"); +	pgd = kernel_to_user_pgdp(pgd); +	ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif  } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);  void ptdump_walk_pgd_level_checkwx(void)  { -	ptdump_walk_pgd_level_core(NULL, NULL, true); +	ptdump_walk_pgd_level_core(NULL, NULL, true, false); +	ptdump_walk_user_pgd_level_checkwx();  }  static int __init pt_dump_init(void) @@ -525,8 +578,8 @@ static int __init pt_dump_init(void)  	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;  # endif  	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; +	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;  #endif -  	return 0;  }  __initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 
3321b446b66c..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@  #include <linux/extable.h>  #include <linux/uaccess.h>  #include <linux/sched/debug.h> +#include <xen/xen.h>  #include <asm/fpu/internal.h>  #include <asm/traps.h> @@ -82,7 +83,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,  	return true;  } -EXPORT_SYMBOL_GPL(ex_handler_refcount); +EXPORT_SYMBOL(ex_handler_refcount);  /*   * Handler for when we fail to restore a task's FPU state.  We should never get @@ -212,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)  	 * Old CPUs leave the high bits of CS on the stack  	 * undefined.  I'm not sure which CPUs do this, but at least  	 * the 486 DX works this way. +	 * Xen pv domains are not using the default __KERNEL_CS.  	 */ -	if (regs->cs != __KERNEL_CS) +	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)  		goto fail;  	/* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78ca9a8ee454..06fe3d51d385 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -701,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	else  		printk(KERN_CONT "paging request"); -	printk(KERN_CONT " at %p\n", (void *) address); +	printk(KERN_CONT " at %px\n", (void *) address);  	printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);  	dump_pagetable(address); @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,  	if (!printk_ratelimit())  		return; -	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", +	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",  		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,  		tsk->comm, task_pid_nr(tsk), address,  		(void *)regs->ip, (void *)regs->sp, error_code); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6fdf91ef130a..82f5252c723a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@  #include <asm/kaslr.h>  #include <asm/hypervisor.h>  #include <asm/cpufeature.h> +#include <asm/pti.h>  /*   * We need to define the tracepoints somewhere, and tlb.c @@ -160,6 +161,12 @@ struct map_range {  static int page_size_mask; +static void enable_global_pages(void) +{ +	if (!static_cpu_has(X86_FEATURE_PTI)) +		__supported_pte_mask |= _PAGE_GLOBAL; +} +  static void __init probe_page_size_mask(void)  {  	/* @@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)  		cr4_set_bits_and_update_boot(X86_CR4_PSE);  	/* Enable PGE if available */ +	__supported_pte_mask &= ~_PAGE_GLOBAL;  	if (boot_cpu_has(X86_FEATURE_PGE)) {  		cr4_set_bits_and_update_boot(X86_CR4_PGE); -		__supported_pte_mask |= _PAGE_GLOBAL; -	} else -		__supported_pte_mask &= ~_PAGE_GLOBAL; +		enable_global_pages(); +	}  	/* Enable 1 GB linear kernel mappings if available: */  	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { @@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)  static void setup_pcid(void)  { -#ifdef CONFIG_X86_64 -	if (boot_cpu_has(X86_FEATURE_PCID)) { -		if (boot_cpu_has(X86_FEATURE_PGE)) { -			/* -			 * This can't be cr4_set_bits_and_update_boot() -- -			 * the trampoline code can't handle CR4.PCIDE and -			 * it wouldn't do any good anyway.  Despite the name, -			 * cr4_set_bits_and_update_boot() doesn't actually -			 * cause the bits in question to remain set all the -			 * way through the secondary boot asm. -			 * -			 * Instead, we brute-force it and set CR4.PCIDE -			 * manually in start_secondary(). 
-			 */ -			cr4_set_bits(X86_CR4_PCIDE); -		} else { -			/* -			 * flush_tlb_all(), as currently implemented, won't -			 * work if PCID is on but PGE is not.  Since that -			 * combination doesn't exist on real hardware, there's -			 * no reason to try to fully support it, but it's -			 * polite to avoid corrupting data if we're on -			 * an improperly configured VM. -			 */ -			setup_clear_cpu_cap(X86_FEATURE_PCID); -		} +	if (!IS_ENABLED(CONFIG_X86_64)) +		return; + +	if (!boot_cpu_has(X86_FEATURE_PCID)) +		return; + +	if (boot_cpu_has(X86_FEATURE_PGE)) { +		/* +		 * This can't be cr4_set_bits_and_update_boot() -- the +		 * trampoline code can't handle CR4.PCIDE and it wouldn't +		 * do any good anyway.  Despite the name, +		 * cr4_set_bits_and_update_boot() doesn't actually cause +		 * the bits in question to remain set all the way through +		 * the secondary boot asm. +		 * +		 * Instead, we brute-force it and set CR4.PCIDE manually in +		 * start_secondary(). +		 */ +		cr4_set_bits(X86_CR4_PCIDE); + +		/* +		 * INVPCID's single-context modes (2/3) only work if we set +		 * X86_CR4_PCIDE, *and* we INVPCID support.  It's unusable +		 * on systems that have X86_CR4_PCIDE clear, or that have +		 * no INVPCID support at all. +		 */ +		if (boot_cpu_has(X86_FEATURE_INVPCID)) +			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); +	} else { +		/* +		 * flush_tlb_all(), as currently implemented, won't work if +		 * PCID is on but PGE is not.  Since that combination +		 * doesn't exist on real hardware, there's no reason to try +		 * to fully support it, but it's polite to avoid corrupting +		 * data if we're on an improperly configured VM. +		 */ +		setup_clear_cpu_cap(X86_FEATURE_PCID);  	} -#endif  }  #ifdef CONFIG_X86_32 @@ -622,6 +639,7 @@ void __init init_mem_mapping(void)  {  	unsigned long end; +	pti_check_boottime_disable();  	probe_page_size_mask();  	setup_pcid(); @@ -845,12 +863,12 @@ void __init zone_sizes_init(void)  	free_area_init_nodes(max_zone_pfns);  } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {  	.loaded_mm = &init_mm,  	.next_asid = 1,  	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */  }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)  { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@  #include <asm/setup.h>  #include <asm/set_memory.h>  #include <asm/page_types.h> +#include <asm/cpu_entry_area.h>  #include <asm/init.h>  #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void)  	mem_init_print_info(NULL);  	printk(KERN_INFO "virtual kernel memory layout:\n"  		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n" +		"  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"  #ifdef CONFIG_HIGHMEM  		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"  #endif @@ -777,6 +779,10 @@ void __init mem_init(void)  		FIXADDR_START, FIXADDR_TOP,  		(FIXADDR_TOP - FIXADDR_START) >> 10, +		CPU_ENTRY_AREA_BASE, +		CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, +		CPU_ENTRY_AREA_MAP_SIZE >> 10, +  #ifdef CONFIG_HIGHMEM  		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,  		(LAST_PKMAP*PAGE_SIZE) >> 10, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6e4573b1da34..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ 
-404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)  		return;  	} +	mmiotrace_iounmap(addr); +  	addr = (volatile void __iomem *)  		(PAGE_MASK & (unsigned long __force)addr); -	mmiotrace_iounmap(addr); -  	/* Use the vm area unlocked, assuming the caller  	   ensures there isn't another iounmap for the same address  	   in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..47388f0c0e59 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,7 @@  #include <asm/tlbflush.h>  #include <asm/sections.h>  #include <asm/pgtable.h> +#include <asm/cpu_entry_area.h>  extern struct range pfn_mapped[E820_MAX_ENTRIES]; @@ -277,6 +278,7 @@ void __init kasan_early_init(void)  void __init kasan_init(void)  {  	int i; +	void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;  #ifdef CONFIG_KASAN_INLINE  	register_die_notifier(&kasan_die_notifier); @@ -321,16 +323,33 @@ void __init kasan_init(void)  		map_range(&pfn_mapped[i]);  	} +	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; +	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); +	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, +						PAGE_SIZE); + +	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + +					CPU_ENTRY_AREA_MAP_SIZE); +	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); +	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, +					PAGE_SIZE); +  	kasan_populate_zero_shadow(  		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), -		kasan_mem_to_shadow((void *)__START_KERNEL_map)); +		shadow_cpu_entry_begin); + +	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, +			      (unsigned long)shadow_cpu_entry_end, 0); + +	kasan_populate_zero_shadow(shadow_cpu_entry_end, +				kasan_mem_to_shadow((void *)__START_KERNEL_map));  	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),  			      (unsigned long)kasan_mem_to_shadow(_end),  			      early_pfn_to_nid(__pa(_stext)));  	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), -			(void *)KASAN_SHADOW_END); +				(void *)KASAN_SHADOW_END);  	load_cr3(init_top_pgt);  	__flush_tlb_all(); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef930e2c2..aedebd2ebf1e 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@  #define TB_SHIFT 40  /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization.   * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already.   
*/  static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; - -#if defined(CONFIG_X86_ESPFIX64) -static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; -#elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_END; -#else -static const unsigned long vaddr_end = __START_KERNEL_map; -#endif +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;  /* Default values */  unsigned long page_offset_base = __PAGE_OFFSET_BASE; @@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)  	unsigned long remain_entropy;  	/* -	 * All these BUILD_BUG_ON checks ensures the memory layout is -	 * consistent with the vaddr_start/vaddr_end variables. +	 * These BUILD_BUG_ON checks ensure the memory layout is consistent +	 * with the vaddr_start/vaddr_end variables. These checks are very +	 * limited....  	 */  	BUILD_BUG_ON(vaddr_start >= vaddr_end); -	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && -		     vaddr_end >= EFI_VA_END); -	BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || -		      IS_ENABLED(CONFIG_EFI)) && -		     vaddr_end >= __START_KERNEL_map); +	BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);  	BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);  	if (!kaslr_memory_enabled()) diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index cec594032515..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index ea32a7d3cf1b..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct 
kmmio_probe *p)  	unsigned long flags;  	int ret = 0;  	unsigned long size = 0; +	unsigned long addr = p->addr & PAGE_MASK;  	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);  	unsigned int l;  	pte_t *pte;  	spin_lock_irqsave(&kmmio_lock, flags); -	if (get_kmmio_probe(p->addr)) { +	if (get_kmmio_probe(addr)) {  		ret = -EEXIST;  		goto out;  	} -	pte = lookup_address(p->addr, &l); +	pte = lookup_address(addr, &l);  	if (!pte) {  		ret = -EINVAL;  		goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)  	kmmio_count++;  	list_add_rcu(&p->list, &kmmio_probes);  	while (size < size_lim) { -		if (add_kmmio_fault_page(p->addr + size)) +		if (add_kmmio_fault_page(addr + size))  			pr_err("Unable to set page fault.\n");  		size += page_level_size(l);  	} @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)  {  	unsigned long flags;  	unsigned long size = 0; +	unsigned long addr = p->addr & PAGE_MASK;  	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);  	struct kmmio_fault_page *release_list = NULL;  	struct kmmio_delayed_release *drelease;  	unsigned int l;  	pte_t *pte; -	pte = lookup_address(p->addr, &l); +	pte = lookup_address(addr, &l);  	if (!pte)  		return;  	spin_lock_irqsave(&kmmio_lock, flags);  	while (size < size_lim) { -		release_kmmio_fault_page(p->addr + size, &release_list); +		release_kmmio_fault_page(addr + size, &release_list);  		size += page_level_size(l);  	}  	list_del_rcu(&p->list); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d9a9e9fc75dd..391b13402e40 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -405,13 +405,13 @@ bool sme_active(void)  {  	return sme_me_mask && !sev_enabled;  } -EXPORT_SYMBOL_GPL(sme_active); +EXPORT_SYMBOL(sme_active);  bool sev_active(void)  {  	return sme_me_mask && sev_enabled;  } -EXPORT_SYMBOL_GPL(sev_active); +EXPORT_SYMBOL(sev_active);  static const struct dma_map_ops sev_dma_ops = {  	.alloc                  = sev_alloc, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 96d456a94b03..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)  		kmem_cache_free(pgd_cache, pgd);  }  #else +  static inline pgd_t *_pgd_alloc(void)  { -	return (pgd_t *)__get_free_page(PGALLOC_GFP); +	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);  }  static inline void _pgd_free(pgd_t *pgd)  { -	free_page((unsigned long)pgd); +	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);  }  #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6b9bf023a700..c3c5274410a9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,6 +10,7 @@  #include <linux/pagemap.h>  #include <linux/spinlock.h> +#include <asm/cpu_entry_area.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..ce38f165489b --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,368 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + *	https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + *		       Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> + +#undef pr_fmt +#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK	0 +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ +	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ +	char arg[5]; +	int ret; + +	if (hypervisor_is_type(X86_HYPER_XEN_PV)) { +		pti_print_if_insecure("disabled on XEN PV."); +		return; +	} + +	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); +	if (ret > 0)  { +		if (ret == 3 && !strncmp(arg, "off", 3)) { +			pti_print_if_insecure("disabled on command line."); +			return; +		} +		if (ret == 2 && !strncmp(arg, "on", 2)) { +			pti_print_if_secure("force enabled on command line."); +			goto enable; +		} +		if (ret == 4 && !strncmp(arg, "auto", 4)) +			goto autosel; +	} + +	if (cmdline_find_option_bool(boot_command_line, "nopti")) { +		pti_print_if_insecure("disabled on command line."); +		return; +	} + +autosel: +	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) +		return; +enable: +	setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ +	/* +	 * Changes to the high (kernel) portion of the kernelmode page +	 * tables are not automatically propagated to the usermode tables. +	 * +	 * Users should keep in mind that, unlike the kernelmode tables, +	 * there is no vmalloc_fault equivalent for the usermode tables. +	 * Top-level entries added to init_mm's usermode pgd after boot +	 * will not be automatically propagated to other mms. 
+	 */ +	if (!pgdp_maps_userspace(pgdp)) +		return pgd; + +	/* +	 * The user page tables get the full PGD, accessible from +	 * userspace: +	 */ +	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + +	/* +	 * If this is normal user memory, make it NX in the kernel +	 * pagetables so that, if we somehow screw up and return to +	 * usermode with the kernel CR3 loaded, we'll get a page fault +	 * instead of allowing user code to execute with the wrong CR3. +	 * +	 * As exceptions, we don't set NX if: +	 *  - _PAGE_USER is not set.  This could be an executable +	 *     EFI runtime mapping or something similar, and the kernel +	 *     may execute from it +	 *  - we don't have NX support +	 *  - we're clearing the PGD (i.e. the new pgd is not present). +	 */ +	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && +	    (__supported_pte_mask & _PAGE_NX)) +		pgd.pgd |= _PAGE_NX; + +	/* return the copy of the PGD we want the kernel to use: */ +	return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ +	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +	if (address < PAGE_OFFSET) { +		WARN_ONCE(1, "attempt to walk user address\n"); +		return NULL; +	} + +	if (pgd_none(*pgd)) { +		unsigned long new_p4d_page = __get_free_page(gfp); +		if (!new_p4d_page) +			return NULL; + +		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); +	} +	BUILD_BUG_ON(pgd_large(*pgd) != 0); + +	return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); +	p4d_t *p4d = pti_user_pagetable_walk_p4d(address); +	pud_t *pud; + +	BUILD_BUG_ON(p4d_large(*p4d) != 0); +	if (p4d_none(*p4d)) { +		unsigned long new_pud_page = __get_free_page(gfp); +		if (!new_pud_page) +			return NULL; + +		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); +	} + +	pud = pud_offset(p4d, address); +	/* The user page tables do not use large mappings: */ +	if (pud_large(*pud)) { +		WARN_ON(1); +		return NULL; +	} +	if (pud_none(*pud)) { +		unsigned long new_pmd_page = __get_free_page(gfp); +		if (!new_pmd_page) +			return NULL; + +		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); +	} + +	return pmd_offset(pud, address); +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down.  Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables.  It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ +	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); +	pmd_t *pmd = pti_user_pagetable_walk_pmd(address); +	pte_t *pte; + +	/* We can't do anything sensible if we hit a large mapping. 
*/ +	if (pmd_large(*pmd)) { +		WARN_ON(1); +		return NULL; +	} + +	if (pmd_none(*pmd)) { +		unsigned long new_pte_page = __get_free_page(gfp); +		if (!new_pte_page) +			return NULL; + +		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); +	} + +	pte = pte_offset_kernel(pmd, address); +	if (pte_flags(*pte) & _PAGE_USER) { +		WARN_ONCE(1, "attempt to walk to user pte\n"); +		return NULL; +	} +	return pte; +} + +static void __init pti_setup_vsyscall(void) +{ +	pte_t *pte, *target_pte; +	unsigned int level; + +	pte = lookup_address(VSYSCALL_ADDR, &level); +	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) +		return; + +	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); +	if (WARN_ON(!target_pte)) +		return; + +	*target_pte = *pte; +	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ +	unsigned long addr; + +	/* +	 * Clone the populated PMDs which cover start to end. These PMD areas +	 * can have holes. +	 */ +	for (addr = start; addr < end; addr += PMD_SIZE) { +		pmd_t *pmd, *target_pmd; +		pgd_t *pgd; +		p4d_t *p4d; +		pud_t *pud; + +		pgd = pgd_offset_k(addr); +		if (WARN_ON(pgd_none(*pgd))) +			return; +		p4d = p4d_offset(pgd, addr); +		if (WARN_ON(p4d_none(*p4d))) +			return; +		pud = pud_offset(p4d, addr); +		if (pud_none(*pud)) +			continue; +		pmd = pmd_offset(pud, addr); +		if (pmd_none(*pmd)) +			continue; + +		target_pmd = pti_user_pagetable_walk_pmd(addr); +		if (WARN_ON(!target_pmd)) +			return; + +		/* +		 * Copy the PMD.  That is, the kernelmode and usermode +		 * tables will share the last-level page tables of this +		 * address range +		 */ +		*target_pmd = pmd_clear_flags(*pmd, clear); +	} +} + +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ +	p4d_t *kernel_p4d, *user_p4d; +	pgd_t *kernel_pgd; + +	user_p4d = pti_user_pagetable_walk_p4d(addr); +	kernel_pgd = pgd_offset_k(addr); +	kernel_p4d = p4d_offset(kernel_pgd, addr); +	*user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ +	pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 +	pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ +	pti_clone_pmds((unsigned long) __entry_text_start, +			(unsigned long) __irqentry_text_end, +		       _PAGE_RW | _PAGE_GLOBAL); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ +	if (!static_cpu_has(X86_FEATURE_PTI)) +		return; + +	pr_info("enabled\n"); + +	pti_clone_user_shared(); +	pti_clone_entry_text(); +	pti_setup_espfix64(); +	pti_setup_vsyscall(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3118392cdf75..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@   *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi   */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts.  
We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ +	u16 asid; + +	/* +	 * This is only expected to be set if we have disabled +	 * kernel _PAGE_GLOBAL pages. +	 */ +	if (!static_cpu_has(X86_FEATURE_PTI)) { +		WARN_ON_ONCE(1); +		return; +	} + +	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { +		/* Do not need to flush the current asid */ +		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) +			continue; +		/* +		 * Make sure the next time we go to switch to +		 * this asid, we do a flush: +		 */ +		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); +	} +	this_cpu_write(cpu_tlbstate.invalidate_other, false); +} +  atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,  		return;  	} +	if (this_cpu_read(cpu_tlbstate.invalidate_other)) +		clear_asid_other(); +  	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {  		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=  		    next->context.ctx_id) @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,  	*need_flush = true;  } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ +	unsigned long new_mm_cr3; + +	if (need_flush) { +		invalidate_user_asid(new_asid); +		new_mm_cr3 = build_cr3(pgdir, new_asid); +	} else { +		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); +	} + +	/* +	 * Caution: many callers of this function expect +	 * that load_cr3() is serializing and orders TLB +	 * fills with respect to the mm_cpumask writes. +	 */ +	write_cr3(new_mm_cr3); +} +  void leave_mm(int cpu)  {  	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -128,7 +182,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  	 * isn't free.  	 */  #ifdef CONFIG_DEBUG_VM -	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { +	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {  		/*  		 * If we were to BUG here, we'd be very likely to kill  		 * the system so hard that we don't see the call trace. @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  		if (need_flush) {  			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);  			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); -			write_cr3(build_cr3(next, new_asid)); +			load_new_mm_cr3(next->pgd, new_asid, true);  			/*  			 * NB: This gets called via leave_mm() in the idle path @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,  			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);  		} else {  			/* The new ASID is already up to date. */ -			write_cr3(build_cr3_noflush(next, new_asid)); +			load_new_mm_cr3(next->pgd, new_asid, false);  			/* See above wrt _rcuidle. */  			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); @@ -288,7 +342,7 @@ void initialize_tlbstate_and_flush(void)  		!(cr4_read_shadow() & X86_CR4_PCIDE));  	/* Force ASID 0 and force a TLB flush. */ -	write_cr3(build_cr3(mm, 0)); +	write_cr3(build_cr3(mm->pgd, 0));  	/* Reinitialize tlbstate. 
*/  	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); @@ -551,7 +605,7 @@ static void do_kernel_range_flush(void *info)  	/* flush range by one by one 'invlpg' */  	for (addr = f->start; addr < f->end; addr += PAGE_SIZE) -		__flush_tlb_single(addr); +		__flush_tlb_one(addr);  }  void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index bb461cfd01ab..526536c81ddc 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -97,7 +97,7 @@ static int __init broadcom_postcore_init(void)  	 * We should get host bridge information from ACPI unless the BIOS  	 * doesn't support it.  	 */ -	if (acpi_os_get_root_pointer()) +	if (!acpi_disabled && acpi_os_get_root_pointer())  		return 0;  #endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7a5350d08cef..563049c483a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str)  	} else if (!strcmp(str, "nocrs")) {  		pci_probe |= PCI_ROOT_NO_CRS;  		return NULL; +#ifdef CONFIG_PHYS_ADDR_T_64BIT +	} else if (!strcmp(str, "big_root_window")) { +		pci_probe |= PCI_BIG_ROOT_WINDOW; +		return NULL; +#endif  	} else if (!strcmp(str, "earlydump")) {  		pci_early_dump_regs = 1;  		return NULL; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 1e996df687a3..f6a26e3cb476 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,9 +662,23 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);   */  static void pci_amd_enable_64bit_bar(struct pci_dev *dev)  { -	unsigned i;  	u32 base, limit, high; -	struct resource *res, *conflict; +	struct pci_dev *other; +	struct resource *res; +	unsigned i; +	int r; + +	if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) +		return; + +	/* Check that we are the only device of that type */ +	other = pci_get_device(dev->vendor, dev->device, NULL); +	if (other != dev || +	    (other = pci_get_device(dev->vendor, dev->device, other))) { +		/* This is a multi-socket system, don't touch it for now */ +		pci_dev_put(other); +		return; +	}  	for (i = 0; i < 8; i++) {  		pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base); @@ -689,17 +703,25 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)  	if (!res)  		return; +	/* +	 * Allocate a 256GB window directly below the 0xfd00000000 hardware +	 * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). 
+	 */  	res->name = "PCI Bus 0000:00";  	res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |  		IORESOURCE_MEM_64 | IORESOURCE_WINDOW; -	res->start = 0x100000000ull; +	res->start = 0xbd00000000ull;  	res->end = 0xfd00000000ull - 1; -	/* Just grab the free area behind system memory for this */ -	while ((conflict = request_resource_conflict(&iomem_resource, res))) -		res->start = conflict->end + 1; +	r = request_resource(&iomem_resource, res); +	if (r) { +		kfree(res); +		return; +	} -	dev_info(&dev->dev, "adding root bus resource %pR\n", res); +	dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", +		 res); +	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);  	base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |  		AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; @@ -714,10 +736,10 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)  	pci_bus_add_resource(dev->bus, res, 0);  } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);  #endif diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6a151ce70e86..2dd15e967c3f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -135,7 +135,9 @@ pgd_t * __init efi_call_phys_prolog(void)  				pud[j] = *pud_offset(p4d_k, vaddr);  			}  		} +		pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;  	} +  out:  	__flush_tlb_all(); @@ -196,6 +198,9 @@ static pgd_t *efi_pgd;   * because we want to avoid inserting EFI region mappings (EFI_VA_END   * to EFI_VA_START) into the standard kernel page tables. Everything   * else can be shared, see efi_sync_low_kernel_mappings(). + * + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the + * allocation.   */  int __init efi_alloc_page_tables(void)  { @@ -208,7 +213,7 @@ int __init efi_alloc_page_tables(void)  		return 0;  	gfp_mask = GFP_KERNEL | __GFP_ZERO; -	efi_pgd = (pgd_t *)__get_free_page(gfp_mask); +	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);  	if (!efi_pgd)  		return -ENOMEM; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 8a99a2e96537..5b513ccffde4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,  	/*  	 * Update the first page pointer to skip over the CSH header.  	 */ -	cap_info->pages[0] += csh->headersize; +	cap_info->phys[0] += csh->headersize; + +	/* +	 * cap_info->capsule should point at a virtual mapping of the entire +	 * capsule, starting at the capsule header. Our image has the Quark +	 * security header prepended, so we cannot rely on the default vmap() +	 * mapping created by the generic capsule code. 
+	 * Given that the Quark firmware does not appear to care about the +	 * virtual mapping, let's just point cap_info->capsule at our copy +	 * of the capsule header. +	 */ +	cap_info->capsule = &cap_info->header;  	return 1;  } diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index dc036e511f48..5a0483e7bf66 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)  	return 0;  } -static const struct bt_sfi_data tng_bt_sfi_data __initdata = { +static struct bt_sfi_data tng_bt_sfi_data __initdata = {  	.setup	= tng_bt_sfi_setup,  }; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index f44c0bc95aa2..8538a6723171 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,  		local_flush_tlb();  		stat->d_alltlb++;  	} else { -		__flush_tlb_one(msg->address); +		__flush_tlb_single(msg->address);  		stat->d_onetlb++;  	}  	stat->d_requestee++; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 5f6fd860820a..e4cb9f4cde8a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq,   * on the specified blade to allow the sending of MSIs to the specified CPU.   */  static int uv_domain_activate(struct irq_domain *domain, -			      struct irq_data *irq_data, bool early) +			      struct irq_data *irq_data, bool reserve)  {  	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);  	return 0; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c34bd8233f7c..5f64f30873e2 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -905,7 +905,7 @@ static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)  /*   * UV NMI handler   */ -int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) +static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)  {  	struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi;  	int cpu = smp_processor_id(); @@ -1013,7 +1013,7 @@ void uv_nmi_init(void)  }  /* Setup HUB NMI info */ -void __init uv_nmi_setup_common(bool hubbed) +static void __init uv_nmi_setup_common(bool hubbed)  {  	int size = sizeof(void *) * (1 << NODES_SHIFT);  	int cpu; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 84fcfde53f8f..a7d966964c6f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt)  	/*  	 * descriptor tables  	 */ -#ifdef CONFIG_X86_32  	store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ -	store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif +  	/*  	 * We save it here, but restore it only in the hibernate case.  	 
* For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -103,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt)  	/*  	 * segment registers  	 */ -#ifdef CONFIG_X86_32 -	savesegment(es, ctxt->es); -	savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS  	savesegment(gs, ctxt->gs); -	savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ -	asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); -	asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); -	asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); -	asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); -	asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 +	savesegment(gs, ctxt->gs); +	savesegment(fs, ctxt->fs); +	savesegment(ds, ctxt->ds); +	savesegment(es, ctxt->es);  	rdmsrl(MSR_FS_BASE, ctxt->fs_base); -	rdmsrl(MSR_GS_BASE, ctxt->gs_base); -	rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +	rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); +	rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);  	mtrr_save_fixed_ranges(NULL);  	rdmsrl(MSR_EFER, ctxt->efer); @@ -160,17 +152,19 @@ static void do_fpu_end(void)  static void fix_processor_context(void)  {  	int cpu = smp_processor_id(); -	struct tss_struct *t = &per_cpu(cpu_tss, cpu);  #ifdef CONFIG_X86_64  	struct desc_struct *desc = get_cpu_gdt_rw(cpu);  	tss_desc tss;  #endif -	set_tss_desc(cpu, t);	/* -				 * This just modifies memory; should not be -				 * necessary. But... This is necessary, because -				 * 386 hardware has concept of busy TSS or some -				 * similar stupidity. -				 */ + +	/* +	 * We need to reload TR, which requires that we change the +	 * GDT entry to indicate "available" first. +	 * +	 * XXX: This could probably all be replaced by a call to +	 * force_reload_TR(). +	 */ +	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);  #ifdef CONFIG_X86_64  	memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); @@ -178,6 +172,9 @@ static void fix_processor_context(void)  	write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);  	syscall_init();				/* This sets MSR_*STAR and related */ +#else +	if (boot_cpu_has(X86_FEATURE_SEP)) +		enable_sep_cpu();  #endif  	load_TR_desc();				/* This does ltr */  	load_mm_ldt(current->active_mm);	/* This does lldt */ @@ -190,9 +187,12 @@ static void fix_processor_context(void)  }  /** - *	__restore_processor_state - restore the contents of CPU registers saved - *		by __save_processor_state() - *	@ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + *                             by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias.   */  static void notrace __restore_processor_state(struct saved_context *ctxt)  { @@ -215,46 +215,52 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)  	write_cr2(ctxt->cr2);  	write_cr0(ctxt->cr0); +	/* Restore the IDT. */ +	load_idt(&ctxt->idt); +  	/* -	 * now restore the descriptor tables to their proper values -	 * ltr is done i fix_processor_context(). +	 * Just in case the asm code got us here with the SS, DS, or ES +	 * out of sync with the GDT, update them.  	 */ -#ifdef CONFIG_X86_32 -	load_idt(&ctxt->idt); +	loadsegment(ss, __KERNEL_DS); +	loadsegment(ds, __USER_DS); +	loadsegment(es, __USER_DS); + +	/* +	 * Restore percpu access.  
Percpu access can happen in exception +	 * handlers or in complicated helpers like load_gs_index(). +	 */ +#ifdef CONFIG_X86_64 +	wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);  #else -/* CONFIG_X86_64 */ -	load_idt((const struct desc_ptr *)&ctxt->idt_limit); +	loadsegment(fs, __KERNEL_PERCPU); +	loadsegment(gs, __KERNEL_STACK_CANARY);  #endif +	/* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ +	fix_processor_context(); +  	/* -	 * segment registers +	 * Now that we have descriptor tables fully restored and working +	 * exception handling, restore the usermode segments.  	 */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 +	loadsegment(ds, ctxt->es);  	loadsegment(es, ctxt->es);  	loadsegment(fs, ctxt->fs); -	loadsegment(gs, ctxt->gs); -	loadsegment(ss, ctxt->ss); +	load_gs_index(ctxt->gs);  	/* -	 * sysenter MSRs +	 * Restore FSBASE and GSBASE after restoring the selectors, since +	 * restoring the selectors clobbers the bases.  Keep in mind +	 * that MSR_KERNEL_GS_BASE is horribly misnamed.  	 */ -	if (boot_cpu_has(X86_FEATURE_SEP)) -		enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ -	asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); -	asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); -	asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); -	load_gs_index(ctxt->gs); -	asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); -  	wrmsrl(MSR_FS_BASE, ctxt->fs_base); -	wrmsrl(MSR_GS_BASE, ctxt->gs_base); -	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); +	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) +	loadsegment(gs, ctxt->gs);  #endif -	fix_processor_context(); -  	do_fpu_end();  	tsc_verify_tsc_adjust(true);  	x86_platform.restore_sched_clock_state(); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 6b830d4cb4c8..de58533d3664 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -57,7 +57,7 @@ static u32 xen_apic_read(u32 reg)  		return 0;  	if (reg == APIC_LVR) -		return 0x10; +		return 0x14;  #ifdef CONFIG_X86_32  	if (reg == APIC_LDR)  		return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d669e9d89001..c9081c6671f0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,8 +1,12 @@ +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +#include <linux/bootmem.h> +#endif  #include <linux/cpu.h>  #include <linux/kexec.h>  #include <xen/features.h>  #include <xen/page.h> +#include <xen/interface/memory.h>  #include <asm/xen/hypercall.h>  #include <asm/xen/hypervisor.h> @@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num)  }  EXPORT_SYMBOL(xen_arch_unregister_cpu);  #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +void __init arch_xen_balloon_init(struct resource *hostmem_resource) +{ +	struct xen_memory_map memmap; +	int rc; +	unsigned int i, last_guest_ram; +	phys_addr_t max_addr = PFN_PHYS(max_pfn); +	struct e820_table *xen_e820_table; +	const struct e820_entry *entry; +	struct resource *res; + +	if (!xen_initial_domain()) +		return; + +	xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL); +	if (!xen_e820_table) +		return; + +	memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries); +	set_xen_guest_handle(memmap.buffer, xen_e820_table->entries); +	rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap); +	if (rc) { +		pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc); +		goto out; +	} + +	last_guest_ram = 0; +	for (i = 0; i < memmap.nr_entries; i++) { +		if (xen_e820_table->entries[i].addr >= max_addr) +			break; 
+		if (xen_e820_table->entries[i].type == E820_TYPE_RAM) +			last_guest_ram = i; +	} + +	entry = &xen_e820_table->entries[last_guest_ram]; +	if (max_addr >= entry->addr + entry->size) +		goto out; /* No unallocated host RAM. */ + +	hostmem_resource->start = max_addr; +	hostmem_resource->end = entry->addr + entry->size; + +	/* +	 * Mark non-RAM regions between the end of dom0 RAM and end of host RAM +	 * as unavailable. The rest of that region can be used for hotplug-based +	 * ballooning. +	 */ +	for (; i < memmap.nr_entries; i++) { +		entry = &xen_e820_table->entries[i]; + +		if (entry->type == E820_TYPE_RAM) +			continue; + +		if (entry->addr >= hostmem_resource->end) +			break; + +		res = kzalloc(sizeof(*res), GFP_KERNEL); +		if (!res) +			goto out; + +		res->name = "Unavailable host RAM"; +		res->start = entry->addr; +		res->end = (entry->addr + entry->size < hostmem_resource->end) ? +			    entry->addr + entry->size : hostmem_resource->end; +		rc = insert_resource(hostmem_resource, res); +		if (rc) { +			pr_warn("%s: Can't insert [%llx - %llx) (%d)\n", +				__func__, res->start, res->end, rc); +			kfree(res); +			goto  out; +		} +	} + + out: +	kfree(xen_e820_table); +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5b2b3f3f6531..c047f42552e1 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -88,6 +88,8 @@  #include "multicalls.h"  #include "pmu.h" +#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ +  void *xen_initial_gdt;  static int xen_cpu_up_prepare_pv(unsigned int cpu); @@ -622,7 +624,7 @@ static struct trap_array_entry trap_array[] = {  	{ simd_coprocessor_error,      xen_simd_coprocessor_error,      false },  }; -static bool get_trap_addr(void **addr, unsigned int ist) +static bool __ref get_trap_addr(void **addr, unsigned int ist)  {  	unsigned int nr;  	bool ist_okay = false; @@ -644,6 +646,14 @@ static bool get_trap_addr(void **addr, unsigned int ist)  		}  	} +	if (nr == ARRAY_SIZE(trap_array) && +	    *addr >= (void *)early_idt_handler_array[0] && +	    *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) { +		nr = (*addr - (void *)early_idt_handler_array[0]) / +		     EARLY_IDT_HANDLER_SIZE; +		*addr = (void *)xen_early_idt_handler_array[nr]; +	} +  	if (WARN_ON(ist != 0 && !ist_okay))  		return false; @@ -818,7 +828,7 @@ static void xen_load_sp0(unsigned long sp0)  	mcs = xen_mc_entry(0);  	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);  	xen_mc_issue(PARAVIRT_LAZY_CPU); -	this_cpu_write(cpu_tss.x86_tss.sp0, sp0); +	this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);  }  void xen_set_iopl_mask(unsigned mask) @@ -1250,6 +1260,7 @@ asmlinkage __visible void __init xen_start_kernel(void)  	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;  	/* Work out if we support NX */ +	get_cpu_cap(&boot_cpu_data);  	x86_configure_nx();  	/* Get mfn list */ @@ -1262,6 +1273,21 @@ asmlinkage __visible void __init xen_start_kernel(void)  	xen_setup_gdt(0);  	xen_init_irq_ops(); + +	/* Let's presume PV guests always boot on vCPU with id 0. */ +	per_cpu(xen_vcpu_id, 0) = 0; + +	/* +	 * Setup xen_vcpu early because idt_setup_early_handler needs it for +	 * local_irq_disable(), irqs_disabled(). +	 * +	 * Don't do the full vcpu_info placement stuff until we have +	 * the cpu_possible_mask and a non-dummy shared_info. 
+	 */ +	xen_vcpu_info_reset(0); + +	idt_setup_early_handler(); +  	xen_init_capabilities();  #ifdef CONFIG_X86_LOCAL_APIC @@ -1295,18 +1321,6 @@ asmlinkage __visible void __init xen_start_kernel(void)  	 */  	acpi_numa = -1;  #endif -	/* Let's presume PV guests always boot on vCPU with id 0. */ -	per_cpu(xen_vcpu_id, 0) = 0; - -	/* -	 * Setup xen_vcpu early because start_kernel needs it for -	 * local_irq_disable(), irqs_disabled(). -	 * -	 * Don't do the full vcpu_info placement stuff until we have -	 * the cpu_possible_mask and a non-dummy shared_info. -	 */ -	xen_vcpu_info_reset(0); -  	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));  	local_irq_disable(); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index fc048ec686e7..d85076223a69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1325,20 +1325,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  {  	struct {  		struct mmuext_op op; -#ifdef CONFIG_SMP -		DECLARE_BITMAP(mask, num_processors); -#else  		DECLARE_BITMAP(mask, NR_CPUS); -#endif  	} *args;  	struct multicall_space mcs; +	const size_t mc_entry_size = sizeof(args->op) + +		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());  	trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);  	if (cpumask_empty(cpus))  		return;		/* nothing to do */ -	mcs = xen_mc_entry(sizeof(*args)); +	mcs = xen_mc_entry(mc_entry_size);  	args = mcs.args;  	args->op.arg2.vcpumask = to_cpumask(args->mask); @@ -1902,6 +1900,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)  	/* Graft it onto L4[511][510] */  	copy_page(level2_kernel_pgt, l2); +	/* +	 * Zap execute permission from the ident map. Due to the sharing of +	 * L1 entries we need to do this in the L2. +	 */ +	if (__supported_pte_mask & _PAGE_NX) { +		for (i = 0; i < PTRS_PER_PMD; ++i) { +			if (pmd_none(level2_ident_pgt[i])) +				continue; +			level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX); +		} +	} +  	/* Copy the initial P->M table mappings if necessary. */  	i = pgd_index(xen_start_info->mfn_list);  	if (i && i < pgd_index(__START_KERNEL_map)) @@ -2261,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)  	switch (idx) {  	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -	case FIX_RO_IDT:  #ifdef CONFIG_X86_32  	case FIX_WP_TEST:  # ifdef CONFIG_HIGHMEM @@ -2272,7 +2281,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)  #endif  	case FIX_TEXT_POKE0:  	case FIX_TEXT_POKE1: -	case FIX_GDT_REMAP_BEGIN ... 
FIX_GDT_REMAP_END:  		/* All local page mappings */  		pte = pfn_pte(phys, prot);  		break; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c114ca767b3b..6e0d2086eacb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -808,7 +808,6 @@ char * __init xen_memory_setup(void)  	addr = xen_e820_table.entries[0].addr;  	size = xen_e820_table.entries[0].size;  	while (i < xen_e820_table.nr_entries) { -		bool discard = false;  		chunk_size = size;  		type = xen_e820_table.entries[i].type; @@ -824,11 +823,10 @@ char * __init xen_memory_setup(void)  				xen_add_extra_mem(pfn_s, n_pfns);  				xen_max_p2m_pfn = pfn_s + n_pfns;  			} else -				discard = true; +				type = E820_TYPE_UNUSABLE;  		} -		if (!discard) -			xen_align_and_add_e820_region(addr, chunk_size, type); +		xen_align_and_add_e820_region(addr, chunk_size, type);  		addr += chunk_size;  		size -= chunk_size; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 8a10c9a9e2b5..417b339e5c8e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@  #include <xen/interface/xen.h> +#include <linux/init.h>  #include <linux/linkage.h>  .macro xen_pv_trap name @@ -54,6 +55,19 @@ xen_pv_trap entry_INT80_compat  #endif  xen_pv_trap hypervisor_callback +	__INIT +ENTRY(xen_early_idt_handler_array) +	i = 0 +	.rept NUM_EXCEPTION_VECTORS +	pop %rcx +	pop %r11 +	jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE +	i = i + 1 +	.fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc +	.endr +END(xen_early_idt_handler_array) +	__FINIT +  hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32  /*   * Xen64 iret frame: diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 75011b80660f..3b34745d0a52 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -72,7 +72,7 @@ u64 xen_clocksource_read(void);  void xen_setup_cpu_clockevents(void);  void xen_save_time_memory_area(void);  void xen_restore_time_memory_area(void); -void __init xen_init_time_ops(void); +void __ref xen_init_time_ops(void);  void __init xen_hvm_init_time_ops(void);  irqreturn_t xen_debug_interrupt(int irq, void *dev_id); diff --git a/arch/xtensa/include/uapi/asm/Kbuild b/arch/xtensa/include/uapi/asm/Kbuild index a5bcdfb890f1..837d4dd76785 100644 --- a/arch/xtensa/include/uapi/asm/Kbuild +++ b/arch/xtensa/include/uapi/asm/Kbuild @@ -2,6 +2,7 @@  include include/uapi/asm-generic/Kbuild.asm  generic-y += bitsperlong.h +generic-y += bpf_perf_event.h  generic-y += errno.h  generic-y += fcntl.h  generic-y += ioctl.h  | 
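
A minimal, standalone sketch (not part of the patch) of the boot-time decision added in arch/x86/mm/pti.c above: pti=off, pti=on, pti=auto, defaulting to "on only if the CPU is affected". The helpers cmdline_option() and cpu_is_vulnerable() are stand-ins invented for this sketch; the real pti_check_boottime_disable() uses cmdline_find_option(), X86_BUG_CPU_MELTDOWN, and additionally handles Xen PV and the legacy "nopti" switch, as shown in the hunk.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins for cmdline_find_option() and boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN). */
static const char *cmdline_option(void)  { return "auto"; }
static bool cpu_is_vulnerable(void)      { return true; }

static bool pti_enabled(void)
{
	const char *arg = cmdline_option();

	if (arg && !strcmp(arg, "off"))
		return false;		/* explicit opt-out */
	if (arg && !strcmp(arg, "on"))
		return true;		/* forced on, even on unaffected CPUs */

	/* "auto" or no option: enable only when the CPU is affected */
	return cpu_is_vulnerable();
}

int main(void)
{
	printf("page table isolation: %s\n", pti_enabled() ? "enabled" : "disabled");
	return 0;
}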
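
The _pgd_alloc()/PGD_ALLOCATION_ORDER and __pti_set_user_pgd() hunks above imply that, with PTI, each mm carries two page-table roots: a kernel PGD and a user PGD kept together in one order-1 (8 KiB) allocation. The sketch below (not part of the patch) illustrates that pairing under the assumption that the user half is simply the second 4 KiB page of the pair, so switching between the two halves is a single address-bit flip; the helper name and bit choice are stand-ins for illustration, since kernel_to_user_pgdp()'s definition is not part of this diff.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

typedef struct { uint64_t pgd; } pgd_t;

/* Assumed layout: kernel PGD in the low 4 KiB page, user PGD in the high one. */
static pgd_t *kernel_to_user_pgdp_sketch(pgd_t *pgdp)
{
	/* toggle the bit that selects the second 4 KiB page of the pair */
	return (pgd_t *)((uintptr_t)pgdp ^ PAGE_SIZE);
}

int main(void)
{
	/* order-1, 8 KiB-aligned allocation standing in for _pgd_alloc() */
	pgd_t *kernel_pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
	pgd_t *user_pgd   = kernel_to_user_pgdp_sketch(kernel_pgd);

	printf("kernel pgd %p, user pgd %p\n", (void *)kernel_pgd, (void *)user_pgd);
	free(kernel_pgd);
	return 0;
}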
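
A quick arithmetic check on the pci_amd_enable_64bit_bar() change above: the new window runs from 0xbd00000000 up to (but not including) 0xfd00000000, i.e. 0xfd00000000 - 0xbd00000000 = 0x4000000000 bytes = 2^38 bytes = 256 GiB, which matches the "256GB window directly below the 0xfd00000000 hardware limit" comment in the hunk.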