55 files changed, 1164 insertions, 389 deletions
diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug
index 80815a5229a1..eea79125b3ea 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -36,7 +36,6 @@ config DRM_I915_DEBUG
 	select DRM_I915_SELFTEST
 	select DRM_I915_DEBUG_RUNTIME_PM
 	select DRM_I915_DEBUG_MMIO
-	select BROKEN # for prototype uAPI
 	default n
 	help
 	  Choose this option to turn on extra driver debugging that may affect
diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
index 1799537a3228..c280b6ae38eb 100644
--- a/drivers/gpu/drm/i915/Kconfig.profile
+++ b/drivers/gpu/drm/i915/Kconfig.profile
@@ -25,7 +25,7 @@ config DRM_I915_HEARTBEAT_INTERVAL
 
 config DRM_I915_PREEMPT_TIMEOUT
 	int "Preempt timeout (ms, jiffy granularity)"
-	default 100 # milliseconds
+	default 640 # milliseconds
 	help
 	  How long to wait (in milliseconds) for a preemption event to occur
 	  when submitting a new context via execlists. If the current context
diff --git a/drivers/gpu/drm/i915/display/intel_atomic.c b/drivers/gpu/drm/i915/display/intel_atomic.c
index 9cd6d2348a1e..c2875b10adf9 100644
--- a/drivers/gpu/drm/i915/display/intel_atomic.c
+++ b/drivers/gpu/drm/i915/display/intel_atomic.c
@@ -200,6 +200,7 @@ intel_crtc_duplicate_state(struct drm_crtc *crtc)
 	crtc_state->update_wm_pre = false;
 	crtc_state->update_wm_post = false;
 	crtc_state->fifo_changed = false;
+	crtc_state->preload_luts = false;
 	crtc_state->wm.need_postvbl_update = false;
 	crtc_state->fb_bits = 0;
 	crtc_state->update_planes = 0;
diff --git a/drivers/gpu/drm/i915/display/intel_cdclk.c b/drivers/gpu/drm/i915/display/intel_cdclk.c
index 0caef2592a7e..ed8c7ce62119 100644
--- a/drivers/gpu/drm/i915/display/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/display/intel_cdclk.c
@@ -1273,7 +1273,9 @@ static u8 icl_calc_voltage_level(int cdclk)
 
 static u8 ehl_calc_voltage_level(int cdclk)
 {
-	if (cdclk > 312000)
+	if (cdclk > 326400)
+		return 3;
+	else if (cdclk > 312000)
 		return 2;
 	else if (cdclk > 180000)
 		return 1;
diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c
index fa44eb73d088..aa3a063549c3 100644
--- a/drivers/gpu/drm/i915/display/intel_color.c
+++ b/drivers/gpu/drm/i915/display/intel_color.c
@@ -1022,6 +1022,55 @@ void intel_color_commit(const struct intel_crtc_state *crtc_state)
 	dev_priv->display.color_commit(crtc_state);
 }
 
+static bool intel_can_preload_luts(const struct intel_crtc_state *new_crtc_state)
+{
+	struct intel_crtc *crtc = to_intel_crtc(new_crtc_state->base.crtc);
+	struct intel_atomic_state *state =
+		to_intel_atomic_state(new_crtc_state->base.state);
+	const struct intel_crtc_state *old_crtc_state =
+		intel_atomic_get_old_crtc_state(state, crtc);
+
+	return !old_crtc_state->base.gamma_lut &&
+		!old_crtc_state->base.degamma_lut;
+}
+
+static bool chv_can_preload_luts(const struct intel_crtc_state *new_crtc_state)
+{
+	struct intel_crtc *crtc = to_intel_crtc(new_crtc_state->base.crtc);
+	struct intel_atomic_state *state =
+		to_intel_atomic_state(new_crtc_state->base.state);
+	const struct intel_crtc_state *old_crtc_state =
+		intel_atomic_get_old_crtc_state(state, crtc);
+
+	/*
+	 * CGM_PIPE_MODE is itself single buffered. We'd have to
+	 * somehow split it out from chv_load_luts() if we wanted
+	 * the ability to preload the CGM LUTs/CSC without tearing.
+	 */
+	if (old_crtc_state->cgm_mode || new_crtc_state->cgm_mode)
+		return false;
+
+	return !old_crtc_state->base.gamma_lut;
+}
+
+static bool glk_can_preload_luts(const struct intel_crtc_state *new_crtc_state)
+{
+	struct intel_crtc *crtc = to_intel_crtc(new_crtc_state->base.crtc);
+	struct intel_atomic_state *state =
+		to_intel_atomic_state(new_crtc_state->base.state);
+	const struct intel_crtc_state *old_crtc_state =
+		intel_atomic_get_old_crtc_state(state, crtc);
+
+	/*
+	 * The hardware degamma is active whenever the pipe
+	 * CSC is active. Thus even if the old state has no
+	 * software degamma we need to avoid clobbering the
+	 * linear hardware degamma mid scanout.
+	 */
+	return !old_crtc_state->csc_enable &&
+		!old_crtc_state->base.gamma_lut;
+}
+
 int intel_color_check(struct intel_crtc_state *crtc_state)
 {
 	struct drm_i915_private *dev_priv = to_i915(crtc_state->base.crtc->dev);
@@ -1165,6 +1214,8 @@ static int i9xx_color_check(struct intel_crtc_state *crtc_state)
 	if (ret)
 		return ret;
 
+	crtc_state->preload_luts = intel_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
@@ -1217,6 +1268,8 @@ static int chv_color_check(struct intel_crtc_state *crtc_state)
 	if (ret)
 		return ret;
 
+	crtc_state->preload_luts = chv_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
@@ -1271,6 +1324,8 @@ static int ilk_color_check(struct intel_crtc_state *crtc_state)
 	if (ret)
 		return ret;
 
+	crtc_state->preload_luts = intel_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
@@ -1328,6 +1383,8 @@ static int ivb_color_check(struct intel_crtc_state *crtc_state)
 	if (ret)
 		return ret;
 
+	crtc_state->preload_luts = intel_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
@@ -1366,6 +1423,8 @@ static int glk_color_check(struct intel_crtc_state *crtc_state)
 	if (ret)
 		return ret;
 
+	crtc_state->preload_luts = glk_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
@@ -1415,6 +1474,8 @@ static int icl_color_check(struct intel_crtc_state *crtc_state)
 
 	crtc_state->csc_mode = icl_csc_mode(crtc_state);
 
+	crtc_state->preload_luts = intel_can_preload_luts(crtc_state);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index b51f244ad7a5..c7c2b349858d 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -593,7 +593,7 @@ struct tgl_dkl_phy_ddi_buf_trans {
 	u32 dkl_de_emphasis_control;
 };
 
-static const struct tgl_dkl_phy_ddi_buf_trans tgl_dkl_phy_ddi_translations[] = {
+static const struct tgl_dkl_phy_ddi_buf_trans tgl_dkl_phy_dp_ddi_trans[] = {
 				/* VS	pre-emp	Non-trans mV	Pre-emph dB */
 	{ 0x7, 0x0, 0x00 },	/* 0	0	400mV		0 dB */
 	{ 0x5, 0x0, 0x03 },	/* 0	1	400mV		3.5 dB */
@@ -607,6 +607,20 @@ static const struct tgl_dkl_phy_ddi_buf_trans tgl_dkl_phy_ddi_translations[] = {
 	{ 0x0, 0x0, 0x00 },	/* 3	0	1200mV		0 dB HDMI default */
 };
 
+static const struct tgl_dkl_phy_ddi_buf_trans tgl_dkl_phy_hdmi_ddi_trans[] = {
+				/* HDMI Preset	VS	Pre-emph */
+	{ 0x7, 0x0, 0x0 },	/* 1		400mV	0dB */
+	{ 0x6, 0x0, 0x0 },	/* 2		500mV	0dB */
+	{ 0x4, 0x0, 0x0 },	/* 3		650mV	0dB */
+	{ 0x2, 0x0, 0x0 },	/* 4		800mV	0dB */
+	{ 0x0, 0x0, 0x0 },	/* 5		1000mV	0dB */
+	{ 0x0, 0x0, 0x5 },	/* 6		Full	-1.5 dB */
+	{ 0x0, 0x0, 0x6 },	/* 7		Full	-1.8 dB */
+	{ 0x0, 0x0, 0x7 },	/* 8		Full	-2 dB */
+	{ 0x0, 0x0, 0x8 },	/* 9		Full	-2.5 dB */
+	{ 0x0, 0x0, 0xA },	/* 10		Full	-3 dB */
+};
+
 static const struct ddi_buf_trans *
 bdw_get_buf_trans_edp(struct drm_i915_private *dev_priv, int *n_entries)
 {
@@ -898,7 +912,7 @@ static int intel_ddi_hdmi_level(struct drm_i915_private *dev_priv, enum port por
 			icl_get_combo_buf_trans(dev_priv, INTEL_OUTPUT_HDMI,
 						0, &n_entries);
 		else
-			n_entries = ARRAY_SIZE(tgl_dkl_phy_ddi_translations);
+			n_entries = ARRAY_SIZE(tgl_dkl_phy_hdmi_ddi_trans);
 		default_entry = n_entries - 1;
 	} else if (INTEL_GEN(dev_priv) == 11) {
 		if (intel_phy_is_combo(dev_priv, phy))
@@ -1794,10 +1808,8 @@ void intel_ddi_set_dp_msa(const struct intel_crtc_state *crtc_state,
 	 * of Color Encoding Format and Content Color Gamut] while sending
 	 * YCBCR 420, HDR BT.2020 signals we should program MSA MISC1 fields
 	 * which indicate VSC SDP for the Pixel Encoding/Colorimetry Format.
-	 *
-	 * FIXME MST doesn't pass in the conn_state
 	 */
-	if (conn_state && intel_dp_needs_vsc_sdp(crtc_state, conn_state))
+	if (intel_dp_needs_vsc_sdp(crtc_state, conn_state))
 		temp |= DP_MSA_MISC_COLOR_VSC_SDP;
 
 	I915_WRITE(TRANS_MSA_MISC(cpu_transcoder), temp);
@@ -2373,7 +2385,7 @@ u8 intel_ddi_dp_voltage_max(struct intel_encoder *encoder)
 			icl_get_combo_buf_trans(dev_priv, encoder->type,
 						intel_dp->link_rate, &n_entries);
 		else
-			n_entries = ARRAY_SIZE(tgl_dkl_phy_ddi_translations);
+			n_entries = ARRAY_SIZE(tgl_dkl_phy_dp_ddi_trans);
 	} else if (INTEL_GEN(dev_priv) == 11) {
 		if (intel_phy_is_combo(dev_priv, phy))
 			icl_get_combo_buf_trans(dev_priv, encoder->type,
@@ -2825,8 +2837,13 @@ tgl_dkl_phy_ddi_vswing_sequence(struct intel_encoder *encoder, int link_clock,
 	const struct tgl_dkl_phy_ddi_buf_trans *ddi_translations;
 	u32 n_entries, val, ln, dpcnt_mask, dpcnt_val;
 
-	n_entries = ARRAY_SIZE(tgl_dkl_phy_ddi_translations);
-	ddi_translations = tgl_dkl_phy_ddi_translations;
+	if (encoder->type == INTEL_OUTPUT_HDMI) {
+		n_entries = ARRAY_SIZE(tgl_dkl_phy_hdmi_ddi_trans);
+		ddi_translations = tgl_dkl_phy_hdmi_ddi_trans;
+	} else {
+		n_entries = ARRAY_SIZE(tgl_dkl_phy_dp_ddi_trans);
+		ddi_translations = tgl_dkl_phy_dp_ddi_trans;
+	}
 
 	if (level >= n_entries)
 		level = n_entries - 1;
@@ -3605,7 +3622,11 @@ static void intel_ddi_pre_enable_dp(struct intel_encoder *encoder,
 	else
 		hsw_ddi_pre_enable_dp(encoder, crtc_state, conn_state);
 
-	intel_ddi_set_dp_msa(crtc_state, conn_state);
+	/* MST will call a setting of MSA after an allocating of Virtual Channel
+	 * from MST encoder pre_enable callback.
+	 */
+	if (!intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DP_MST))
+		intel_ddi_set_dp_msa(crtc_state, conn_state);
 }
 
 static void intel_ddi_pre_enable_hdmi(struct intel_encoder *encoder,
diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
index 348ce0456696..6f5e3bd13ad1 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -66,6 +66,7 @@
 #include "intel_cdclk.h"
 #include "intel_color.h"
 #include "intel_display_types.h"
+#include "intel_dp_link_training.h"
 #include "intel_fbc.h"
 #include "intel_fbdev.h"
 #include "intel_fifo_underrun.h"
@@ -2528,6 +2529,9 @@ u32 intel_plane_fb_max_stride(struct drm_i915_private *dev_priv,
 	 * the highest stride limits of them all.
 	 */
 	crtc = intel_get_crtc_for_pipe(dev_priv, PIPE_A);
+	if (!crtc)
+		return 0;
+
 	plane = to_intel_plane(crtc->base.primary);
 
 	return plane->max_stride(plane, pixel_format, modifier,
@@ -14201,6 +14205,11 @@ static void intel_update_crtc(struct intel_crtc *crtc,
 		/* vblanks work again, re-enable pipe CRC. */
 		intel_crtc_enable_pipe_crc(crtc);
 	} else {
+		if (new_crtc_state->preload_luts &&
+		    (new_crtc_state->base.color_mgmt_changed ||
+		     new_crtc_state->update_pipe))
+			intel_color_load_luts(new_crtc_state);
+
 		intel_pre_plane_update(old_crtc_state, new_crtc_state);
 
 		if (new_crtc_state->update_pipe)
@@ -14713,6 +14722,7 @@ static void intel_atomic_commit_tail(struct intel_atomic_state *state)
 	for_each_new_intel_crtc_in_state(state, crtc, new_crtc_state, i) {
 		if (new_crtc_state->base.active &&
 		    !needs_modeset(new_crtc_state) &&
+		    !new_crtc_state->preload_luts &&
 		    (new_crtc_state->base.color_mgmt_changed ||
 		     new_crtc_state->update_pipe))
 			intel_color_load_luts(new_crtc_state);
diff --git a/drivers/gpu/drm/i915/display/intel_display.h b/drivers/gpu/drm/i915/display/intel_display.h
index 355c50088589..f417e0948001 100644
--- a/drivers/gpu/drm/i915/display/intel_display.h
+++ b/drivers/gpu/drm/i915/display/intel_display.h
@@ -27,7 +27,6 @@
 
 #include <drm/drm_util.h>
 #include <drm/i915_drm.h>
-#include "intel_dp_link_training.h"
 
 enum link_m_n_set;
 struct dpll;
diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h
index 4341bd66a418..1a7334dbe802 100644
--- a/drivers/gpu/drm/i915/display/intel_display_types.h
+++ b/drivers/gpu/drm/i915/display/intel_display_types.h
@@ -775,6 +775,7 @@ struct intel_crtc_state {
 	bool disable_cxsr;
 	bool update_wm_pre, update_wm_post; /* watermarks are updated */
 	bool fifo_changed; /* FIFO split is changed */
+	bool preload_luts;
 
 	/* Pipe source size (ie. panel fitter input size)
 	 * All planes will be positioned inside this space,
diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c
index 9ae5b8b6bbbc..03d1cba0b696 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c
@@ -331,6 +331,8 @@ static void intel_mst_pre_enable_dp(struct intel_encoder *encoder,
 	ret = drm_dp_update_payload_part1(&intel_dp->mst_mgr);
 
 	intel_ddi_enable_pipe_clock(pipe_config);
+
+	intel_ddi_set_dp_msa(pipe_config, conn_state);
 }
 
 static void intel_mst_enable_dp(struct intel_encoder *encoder,
diff --git a/drivers/gpu/drm/i915/display/intel_fbdev.c b/drivers/gpu/drm/i915/display/intel_fbdev.c
index 3d1061470e76..48c960ca12fb 100644
--- a/drivers/gpu/drm/i915/display/intel_fbdev.c
+++ b/drivers/gpu/drm/i915/display/intel_fbdev.c
@@ -234,6 +234,11 @@ static int intelfb_create(struct drm_fb_helper *helper,
 	info->apertures->ranges[0].base = ggtt->gmadr.start;
 	info->apertures->ranges[0].size = ggtt->mappable_end;
 
+	/* Our framebuffer is the entirety of fbdev's system memory */
+	info->fix.smem_start =
+		(unsigned long)(ggtt->gmadr.start + vma->node.start);
+	info->fix.smem_len = vma->node.size;
+
 	vaddr = i915_vma_pin_iomap(vma);
 	if (IS_ERR(vaddr)) {
 		DRM_ERROR("Failed to remap framebuffer into virtual memory\n");
@@ -243,10 +248,6 @@ static int intelfb_create(struct drm_fb_helper *helper,
 	info->screen_base = vaddr;
 	info->screen_size = vma->node.size;
 
-	/* Our framebuffer is the entirety of fbdev's system memory */
-	info->fix.smem_start = (unsigned long)info->screen_base;
-	info->fix.smem_len = info->screen_size;
-
 	drm_fb_helper_fill_info(info, &ifbdev->helper, sizes);
 
 	/* If the object is shmemfs backed, it will have given us zeroed pages.
diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c
index edc41fc40726..72fda0430062 100644
--- a/drivers/gpu/drm/i915/display/intel_sprite.c
+++ b/drivers/gpu/drm/i915/display/intel_sprite.c
@@ -2885,7 +2885,7 @@ struct intel_plane *
 skl_universal_plane_create(struct drm_i915_private *dev_priv,
 			   enum pipe pipe, enum plane_id plane_id)
 {
-	static const struct drm_plane_funcs *plane_funcs;
+	const struct drm_plane_funcs *plane_funcs;
 	struct intel_plane *plane;
 	enum drm_plane_type plane_type;
 	unsigned int supported_rotations;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index de6e55af82cf..4237a2887ff2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -236,6 +236,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 	free_engines(rcu_access_pointer(ctx->engines));
 	mutex_destroy(&ctx->engines_mutex);
 
+	kfree(ctx->jump_whitelist);
+
 	if (ctx->timeline)
 		intel_timeline_put(ctx->timeline);
 
@@ -366,7 +368,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
 	if (!ce->timeline)
 		return NULL;
 
-	rcu_read_lock();
+	mutex_lock(&ce->timeline->mutex);
 	list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
 		if (i915_request_completed(rq))
 			break;
@@ -376,7 +378,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
 		if (engine)
 			break;
 	}
-	rcu_read_unlock();
+	mutex_unlock(&ce->timeline->mutex);
 
 	return engine;
 }
@@ -527,6 +529,9 @@ __create_context(struct drm_i915_private *i915)
 	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
 		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
 
+	ctx->jump_whitelist = NULL;
+	ctx->jump_whitelist_cmds = 0;
+
 	spin_lock(&i915->gem.contexts.lock);
 	list_add_tail(&ctx->link, &i915->gem.contexts.list);
 	spin_unlock(&i915->gem.contexts.lock);
@@ -722,6 +727,7 @@ int i915_gem_init_contexts(struct drm_i915_private *i915)
 void i915_gem_driver_release__contexts(struct drm_i915_private *i915)
 {
 	destroy_kernel_context(&i915->kernel_context);
+	flush_work(&i915->gem.contexts.free_work);
 }
 
 static int context_idr_cleanup(int id, void *p, void *data)
@@ -1136,7 +1142,7 @@ static int set_ppgtt(struct drm_i915_file_private *file_priv,
 
 	if (i915_gem_context_is_closed(ctx)) {
 		err = -ENOENT;
-		goto out;
+		goto unlock;
 	}
 
 	if (vm == rcu_access_pointer(ctx->vm))
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 861d7d92fe9f..3870dd5daaa0 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -176,6 +176,13 @@ struct i915_gem_context {
 	 * per vm, which may be one per context or shared with the global GTT)
 	 */
 	struct radix_tree_root handles_vma;
+
+	/** jump_whitelist: Bit array for tracking cmds during cmdparsing
+	 *  Guarded by struct_mutex
+	 */
+	unsigned long *jump_whitelist;
+	/** jump_whitelist_cmds: No of cmd slots available */
+	u32 jump_whitelist_cmds;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index e4f5c269150a..f0998f1225af 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -298,7 +298,9 @@ static inline u64 gen8_noncanonical_addr(u64 address)
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
-	return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len;
+	return intel_engine_requires_cmd_parser(eb->engine) ||
+		(intel_engine_using_cmd_parser(eb->engine) &&
+		 eb->args->batch_len);
 }
 
 static int eb_create(struct i915_execbuffer *eb)
@@ -1990,40 +1992,94 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
 	return 0;
 }
 
-static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
+static struct i915_vma *
+shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *dev_priv = eb->i915;
+	struct i915_vma * const vma = *eb->vma;
+	struct i915_address_space *vm;
+	u64 flags;
+
+	/*
+	 * PPGTT backed shadow buffers must be mapped RO, to prevent
+	 * post-scan tampering
+	 */
+	if (CMDPARSER_USES_GGTT(dev_priv)) {
+		flags = PIN_GLOBAL;
+		vm = &dev_priv->ggtt.vm;
+	} else if (vma->vm->has_read_only) {
+		flags = PIN_USER;
+		vm = vma->vm;
+		i915_gem_object_set_readonly(obj);
+	} else {
+		DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags);
+}
+
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 {
 	struct intel_engine_pool_node *pool;
 	struct i915_vma *vma;
+	u64 batch_start;
+	u64 shadow_batch_start;
 	int err;
 
 	pool = intel_engine_get_pool(eb->engine, eb->batch_len);
 	if (IS_ERR(pool))
 		return ERR_CAST(pool);
 
-	err = intel_engine_cmd_parser(eb->engine,
+	vma = shadow_batch_pin(eb, pool->obj);
+	if (IS_ERR(vma))
+		goto err;
+
+	batch_start = gen8_canonical_addr(eb->batch->node.start) +
+		      eb->batch_start_offset;
+
+	shadow_batch_start = gen8_canonical_addr(vma->node.start);
+
+	err = intel_engine_cmd_parser(eb->gem_context,
+				      eb->engine,
 				      eb->batch->obj,
-				      pool->obj,
+				      batch_start,
 				      eb->batch_start_offset,
 				      eb->batch_len,
-				      is_master);
+				      pool->obj,
+				      shadow_batch_start);
+
 	if (err) {
-		if (err == -EACCES) /* unhandled chained batch */
+		i915_vma_unpin(vma);
+
+		/*
+		 * Unsafe GGTT-backed buffers can still be submitted safely
+		 * as non-secure.
+		 * For PPGTT backing however, we have no choice but to forcibly
+		 * reject unsafe buffers
+		 */
+		if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES))
+			/* Execute original buffer non-secure */
 			vma = NULL;
 		else
 			vma = ERR_PTR(err);
 		goto err;
 	}
 
-	vma = i915_gem_object_ggtt_pin(pool->obj, NULL, 0, 0, 0);
-	if (IS_ERR(vma))
-		goto err;
-
 	eb->vma[eb->buffer_count] = i915_vma_get(vma);
 	eb->flags[eb->buffer_count] =
 		__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
 	vma->exec_flags = &eb->flags[eb->buffer_count];
 	eb->buffer_count++;
 
+	eb->batch_start_offset = 0;
+	eb->batch = vma;
+
+	if (CMDPARSER_USES_GGTT(eb->i915))
+		eb->batch_flags |= I915_DISPATCH_SECURE;
+
+	/* eb->batch_len unchanged */
+
 	vma->private = pool;
 	return vma;
 
@@ -2430,6 +2486,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_i915_gem_exec_object2 *exec,
 		       struct drm_syncobj **fences)
 {
+	struct drm_i915_private *i915 = to_i915(dev);
 	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
 	struct dma_fence *exec_fence = NULL;
@@ -2441,7 +2498,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS &
 		     ~__EXEC_OBJECT_UNKNOWN_FLAGS);
 
-	eb.i915 = to_i915(dev);
+	eb.i915 = i915;
 	eb.file = file;
 	eb.args = args;
 	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
@@ -2461,8 +2518,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 
 	eb.batch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
+		if (INTEL_GEN(i915) >= 11)
+			return -ENODEV;
+
+		/* Return -EPERM to trigger fallback code on old binaries. */
+		if (!HAS_SECURE_BATCHES(i915))
+			return -EPERM;
+
 		if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
-		    return -EPERM;
+			return -EPERM;
 
 		eb.batch_flags |= I915_DISPATCH_SECURE;
 	}
@@ -2539,34 +2603,19 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}
 
+	if (eb.batch_len == 0)
+		eb.batch_len = eb.batch->size - eb.batch_start_offset;
+
 	if (eb_use_cmdparser(&eb)) {
 		struct i915_vma *vma;
 
-		vma = eb_parse(&eb, drm_is_current_master(file));
+		vma = eb_parse(&eb);
 		if (IS_ERR(vma)) {
 			err = PTR_ERR(vma);
 			goto err_vma;
 		}
-
-		if (vma) {
-			/*
-			 * Batch parsed and accepted:
-			 *
-			 * Set the DISPATCH_SECURE bit to remove the NON_SECURE
-			 * bit from MI_BATCH_BUFFER_START commands issued in
-			 * the dispatch_execbuffer implementations. We
-			 * specifically don't want that set on batches the
-			 * command parser has accepted.
-			 */
-			eb.batch_flags |= I915_DISPATCH_SECURE;
-			eb.batch_start_offset = 0;
-			eb.batch = vma;
-		}
 	}
 
-	if (eb.batch_len == 0)
-		eb.batch_len = eb.batch->size - eb.batch_start_offset;
-
 	/*
 	 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 1e045c337044..4c72d74d6576 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -646,8 +646,28 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj,
 		obj->mm.dirty = false;
 
 	for_each_sgt_page(page, sgt_iter, pages) {
-		if (obj->mm.dirty)
+		if (obj->mm.dirty && trylock_page(page)) {
+			/*
+			 * As this may not be anonymous memory (e.g. shmem)
+			 * but exist on a real mapping, we have to lock
+			 * the page in order to dirty it -- holding
+			 * the page reference is not sufficient to
+			 * prevent the inode from being truncated.
+			 * Play safe and take the lock.
+			 *
+			 * However...!
+			 *
+			 * The mmu-notifier can be invalidated for a
+			 * migrate_page, that is alreadying holding the lock
+			 * on the page. Such a try_to_unmap() will result
+			 * in us calling put_pages() and so recursively try
+			 * to lock the page. We avoid that deadlock with
+			 * a trylock_page() and in exchange we risk missing
+			 * some page dirtying.
+			 */
 			set_page_dirty(page);
+			unlock_page(page);
+		}
 
 		mark_page_accessed(page);
 		put_page(page);
diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index ee9d2bcd2c13..ef7bc41ffffa 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -310,10 +310,23 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
 	GEM_BUG_ON(rq->hw_context == ce);
 
 	if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */
-		err = mutex_lock_interruptible_nested(&tl->mutex,
-						      SINGLE_DEPTH_NESTING);
-		if (err)
-			return err;
+		/*
+		 * Ideally, we just want to insert our foreign fence as
+		 * a barrier into the remove context, such that this operation
+		 * occurs after all current operations in that context, and
+		 * all future operations must occur after this.
+		 *
+		 * Currently, the timeline->last_request tracking is guarded
+		 * by its mutex and so we must obtain that to atomically
+		 * insert our barrier. However, since we already hold our
+		 * timeline->mutex, we must be careful against potential
+		 * inversion if we are the kernel_context as the remote context
+		 * will itself poke at the kernel_context when it needs to
+		 * unpin. Ergo, if already locked, we drop both locks and
+		 * try again (through the magic of userspace repeating EAGAIN).
+		 */
+		if (!mutex_trylock(&tl->mutex))
+			return -EAGAIN;
 
 		/* Queue this switch after current activity by this context. */
 		err = i915_active_fence_set(&tl->last_request, rq);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index bc3b72bfa9e3..01765a7ec18f 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -100,9 +100,7 @@ execlists_num_ports(const struct intel_engine_execlists * const execlists)
 static inline struct i915_request *
 execlists_active(const struct intel_engine_execlists *execlists)
 {
-	GEM_BUG_ON(execlists->active - execlists->inflight >
-		   execlists_num_ports(execlists));
-	return READ_ONCE(*execlists->active);
+	return *READ_ONCE(execlists->active);
 }
 
 static inline void
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index f8113bc756c6..813bd3a610d2 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -28,13 +28,13 @@
 
 #include "i915_drv.h"
 
-#include "gt/intel_gt.h"
-
+#include "intel_context.h"
 #include "intel_engine.h"
 #include "intel_engine_pm.h"
 #include "intel_engine_pool.h"
 #include "intel_engine_user.h"
-#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc.h"
 #include "intel_reset.h"
 #include "intel_ring.h"
@@ -616,6 +616,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
 	intel_engine_init_execlists(engine);
 	intel_engine_init_cmd_parser(engine);
 	intel_engine_init__pm(engine);
+	intel_engine_init_retire(engine);
 
 	intel_engine_pool_init(&engine->pool);
 
@@ -838,6 +839,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 
 	cleanup_status_page(engine);
 
+	intel_engine_fini_retire(engine);
 	intel_engine_pool_fini(&engine->pool);
 	intel_engine_fini_breadcrumbs(engine);
 	intel_engine_cleanup_cmd_parser(engine);
@@ -1372,6 +1374,7 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
 		}
 
 		execlists_active_lock_bh(execlists);
+		rcu_read_lock();
 		for (port = execlists->active; (rq = *port); port++) {
 			char hdr[80];
 			int len;
@@ -1409,6 +1412,7 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
 			if (tl)
 				intel_timeline_put(tl);
 		}
+		rcu_read_unlock();
 		execlists_active_unlock_bh(execlists);
 	} else if (INTEL_GEN(dev_priv) > 6) {
 		drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 5051f304705b..06aa14c7aa8c 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -141,8 +141,8 @@ void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
 
 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
 {
-	cancel_delayed_work(&engine->heartbeat.work);
-	i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
+	if (cancel_delayed_work(&engine->heartbeat.work))
+		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
 }
 
 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index 3c0f490ff2c7..0e1ad4a4bd97 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -73,8 +73,42 @@ static inline void __timeline_mark_unlock(struct intel_context *ce,
 
 #endif /* !IS_ENABLED(CONFIG_LOCKDEP) */
 
+static void
+__queue_and_release_pm(struct i915_request *rq,
+		       struct intel_timeline *tl,
+		       struct intel_engine_cs *engine)
+{
+	struct intel_gt_timelines *timelines = &engine->gt->timelines;
+
+	GEM_TRACE("%s\n", engine->name);
+
+	/*
+	 * We have to serialise all potential retirement paths with our
+	 * submission, as we don't want to underflow either the
+	 * engine->wakeref.counter or our timeline->active_count.
+	 *
+	 * Equally, we cannot allow a new submission to start until
+	 * after we finish queueing, nor could we allow that submitter
+	 * to retire us before we are ready!
+	 */
+	spin_lock(&timelines->lock);
+
+	/* Let intel_gt_retire_requests() retire us (acquired under lock) */
+	if (!atomic_fetch_inc(&tl->active_count))
+		list_add_tail(&tl->link, &timelines->active_list);
+
+	/* Hand the request over to HW and so engine_retire() */
+	__i915_request_queue(rq, NULL);
+
+	/* Let new submissions commence (and maybe retire this timeline) */
+	__intel_wakeref_defer_park(&engine->wakeref);
+
+	spin_unlock(&timelines->lock);
+}
+
 static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 {
+	struct intel_context *ce = engine->kernel_context;
 	struct i915_request *rq;
 	unsigned long flags;
 	bool result = true;
@@ -98,16 +132,31 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	 * This should hold true as we can only park the engine after
 	 * retiring the last request, thus all rings should be empty and
 	 * all timelines idle.
+	 *
+	 * For unlocking, there are 2 other parties and the GPU who have a
+	 * stake here.
+	 *
+	 * A new gpu user will be waiting on the engine-pm to start their
+	 * engine_unpark. New waiters are predicated on engine->wakeref.count
+	 * and so intel_wakeref_defer_park() acts like a mutex_unlock of the
+	 * engine->wakeref.
+	 *
+	 * The other party is intel_gt_retire_requests(), which is walking the
+	 * list of active timelines looking for completions. Meanwhile as soon
+	 * as we call __i915_request_queue(), the GPU may complete our request.
+	 * Ergo, if we put ourselves on the timelines.active_list
+	 * (se intel_timeline_enter()) before we increment the
+	 * engine->wakeref.count, we may see the request completion and retire
+	 * it causing an undeflow of the engine->wakeref.
 	 */
-	flags = __timeline_mark_lock(engine->kernel_context);
+	flags = __timeline_mark_lock(ce);
+	GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0);
 
-	rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT);
+	rq = __i915_request_create(ce, GFP_NOWAIT);
 	if (IS_ERR(rq))
 		/* Context switch failed, hope for the best! Maybe reset? */
 		goto out_unlock;
 
-	intel_timeline_enter(i915_request_timeline(rq));
-
 	/* Check again on the next retirement. */
 	engine->wakeref_serial = engine->serial + 1;
 	i915_request_add_active_barriers(rq);
@@ -116,13 +165,12 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine)
 	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
 	__i915_request_commit(rq);
 
-	/* Release our exclusive hold on the engine */
-	__intel_wakeref_defer_park(&engine->wakeref);
-	__i915_request_queue(rq, NULL);
+	/* Expose ourselves to the world */
+	__queue_and_release_pm(rq, ce->timeline, engine);
 
 	result = false;
 out_unlock:
-	__timeline_mark_unlock(engine->kernel_context, flags);
+	__timeline_mark_unlock(ce, flags);
 	return result;
 }
 
@@ -177,7 +225,8 @@ static int __engine_park(struct intel_wakeref *wf)
 
 	engine->execlists.no_priolist = false;
 
-	intel_gt_pm_put(engine->gt);
+	/* While gt calls i915_vma_parked(), we have to break the lock cycle */
+	intel_gt_pm_put_async(engine->gt);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
index 739c50fefcef..24e20344dc22 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h
@@ -31,6 +31,16 @@ static inline void intel_engine_pm_put(struct intel_engine_cs *engine)
 	intel_wakeref_put(&engine->wakeref);
 }
 
+static inline void intel_engine_pm_put_async(struct intel_engine_cs *engine)
+{
+	intel_wakeref_put_async(&engine->wakeref);
+}
+
+static inline void intel_engine_pm_flush(struct intel_engine_cs *engine)
+{
+	intel_wakeref_unlock_wait(&engine->wakeref);
+}
+
 void intel_engine_init__pm(struct intel_engine_cs *engine);
 
 #endif /* INTEL_ENGINE_PM_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index c5d1047a4bc5..17f1f1441efc 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -451,16 +451,25 @@ struct intel_engine_cs {
 
 	struct intel_engine_execlists execlists;
 
+	/*
+	 * Keep track of completed timelines on this engine for early
+	 * retirement with the goal of quickly enabling powersaving as
+	 * soon as the engine is idle.
+	 */
+	struct intel_timeline *retire;
+	struct work_struct retire_work;
+
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
-#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
+#define I915_ENGINE_USING_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
 #define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
 #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
 #define I915_ENGINE_IS_VIRTUAL       BIT(5)
 #define I915_ENGINE_HAS_RELATIVE_MMIO BIT(6)
+#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
 	unsigned int flags;
 
 	/*
@@ -528,9 +537,15 @@ struct intel_engine_cs {
 };
 
 static inline bool
-intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine)
+intel_engine_using_cmd_parser(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_USING_CMD_PARSER;
+}
+
+static inline bool
+intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine)
 {
-	return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER;
+	return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER;
 }
 
 static inline bool
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 060a27d9af34..a459a42ad5c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -61,6 +61,9 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 	GEM_BUG_ON(!gt->awake);
 
+	if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
+
 	intel_rps_unpark(&gt->rps);
 	i915_pmu_gt_unparked(i915);
 
@@ -86,6 +89,11 @@ static int __gt_park(struct intel_wakeref *wf)
 	/* Everything switched off, flush any residual interrupt just in case */
 	intel_synchronize_irq(i915);
 
+	if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
+		intel_rc6_ctx_wa_check(&i915->gt.rc6);
+		intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
+	}
+
 	GEM_BUG_ON(!wakeref);
 	intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
 
@@ -97,7 +105,6 @@ static int __gt_park(struct intel_wakeref *wf)
 static const struct intel_wakeref_ops wf_ops = {
 	.get = __gt_unpark,
 	.put = __gt_park,
-	.flags = INTEL_WAKEREF_PUT_ASYNC,
 };
 
 void intel_gt_pm_init_early(struct intel_gt *gt)
@@ -264,7 +271,7 @@ void intel_gt_suspend_prepare(struct intel_gt *gt)
 
 static suspend_state_t pm_suspend_target(void)
 {
-#if IS_ENABLED(CONFIG_PM_SLEEP)
+#if IS_ENABLED(CONFIG_SUSPEND) && IS_ENABLED(CONFIG_PM_SLEEP)
 	return pm_suspend_target_state;
 #else
 	return PM_SUSPEND_TO_IDLE;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index b3e17399be9b..990efc27a4e4 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -32,6 +32,11 @@ static inline void intel_gt_pm_put(struct intel_gt *gt)
 	intel_wakeref_put(&gt->wakeref);
 }
 
+static inline void intel_gt_pm_put_async(struct intel_gt *gt)
+{
+	intel_wakeref_put_async(&gt->wakeref);
+}
+
 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
 {
 	return intel_wakeref_wait_for_idle(&gt->wakeref);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index b73229a84d85..3dc13ecf41bf 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -4,6 +4,8 @@
  * Copyright © 2019 Intel Corporation
  */
 
+#include <linux/workqueue.h>
+
 #include "i915_drv.h" /* for_each_engine() */
 #include "i915_request.h"
 #include "intel_gt.h"
@@ -29,6 +31,79 @@ static void flush_submission(struct intel_gt *gt)
 		intel_engine_flush_submission(engine);
 }
 
+static void engine_retire(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), retire_work);
+	struct intel_timeline *tl = xchg(&engine->retire, NULL);
+
+	do {
+		struct intel_timeline *next = xchg(&tl->retire, NULL);
+
+		/*
+		 * Our goal here is to retire _idle_ timelines as soon as
+		 * possible (as they are idle, we do not expect userspace
+		 * to be cleaning up anytime soon).
+		 *
+		 * If the timeline is currently locked, either it is being
+		 * retired elsewhere or about to be!
+		 */
+		if (mutex_trylock(&tl->mutex)) {
+			retire_requests(tl);
+			mutex_unlock(&tl->mutex);
+		}
+		intel_timeline_put(tl);
+
+		GEM_BUG_ON(!next);
+		tl = ptr_mask_bits(next, 1);
+	} while (tl);
+}
+
+static bool add_retire(struct intel_engine_cs *engine,
+		       struct intel_timeline *tl)
+{
+	struct intel_timeline *first;
+
+	/*
+	 * We open-code a llist here to include the additional tag [BIT(0)]
+	 * so that we know when the timeline is already on a
+	 * retirement queue: either this engine or another.
+	 *
+	 * However, we rely on that a timeline can only be active on a single
+	 * engine at any one time and that add_retire() is called before the
+	 * engine releases the timeline and transferred to another to retire.
+	 */
+
+	if (READ_ONCE(tl->retire)) /* already queued */
+		return false;
+
+	intel_timeline_get(tl);
+	first = READ_ONCE(engine->retire);
+	do
+		tl->retire = ptr_pack_bits(first, 1, 1);
+	while (!try_cmpxchg(&engine->retire, &first, tl));
+
+	return !first;
+}
+
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl)
+{
+	if (add_retire(engine, tl))
+		schedule_work(&engine->retire_work);
+}
+
+void intel_engine_init_retire(struct intel_engine_cs *engine)
+{
+	INIT_WORK(&engine->retire_work, engine_retire);
+}
+
+void intel_engine_fini_retire(struct intel_engine_cs *engine)
+{
+	flush_work(&engine->retire_work);
+	GEM_BUG_ON(engine->retire);
+}
+
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 {
 	struct intel_gt_timelines *timelines = &gt->timelines;
@@ -52,8 +127,8 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 		}
 
 		intel_timeline_get(tl);
-		GEM_BUG_ON(!tl->active_count);
-		tl->active_count++; /* pin the list element */
+		GEM_BUG_ON(!atomic_read(&tl->active_count));
+		atomic_inc(&tl->active_count); /* pin the list element */
 		spin_unlock_irqrestore(&timelines->lock, flags);
 
 		if (timeout > 0) {
@@ -74,16 +149,16 @@ long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
 
 		/* Resume iteration after dropping lock */
 		list_safe_reset_next(tl, tn, link);
-		if (--tl->active_count)
-			active_count += !!rcu_access_pointer(tl->last_request.fence);
-		else
+		if (atomic_dec_and_test(&tl->active_count))
 			list_del(&tl->link);
+		else
+			active_count += !!rcu_access_pointer(tl->last_request.fence);
 
 		mutex_unlock(&tl->mutex);
 
 		/* Defer the final release to after the spinlock */
 		if (refcount_dec_and_test(&tl->kref.refcount)) {
-			GEM_BUG_ON(tl->active_count);
+			GEM_BUG_ON(atomic_read(&tl->active_count));
 			list_add(&tl->link, &free);
 		}
 	}
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
index bd31cbce47e0..d626fb115386 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
@@ -7,7 +7,9 @@
 #ifndef INTEL_GT_REQUESTS_H
 #define INTEL_GT_REQUESTS_H
 
+struct intel_engine_cs;
 struct intel_gt;
+struct intel_timeline;
 
 long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
 static inline void intel_gt_retire_requests(struct intel_gt *gt)
@@ -15,6 +17,11 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
 	intel_gt_retire_requests_timeout(gt, 0);
 }
 
+void intel_engine_init_retire(struct intel_engine_cs *engine);
+void intel_engine_add_retire(struct intel_engine_cs *engine,
+			     struct intel_timeline *tl);
+void intel_engine_fini_retire(struct intel_engine_cs *engine);
+
 int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
 
 void intel_gt_init_requests(struct intel_gt *gt);
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 51aef2a233cb..9fdefbdc3546 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -142,6 +142,7 @@
 #include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_pm.h"
+#include "intel_gt_requests.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_reset.h"
@@ -990,6 +991,59 @@ static void intel_engine_context_out(struct intel_engine_cs *engine)
 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
 }
 
+static void restore_default_state(struct intel_context *ce,
+				  struct intel_engine_cs *engine)
+{
+	u32 *regs = ce->lrc_reg_state;
+
+	if (engine->pinned_default_state)
+		memcpy(regs, /* skip restoring the vanilla PPHWSP */
+		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+		       engine->context_size - PAGE_SIZE);
+
+	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+}
+
+static void reset_active(struct i915_request *rq,
+			 struct intel_engine_cs *engine)
+{
+	struct intel_context * const ce = rq->hw_context;
+	u32 head;
+
+	/*
+	 * The executing context has been cancelled. We want to prevent
+	 * further execution along this context and propagate the error on
+	 * to anything depending on its results.
+	 *
+	 * In __i915_request_submit(), we apply the -EIO and remove the
+	 * requests' payloads for any banned requests. But first, we must
+	 * rewind the context back to the start of the incomplete request so
+	 * that we do not jump back into the middle of the batch.
+	 *
+	 * We preserve the breadcrumbs and semaphores of the incomplete
+	 * requests so that inter-timeline dependencies (i.e other timelines)
+	 * remain correctly ordered. And we defer to __i915_request_submit()
+	 * so that all asynchronous waits are correctly handled.
+	 */
+	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
+		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
+
+	/* On resubmission of the active request, payload will be scrubbed */
+	if (i915_request_completed(rq))
+		head = rq->tail;
+	else
+		head = active_request(ce->timeline, rq)->head;
+	ce->ring->head = intel_ring_wrap(ce->ring, head);
+	intel_ring_update_space(ce->ring);
+
+	/* Scrub the context image to prevent replaying the previous batch */
+	restore_default_state(ce, engine);
+	__execlists_update_reg_state(ce, engine);
+
+	/* We've switched away, so this should be a no-op, but intent matters */
+	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
+}
+
 static inline struct intel_engine_cs *
 __execlists_schedule_in(struct i915_request *rq)
 {
@@ -998,6 +1052,9 @@ __execlists_schedule_in(struct i915_request *rq)
 
 	intel_context_get(ce);
 
+	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
+		reset_active(rq, engine);
+
 	if (ce->tag) {
 		/* Use a fixed tag for OA and friends */
 		ce->lrc_desc |= (u64)ce->tag << 32;
@@ -1047,71 +1104,29 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 		tasklet_schedule(&ve->base.execlists.tasklet);
 }
 
-static void restore_default_state(struct intel_context *ce,
-				  struct intel_engine_cs *engine)
-{
-	u32 *regs = ce->lrc_reg_state;
-
-	if (engine->pinned_default_state)
-		memcpy(regs, /* skip restoring the vanilla PPHWSP */
-		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
-		       engine->context_size - PAGE_SIZE);
-
-	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
-}
-
-static void reset_active(struct i915_request *rq,
-			 struct intel_engine_cs *engine)
+static inline void
+__execlists_schedule_out(struct i915_request *rq,
+			 struct intel_engine_cs * const engine)
 {
 	struct intel_context * const ce = rq->hw_context;
-	u32 head;
 
 	/*
-	 * The executing context has been cancelled. We want to prevent
-	 * further execution along this context and propagate the error on
-	 * to anything depending on its results.
-	 *
-	 * In __i915_request_submit(), we apply the -EIO and remove the
-	 * requests' payloads for any banned requests. But first, we must
-	 * rewind the context back to the start of the incomplete request so
-	 * that we do not jump back into the middle of the batch.
-	 *
-	 * We preserve the breadcrumbs and semaphores of the incomplete
-	 * requests so that inter-timeline dependencies (i.e other timelines)
-	 * remain correctly ordered. And we defer to __i915_request_submit()
-	 * so that all asynchronous waits are correctly handled.
+	 * NB process_csb() is not under the engine->active.lock and hence
+	 * schedule_out can race with schedule_in meaning that we should
+	 * refrain from doing non-trivial work here.
 	 */
-	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
-		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
 
-	/* On resubmission of the active request, payload will be scrubbed */
-	if (i915_request_completed(rq))
-		head = rq->tail;
-	else
-		head = active_request(ce->timeline, rq)->head;
-	ce->ring->head = intel_ring_wrap(ce->ring, head);
-	intel_ring_update_space(ce->ring);
-
-	/* Scrub the context image to prevent replaying the previous batch */
-	restore_default_state(ce, engine);
-	__execlists_update_reg_state(ce, engine);
-
-	/* We've switched away, so this should be a no-op, but intent matters */
-	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
-}
-
-static inline void
-__execlists_schedule_out(struct i915_request *rq,
-			 struct intel_engine_cs * const engine)
-{
-	struct intel_context * const ce = rq->hw_context;
+	/*
+	 * If we have just completed this context, the engine may now be
+	 * idle and we want to re-enter powersaving.
+	 */
+	if (list_is_last(&rq->link, &ce->timeline->requests) &&
+	    i915_request_completed(rq))
+		intel_engine_add_retire(engine, ce->timeline);
 
 	intel_engine_context_out(engine);
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
-	intel_gt_pm_put(engine->gt);
-
-	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
-		reset_active(rq, engine);
+	intel_gt_pm_put_async(engine->gt);
 
 	/*
 	 * If this is part of a virtual engine, its next request may
@@ -1931,16 +1946,17 @@ skip_submit:
 static void
 cancel_port_requests(struct intel_engine_execlists * const execlists)
 {
-	struct i915_request * const *port, *rq;
+	struct i915_request * const *port;
 
-	for (port = execlists->pending; (rq = *port); port++)
-		execlists_schedule_out(rq);
+	for (port = execlists->pending; *port; port++)
+		execlists_schedule_out(*port);
 	memset(execlists->pending, 0, sizeof(execlists->pending));
 
-	for (port = execlists->active; (rq = *port); port++)
-		execlists_schedule_out(rq);
-	execlists->active =
-		memset(execlists->inflight, 0, sizeof(execlists->inflight));
+	/* Mark the end of active before we overwrite *active */
+	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
+		execlists_schedule_out(*port);
+	WRITE_ONCE(execlists->active,
+		   memset(execlists->inflight, 0, sizeof(execlists->inflight)));
 }
 
 static inline void
@@ -2093,23 +2109,27 @@ static void process_csb(struct intel_engine_cs *engine)
 		else
 			promote = gen8_csb_parse(execlists, buf + 2 * head);
 		if (promote) {
+			struct i915_request * const *old = execlists->active;
+
+			/* Point active to the new ELSP; prevent overwriting */
+			WRITE_ONCE(execlists->active, execlists->pending);
+			set_timeslice(engine);
+
 			if (!inject_preempt_hang(execlists))
 				ring_set_paused(engine, 0);
 
 			/* cancel old inflight, prepare for switch */
-			trace_ports(execlists, "preempted", execlists->active);
-			while (*execlists->active)
-				execlists_schedule_out(*execlists->active++);
+			trace_ports(execlists, "preempted", old);
+			while (*old)
+				execlists_schedule_out(*old++);
 
 			/* switch pending to inflight */
 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
-			execlists->active =
-				memcpy(execlists->inflight,
-				       execlists->pending,
-				       execlists_num_ports(execlists) *
-				       sizeof(*execlists->pending));
-
-			set_timeslice(engine);
+			WRITE_ONCE(execlists->active,
+				   memcpy(execlists->inflight,
+					  execlists->pending,
+					  execlists_num_ports(execlists) *
+					  sizeof(*execlists->pending)));
 
 			WRITE_ONCE(execlists->pending[0], NULL);
 		} else {
diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c
index 5ad4a92a9582..700104b90163 100644
--- a/drivers/gpu/drm/i915/gt/intel_rc6.c
+++ b/drivers/gpu/drm/i915/gt/intel_rc6.c
@@ -178,8 +178,13 @@ static void gen9_rc6_enable(struct intel_rc6 *rc6)
 	    GEN6_RC_CTL_RC6_ENABLE |
 	    rc6_mode);
 
-	set(uncore, GEN9_PG_ENABLE,
-	    GEN9_RENDER_PG_ENABLE | GEN9_MEDIA_PG_ENABLE);
+	/*
+	 * WaRsDisableCoarsePowerGating:skl,cnl
+	 *   - Render/Media PG need to be disabled with RC6.
+	 */
+	if (!NEEDS_WaRsDisableCoarsePowerGating(rc6_to_i915(rc6)))
+		set(uncore, GEN9_PG_ENABLE,
+		    GEN9_RENDER_PG_ENABLE | GEN9_MEDIA_PG_ENABLE);
 }
 
 static void gen8_rc6_enable(struct intel_rc6 *rc6)
@@ -486,6 +491,66 @@ static void rpm_put(struct intel_rc6 *rc6)
 	rc6->wakeref = false;
 }
 
+static bool intel_rc6_ctx_corrupted(struct intel_rc6 *rc6)
+{
+	return !intel_uncore_read(rc6_to_uncore(rc6), GEN8_RC6_CTX_INFO);
+}
+
+static void intel_rc6_ctx_wa_init(struct intel_rc6 *rc6)
+{
+	struct drm_i915_private *i915 = rc6_to_i915(rc6);
+
+	if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		return;
+
+	if (intel_rc6_ctx_corrupted(rc6)) {
+		DRM_INFO("RC6 context corrupted, disabling runtime power management\n");
+		rc6->ctx_corrupted = true;
+	}
+}
+
+/**
+ * intel_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA
+ * @rc6: rc6 state
+ *
+ * Perform any steps needed to re-init the RC6 CTX WA after system resume.
+ */
+void intel_rc6_ctx_wa_resume(struct intel_rc6 *rc6)
+{
+	if (rc6->ctx_corrupted && !intel_rc6_ctx_corrupted(rc6)) {
+		DRM_INFO("RC6 context restored, re-enabling runtime power management\n");
+		rc6->ctx_corrupted = false;
+	}
+}
+
+/**
+ * intel_rc6_ctx_wa_check - check for a new RC6 CTX corruption
+ * @rc6: rc6 state
+ *
+ * Check if an RC6 CTX corruption has happened since the last check and if so
+ * disable RC6 and runtime power management.
+*/
+void intel_rc6_ctx_wa_check(struct intel_rc6 *rc6)
+{
+	struct drm_i915_private *i915 = rc6_to_i915(rc6);
+
+	if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		return;
+
+	if (rc6->ctx_corrupted)
+		return;
+
+	if (!intel_rc6_ctx_corrupted(rc6))
+		return;
+
+	DRM_NOTE("RC6 context corruption, disabling runtime power management\n");
+
+	intel_rc6_disable(rc6);
+	rc6->ctx_corrupted = true;
+
+	return;
+}
+
 static void __intel_rc6_disable(struct intel_rc6 *rc6)
 {
 	struct drm_i915_private *i915 = rc6_to_i915(rc6);
@@ -510,6 +575,8 @@ void intel_rc6_init(struct intel_rc6 *rc6)
 	if (!rc6_supported(rc6))
 		return;
 
+	intel_rc6_ctx_wa_init(rc6);
+
 	if (IS_CHERRYVIEW(i915))
 		err = chv_rc6_init(rc6);
 	else if (IS_VALLEYVIEW(i915))
@@ -544,6 +611,9 @@ void intel_rc6_enable(struct intel_rc6 *rc6)
 
 	GEM_BUG_ON(rc6->enabled);
 
+	if (rc6->ctx_corrupted)
+		return;
+
 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
 
 	if (IS_CHERRYVIEW(i915))
diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.h b/drivers/gpu/drm/i915/gt/intel_rc6.h
index 5e6711f36457..1370f6834a4c 100644
--- a/drivers/gpu/drm/i915/gt/intel_rc6.h
+++ b/drivers/gpu/drm/i915/gt/intel_rc6.h
@@ -22,4 +22,7 @@ void intel_rc6_disable(struct intel_rc6 *rc6);
 u64 intel_rc6_residency_ns(struct intel_rc6 *rc6, i915_reg_t reg);
 u64 intel_rc6_residency_us(struct intel_rc6 *rc6, i915_reg_t reg);
 
+void intel_rc6_ctx_wa_check(struct intel_rc6 *rc6);
+void intel_rc6_ctx_wa_resume(struct intel_rc6 *rc6);
+
 #endif /* INTEL_RC6_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_rc6_types.h b/drivers/gpu/drm/i915/gt/intel_rc6_types.h
index 214f354d6ae4..89ad5697a8d4 100644
--- a/drivers/gpu/drm/i915/gt/intel_rc6_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_rc6_types.h
@@ -23,6 +23,7 @@ struct intel_rc6 {
 	bool supported : 1;
 	bool enabled : 1;
 	bool wakeref : 1;
+	bool ctx_corrupted : 1;
 };
 
 #endif /* INTEL_RC6_TYPES_H */
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index f03e000051c1..c97423a76642 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1114,7 +1114,7 @@ int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
 out:
 	intel_engine_cancel_stop_cs(engine);
 	reset_finish_engine(engine);
-	intel_engine_pm_put(engine);
+	intel_engine_pm_put_async(engine);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c
index ece20504d240..374b28f13ca0 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring.c
@@ -57,9 +57,10 @@ int intel_ring_pin(struct intel_ring *ring)
 
 	i915_vma_make_unshrinkable(vma);
 
-	GEM_BUG_ON(ring->vaddr);
-	ring->vaddr = addr;
+	/* Discard any unused bytes beyond that submitted to hw. */
+	intel_ring_reset(ring, ring->emit);
 
+	ring->vaddr = addr;
 	return 0;
 
 err_ring:
@@ -85,20 +86,14 @@ void intel_ring_unpin(struct intel_ring *ring)
 	if (!atomic_dec_and_test(&ring->pin_count))
 		return;
 
-	/* Discard any unused bytes beyond that submitted to hw. */
-	intel_ring_reset(ring, ring->emit);
-
 	i915_vma_unset_ggtt_write(vma);
 	if (i915_vma_is_map_and_fenceable(vma))
 		i915_vma_unpin_iomap(vma);
 	else
 		i915_gem_object_unpin_map(vma->obj);
 
-	GEM_BUG_ON(!ring->vaddr);
-	ring->vaddr = NULL;
-
-	i915_vma_unpin(vma);
 	i915_vma_make_purgeable(vma);
+	i915_vma_unpin(vma);
 }
 
 static struct i915_vma *create_ring_vma(struct i915_ggtt *ggtt, int size)
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index 14ad10acd548..649798c184fb 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -282,6 +282,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
 {
 	GEM_BUG_ON(atomic_read(&timeline->pin_count));
 	GEM_BUG_ON(!list_empty(&timeline->requests));
+	GEM_BUG_ON(timeline->retire);
 
 	if (timeline->hwsp_cacheline)
 		cacheline_free(timeline->hwsp_cacheline);
@@ -339,15 +340,33 @@ void intel_timeline_enter(struct intel_timeline *tl)
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
 	unsigned long flags;
 
+	/*
+	 * Pretend we are serialised by the timeline->mutex.
+	 *
+	 * While generally true, there are a few exceptions to the rule
+	 * for the engine->kernel_context being used to manage power
+	 * transitions. As the engine_park may be called from under any
+	 * timeline, it uses the power mutex as a global serialisation
+	 * lock to prevent any other request entering its timeline.
+	 *
+	 * The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
+	 *
+	 * However, intel_gt_retire_request() does not know which engine
+	 * it is retiring along and so cannot partake in the engine-pm
+	 * barrier, and there we use the tl->active_count as a means to
+	 * pin the timeline in the active_list while the locks are dropped.
+	 * Ergo, as that is outside of the engine-pm barrier, we need to
+	 * use atomic to manipulate tl->active_count.
+	 */
 	lockdep_assert_held(&tl->mutex);
-
 	GEM_BUG_ON(!atomic_read(&tl->pin_count));
-	if (tl->active_count++)
+
+	if (atomic_add_unless(&tl->active_count, 1, 0))
 		return;
-	GEM_BUG_ON(!tl->active_count); /* overflow? */
 
 	spin_lock_irqsave(&timelines->lock, flags);
-	list_add(&tl->link, &timelines->active_list);
+	if (!atomic_fetch_inc(&tl->active_count))
+		list_add_tail(&tl->link, &timelines->active_list);
 	spin_unlock_irqrestore(&timelines->lock, flags);
 }
 
@@ -356,14 +375,16 @@ void intel_timeline_exit(struct intel_timeline *tl)
 	struct intel_gt_timelines *timelines = &tl->gt->timelines;
 	unsigned long flags;
 
+	/* See intel_timeline_enter() */
 	lockdep_assert_held(&tl->mutex);
 
-	GEM_BUG_ON(!tl->active_count);
-	if (--tl->active_count)
+	GEM_BUG_ON(!atomic_read(&tl->active_count));
+	if (atomic_add_unless(&tl->active_count, -1, 1))
 		return;
 
 	spin_lock_irqsave(&timelines->lock, flags);
-	list_del(&tl->link);
+	if (atomic_dec_and_test(&tl->active_count))
+		list_del(&tl->link);
 	spin_unlock_irqrestore(&timelines->lock, flags);
 
 	/*
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 98d9ee166379..aaf15cbe1ce1 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
@@ -42,7 +42,7 @@ struct intel_timeline {
 	 * from the intel_context caller plus internal atomicity.
 	 */
 	atomic_t pin_count;
-	unsigned int active_count;
+	atomic_t active_count;
 
 	const u32 *hwsp_seqno;
 	struct i915_vma *hwsp_ggtt;
@@ -66,6 +66,9 @@ struct intel_timeline {
 	 */
 	struct i915_active_fence last_request;
 
+	/** A chain of completed timelines ready for early retirement. */
+	struct intel_timeline *retire;
+
 	/**
 	 * We track the most recent seqno that we wait on in every context so
 	 * that we only have to emit a new await and dependency on a more
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
index 20b9c83f43ad..cbf6b0735272 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c
@@ -51,11 +51,12 @@ static int live_engine_pm(void *arg)
 				pr_err("intel_engine_pm_get_if_awake(%s) failed under %s\n",
 				       engine->name, p->name);
 			else
-				intel_engine_pm_put(engine);
-			intel_engine_pm_put(engine);
+				intel_engine_pm_put_async(engine);
+			intel_engine_pm_put_async(engine);
 			p->critical_section_end();
 
-			/* engine wakeref is sync (instant) */
+			intel_engine_pm_flush(engine);
+
 			if (intel_engine_pm_is_awake(engine)) {
 				pr_err("%s is still awake after flushing pm\n",
 				       engine->name);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 019ae6486e8d..3ee4a4e7689d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -554,6 +554,13 @@ int intel_guc_suspend(struct intel_guc *guc)
 	};
 
 	/*
+	 * If GuC communication is enabled but submission is not supported,
+	 * we do not need to suspend the GuC.
+	 */
+	if (!intel_guc_submission_is_enabled(guc))
+		return 0;
+
+	/*
 	 * The ENTER_S_STATE action queues the save/restore operation in GuC FW
 	 * and then returns, so waiting on the H2G is not enough to guarantee
 	 * GuC is done. When all the processing is done, GuC writes
@@ -610,6 +617,14 @@ int intel_guc_resume(struct intel_guc *guc)
 		GUC_POWER_D0,
 	};
 
+	/*
+	 * If GuC communication is enabled but submission is not supported,
+	 * we do not need to resume the GuC but we do need to enable the
+	 * GuC communication on resume (above).
+	 */
+	if (!intel_guc_submission_is_enabled(guc))
+		return 0;
+
 	return intel_guc_send(guc, action, ARRAY_SIZE(action));
 }
 
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index 207383dda84d..dca15ace88f6 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -91,14 +91,15 @@ static void debug_active_init(struct i915_active *ref)
 
 static void debug_active_activate(struct i915_active *ref)
 {
-	lockdep_assert_held(&ref->mutex);
+	spin_lock_irq(&ref->tree_lock);
 	if (!atomic_read(&ref->count)) /* before the first inc */
 		debug_object_activate(ref, &active_debug_desc);
+	spin_unlock_irq(&ref->tree_lock);
 }
 
 static void debug_active_deactivate(struct i915_active *ref)
 {
-	lockdep_assert_held(&ref->mutex);
+	lockdep_assert_held(&ref->tree_lock);
 	if (!atomic_read(&ref->count)) /* after the last dec */
 		debug_object_deactivate(ref, &active_debug_desc);
 }
@@ -128,29 +129,22 @@ __active_retire(struct i915_active *ref)
 {
 	struct active_node *it, *n;
 	struct rb_root root;
-	bool retire = false;
+	unsigned long flags;
 
-	lockdep_assert_held(&ref->mutex);
 	GEM_BUG_ON(i915_active_is_idle(ref));
 
 	/* return the unused nodes to our slabcache -- flushing the allocator */
-	if (atomic_dec_and_test(&ref->count)) {
-		debug_active_deactivate(ref);
-		root = ref->tree;
-		ref->tree = RB_ROOT;
-		ref->cache = NULL;
-		retire = true;
-	}
-
-	mutex_unlock(&ref->mutex);
-	if (!retire)
+	if (!atomic_dec_and_lock_irqsave(&ref->count, &ref->tree_lock, flags))
 		return;
 
 	GEM_BUG_ON(rcu_access_pointer(ref->excl.fence));
-	rbtree_postorder_for_each_entry_safe(it, n, &root, node) {
-		GEM_BUG_ON(i915_active_fence_isset(&it->base));
-		kmem_cache_free(global.slab_cache, it);
-	}
+	debug_active_deactivate(ref);
+
+	root = ref->tree;
+	ref->tree = RB_ROOT;
+	ref->cache = NULL;
+
+	spin_unlock_irqrestore(&ref->tree_lock, flags);
 
 	/* After the final retire, the entire struct may be freed */
 	if (ref->retire)
@@ -158,6 +152,11 @@ __active_retire(struct i915_active *ref)
 
 	/* ... except if you wait on it, you must manage your own references! */
 	wake_up_var(ref);
+
+	rbtree_postorder_for_each_entry_safe(it, n, &root, node) {
+		GEM_BUG_ON(i915_active_fence_isset(&it->base));
+		kmem_cache_free(global.slab_cache, it);
+	}
 }
 
 static void
@@ -169,7 +168,6 @@ active_work(struct work_struct *wrk)
 	if (atomic_add_unless(&ref->count, -1, 1))
 		return;
 
-	mutex_lock(&ref->mutex);
 	__active_retire(ref);
 }
 
@@ -180,9 +178,7 @@ active_retire(struct i915_active *ref)
 	if (atomic_add_unless(&ref->count, -1, 1))
 		return;
 
-	/* If we are inside interrupt context (fence signaling), defer */
-	if (ref->flags & I915_ACTIVE_RETIRE_SLEEPS ||
-	    !mutex_trylock(&ref->mutex)) {
+	if (ref->flags & I915_ACTIVE_RETIRE_SLEEPS) {
 		queue_work(system_unbound_wq, &ref->work);
 		return;
 	}
@@ -227,7 +223,7 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl)
 	if (!prealloc)
 		return NULL;
 
-	mutex_lock(&ref->mutex);
+	spin_lock_irq(&ref->tree_lock);
 	GEM_BUG_ON(i915_active_is_idle(ref));
 
 	parent = NULL;
@@ -257,7 +253,7 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl)
 
 out:
 	ref->cache = node;
-	mutex_unlock(&ref->mutex);
+	spin_unlock_irq(&ref->tree_lock);
 
 	BUILD_BUG_ON(offsetof(typeof(*node), base));
 	return &node->base;
@@ -278,8 +274,10 @@ void __i915_active_init(struct i915_active *ref,
 	if (bits & I915_ACTIVE_MAY_SLEEP)
 		ref->flags |= I915_ACTIVE_RETIRE_SLEEPS;
 
+	spin_lock_init(&ref->tree_lock);
 	ref->tree = RB_ROOT;
 	ref->cache = NULL;
+
 	init_llist_head(&ref->preallocated_barriers);
 	atomic_set(&ref->count, 0);
 	__mutex_init(&ref->mutex, "i915_active", key);
@@ -510,7 +508,7 @@ static struct active_node *reuse_idle_barrier(struct i915_active *ref, u64 idx)
 	if (RB_EMPTY_ROOT(&ref->tree))
 		return NULL;
 
-	mutex_lock(&ref->mutex);
+	spin_lock_irq(&ref->tree_lock);
 	GEM_BUG_ON(i915_active_is_idle(ref));
 
 	/*
@@ -575,7 +573,7 @@ static struct active_node *reuse_idle_barrier(struct i915_active *ref, u64 idx)
 			goto match;
 	}
 
-	mutex_unlock(&ref->mutex);
+	spin_unlock_irq(&ref->tree_lock);
 
 	return NULL;
 
@@ -583,7 +581,7 @@ match:
 	rb_erase(p, &ref->tree); /* Hide from waits and sibling allocations */
 	if (p == &ref->cache->node)
 		ref->cache = NULL;
-	mutex_unlock(&ref->mutex);
+	spin_unlock_irq(&ref->tree_lock);
 
 	return rb_entry(p, struct active_node, node);
 }
@@ -664,6 +662,7 @@ unwind:
 void i915_active_acquire_barrier(struct i915_active *ref)
 {
 	struct llist_node *pos, *next;
+	unsigned long flags;
 
 	GEM_BUG_ON(i915_active_is_idle(ref));
 
@@ -673,12 +672,13 @@ void i915_active_acquire_barrier(struct i915_active *ref)
 	 * populated by i915_request_add_active_barriers() to point to the
 	 * request that will eventually release them.
 	 */
-	mutex_lock_nested(&ref->mutex, SINGLE_DEPTH_NESTING);
 	llist_for_each_safe(pos, next, take_preallocated_barriers(ref)) {
 		struct active_node *node = barrier_from_ll(pos);
 		struct intel_engine_cs *engine = barrier_to_engine(node);
 		struct rb_node **p, *parent;
 
+		spin_lock_irqsave_nested(&ref->tree_lock, flags,
+					 SINGLE_DEPTH_NESTING);
 		parent = NULL;
 		p = &ref->tree.rb_node;
 		while (*p) {
@@ -694,12 +694,12 @@ void i915_active_acquire_barrier(struct i915_active *ref)
 		}
 		rb_link_node(&node->node, parent, p);
 		rb_insert_color(&node->node, &ref->tree);
+		spin_unlock_irqrestore(&ref->tree_lock, flags);
 
 		GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
 		llist_add(barrier_to_ll(node), &engine->barrier_tasks);
 		intel_engine_pm_put(engine);
 	}
-	mutex_unlock(&ref->mutex);
 }
 
 void i915_request_add_active_barriers(struct i915_request *rq)
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
index d89a74c142c6..96aed0ee700a 100644
--- a/drivers/gpu/drm/i915/i915_active_types.h
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -48,6 +48,7 @@ struct i915_active {
 	atomic_t count;
 	struct mutex mutex;
 
+	spinlock_t tree_lock;
 	struct active_node *cache;
 	struct rb_root tree;
 
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 24555102e198..f24096e27bef 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -53,13 +53,11 @@
  * granting userspace undue privileges. There are three categories of privilege.
  *
  * First, commands which are explicitly defined as privileged or which should
- * only be used by the kernel driver. The parser generally rejects such
- * commands, though it may allow some from the drm master process.
+ * only be used by the kernel driver. The parser rejects such commands
  *
  * Second, commands which access registers. To support correct/enhanced
  * userspace functionality, particularly certain OpenGL extensions, the parser
- * provides a whitelist of registers which userspace may safely access (for both
- * normal and drm master processes).
+ * provides a whitelist of registers which userspace may safely access
  *
  * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc).
  * The parser always rejects such commands.
@@ -84,9 +82,9 @@
  * in the per-engine command tables.
  *
  * Other command table entries map fairly directly to high level categories
- * mentioned above: rejected, master-only, register whitelist. The parser
- * implements a number of checks, including the privileged memory checks, via a
- * general bitmasking mechanism.
+ * mentioned above: rejected, register whitelist. The parser implements a number
+ * of checks, including the privileged memory checks, via a general bitmasking
+ * mechanism.
  */
 
 /*
@@ -104,8 +102,6 @@ struct drm_i915_cmd_descriptor {
 	 * CMD_DESC_REJECT: The command is never allowed
 	 * CMD_DESC_REGISTER: The command should be checked against the
 	 *                    register whitelist for the appropriate ring
-	 * CMD_DESC_MASTER: The command is allowed if the submitting process
-	 *                  is the DRM master
 	 */
 	u32 flags;
 #define CMD_DESC_FIXED    (1<<0)
@@ -113,7 +109,6 @@ struct drm_i915_cmd_descriptor {
 #define CMD_DESC_REJECT   (1<<2)
 #define CMD_DESC_REGISTER (1<<3)
 #define CMD_DESC_BITMASK  (1<<4)
-#define CMD_DESC_MASTER   (1<<5)
 
 	/*
 	 * The command's unique identification bits and the bitmask to get them.
@@ -194,7 +189,7 @@ struct drm_i915_cmd_table {
 #define CMD(op, opm, f, lm, fl, ...)				\
 	{							\
 		.flags = (fl) | ((f) ? CMD_DESC_FIXED : 0),	\
-		.cmd = { (op), ~0u << (opm) },			\
+		.cmd = { (op & ~0u << (opm)), ~0u << (opm) },	\
 		.length = { (lm) },				\
 		__VA_ARGS__					\
 	}
@@ -209,14 +204,13 @@ struct drm_i915_cmd_table {
 #define R CMD_DESC_REJECT
 #define W CMD_DESC_REGISTER
 #define B CMD_DESC_BITMASK
-#define M CMD_DESC_MASTER
 
 /*            Command                          Mask   Fixed Len   Action
 	      ---------------------------------------------------------- */
-static const struct drm_i915_cmd_descriptor common_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = {
 	CMD(  MI_NOOP,                          SMI,    F,  1,      S  ),
 	CMD(  MI_USER_INTERRUPT,                SMI,    F,  1,      R  ),
-	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      M  ),
+	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      R  ),
 	CMD(  MI_ARB_CHECK,                     SMI,    F,  1,      S  ),
 	CMD(  MI_REPORT_HEAD,                   SMI,    F,  1,      S  ),
 	CMD(  MI_SUSPEND_FLUSH,                 SMI,    F,  1,      S  ),
@@ -246,7 +240,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
 	CMD(  MI_BATCH_BUFFER_START,            SMI,   !F,  0xFF,   S  ),
 };
 
-static const struct drm_i915_cmd_descriptor render_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = {
 	CMD(  MI_FLUSH,                         SMI,    F,  1,      S  ),
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_PREDICATE,                     SMI,    F,  1,      S  ),
@@ -313,7 +307,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
 	CMD(  MI_URB_ATOMIC_ALLOC,              SMI,    F,  1,      S  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_RS_CONTEXT,                    SMI,    F,  1,      S  ),
-	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   M  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_REGISTER_REG,             SMI,   !F,  0xFF,   W,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 }    ),
@@ -330,7 +324,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
 	CMD(  GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS,  S3D,   !F,  0x1FF,  S  ),
 };
 
-static const struct drm_i915_cmd_descriptor video_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = {
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0xFF,   B,
@@ -374,7 +368,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = {
 	CMD(  MFX_WAIT,                         SMFX,   F,  1,      S  ),
 };
 
-static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = {
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0xFF,   B,
@@ -412,7 +406,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
 	      }},						       ),
 };
 
-static const struct drm_i915_cmd_descriptor blt_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = {
 	CMD(  MI_DISPLAY_FLIP,                  SMI,   !F,  0xFF,   R  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0x3FF,  B,
 	      .bits = {{
@@ -446,10 +440,64 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = {
 };
 
 static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
-	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   M  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   R  ),
 };
 
+/*
+ * For Gen9 we can still rely on the h/w to enforce cmd security, and only
+ * need to re-enforce the register access checks. We therefore only need to
+ * teach the cmdparser how to find the end of each command, and identify
+ * register accesses. The table doesn't need to reject any commands, and so
+ * the only commands listed here are:
+ *   1) Those that touch registers
+ *   2) Those that do not have the default 8-bit length
+ *
+ * Note that the default MI length mask chosen for this table is 0xFF, not
+ * the 0x3F used on older devices. This is because the vast majority of MI
+ * cmds on Gen9 use a standard 8-bit Length field.
+ * All the Gen9 blitter instructions are standard 0xFF length mask, and
+ * none allow access to non-general registers, so in fact no BLT cmds are
+ * included in the table at all.
+ *
+ */
+static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = {
+	CMD(  MI_NOOP,                          SMI,    F,  1,      S  ),
+	CMD(  MI_USER_INTERRUPT,                SMI,    F,  1,      S  ),
+	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      S  ),
+	CMD(  MI_FLUSH,                         SMI,    F,  1,      S  ),
+	CMD(  MI_ARB_CHECK,                     SMI,    F,  1,      S  ),
+	CMD(  MI_REPORT_HEAD,                   SMI,    F,  1,      S  ),
+	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      S  ),
+	CMD(  MI_SUSPEND_FLUSH,                 SMI,    F,  1,      S  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0x3FF,  S  ),
+	CMD(  MI_LOAD_REGISTER_IMM(1),          SMI,   !F,  0xFF,   W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 }    ),
+	CMD(  MI_UPDATE_GTT,                    SMI,   !F,  0x3FF,  S  ),
+	CMD(  MI_STORE_REGISTER_MEM_GEN8,       SMI,    F,  4,      W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC }               ),
+	CMD(  MI_FLUSH_DW,                      SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_LOAD_REGISTER_MEM_GEN8,        SMI,    F,  4,      W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC }               ),
+	CMD(  MI_LOAD_REGISTER_REG,             SMI,    !F,  0xFF,  W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 }    ),
+
+	/*
+	 * We allow BB_START but apply further checks. We just sanitize the
+	 * basic fields here.
+	 */
+#define MI_BB_START_OPERAND_MASK   GENMASK(SMI-1, 0)
+#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1)
+	CMD(  MI_BATCH_BUFFER_START_GEN8,       SMI,    !F,  0xFF,  B,
+	      .bits = {{
+			.offset = 0,
+			.mask = MI_BB_START_OPERAND_MASK,
+			.expected = MI_BB_START_OPERAND_EXPECT,
+	      }},						       ),
+};
+
 static const struct drm_i915_cmd_descriptor noop_desc =
 	CMD(MI_NOOP, SMI, F, 1, S);
 
@@ -463,40 +511,44 @@ static const struct drm_i915_cmd_descriptor noop_desc =
 #undef R
 #undef W
 #undef B
-#undef M
 
-static const struct drm_i915_cmd_table gen7_render_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table gen7_render_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
 	{ hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) },
 };
 
-static const struct drm_i915_cmd_table gen7_video_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ video_cmds, ARRAY_SIZE(video_cmds) },
+static const struct drm_i915_cmd_table gen7_video_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_vebox_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ vecs_cmds, ARRAY_SIZE(vecs_cmds) },
+static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) },
 };
 
-static const struct drm_i915_cmd_table gen7_blt_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
 	{ hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) },
 };
 
+static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = {
+	{ gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) },
+};
+
+
 /*
  * Register whitelists, sorted by increasing register offset.
  */
@@ -612,17 +664,27 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = {
 	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
 };
 
-static const struct drm_i915_reg_descriptor ivb_master_regs[] = {
-	REG32(FORCEWAKE_MT),
-	REG32(DERRMR),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)),
-};
-
-static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
-	REG32(FORCEWAKE_MT),
-	REG32(DERRMR),
+static const struct drm_i915_reg_descriptor gen9_blt_regs[] = {
+	REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE),
+	REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE),
+	REG32(BCS_SWCTRL),
+	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
+	REG64_IDX(BCS_GPR, 0),
+	REG64_IDX(BCS_GPR, 1),
+	REG64_IDX(BCS_GPR, 2),
+	REG64_IDX(BCS_GPR, 3),
+	REG64_IDX(BCS_GPR, 4),
+	REG64_IDX(BCS_GPR, 5),
+	REG64_IDX(BCS_GPR, 6),
+	REG64_IDX(BCS_GPR, 7),
+	REG64_IDX(BCS_GPR, 8),
+	REG64_IDX(BCS_GPR, 9),
+	REG64_IDX(BCS_GPR, 10),
+	REG64_IDX(BCS_GPR, 11),
+	REG64_IDX(BCS_GPR, 12),
+	REG64_IDX(BCS_GPR, 13),
+	REG64_IDX(BCS_GPR, 14),
+	REG64_IDX(BCS_GPR, 15),
 };
 
 #undef REG64
@@ -631,28 +693,27 @@ static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
 struct drm_i915_reg_table {
 	const struct drm_i915_reg_descriptor *regs;
 	int num_regs;
-	bool master;
 };
 
 static const struct drm_i915_reg_table ivb_render_reg_tables[] = {
-	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
-	{ ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
 };
 
 static const struct drm_i915_reg_table ivb_blt_reg_tables[] = {
-	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
-	{ ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
 };
 
 static const struct drm_i915_reg_table hsw_render_reg_tables[] = {
-	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
-	{ hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false },
-	{ hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
+	{ hsw_render_regs, ARRAY_SIZE(hsw_render_regs) },
 };
 
 static const struct drm_i915_reg_table hsw_blt_reg_tables[] = {
-	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
-	{ hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
+};
+
+static const struct drm_i915_reg_table gen9_blt_reg_tables[] = {
+	{ gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) },
 };
 
 static u32 gen7_render_get_cmd_length_mask(u32 cmd_header)
@@ -710,6 +771,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
 	return 0;
 }
 
+static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header)
+{
+	u32 client = cmd_header >> INSTR_CLIENT_SHIFT;
+
+	if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT)
+		return 0xFF;
+
+	DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 0x%08X\n", cmd_header);
+	return 0;
+}
+
 static bool validate_cmds_sorted(const struct intel_engine_cs *engine,
 				 const struct drm_i915_cmd_table *cmd_tables,
 				 int cmd_table_count)
@@ -867,18 +939,19 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 	int cmd_table_count;
 	int ret;
 
-	if (!IS_GEN(engine->i915, 7))
+	if (!IS_GEN(engine->i915, 7) && !(IS_GEN(engine->i915, 9) &&
+					  engine->class == COPY_ENGINE_CLASS))
 		return;
 
 	switch (engine->class) {
 	case RENDER_CLASS:
 		if (IS_HASWELL(engine->i915)) {
-			cmd_tables = hsw_render_ring_cmds;
+			cmd_tables = hsw_render_ring_cmd_table;
 			cmd_table_count =
-				ARRAY_SIZE(hsw_render_ring_cmds);
+				ARRAY_SIZE(hsw_render_ring_cmd_table);
 		} else {
-			cmd_tables = gen7_render_cmds;
-			cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
+			cmd_tables = gen7_render_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table);
 		}
 
 		if (IS_HASWELL(engine->i915)) {
@@ -888,36 +961,46 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 			engine->reg_tables = ivb_render_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables);
 		}
-
 		engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
 		break;
 	case VIDEO_DECODE_CLASS:
-		cmd_tables = gen7_video_cmds;
-		cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
+		cmd_tables = gen7_video_cmd_table;
+		cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table);
 		engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
 		break;
 	case COPY_ENGINE_CLASS:
-		if (IS_HASWELL(engine->i915)) {
-			cmd_tables = hsw_blt_ring_cmds;
-			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
+		engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
+		if (IS_GEN(engine->i915, 9)) {
+			cmd_tables = gen9_blt_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table);
+			engine->get_cmd_length_mask =
+				gen9_blt_get_cmd_length_mask;
+
+			/* BCS Engine unsafe without parser */
+			engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER;
+		} else if (IS_HASWELL(engine->i915)) {
+			cmd_tables = hsw_blt_ring_cmd_table;
+			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table);
 		} else {
-			cmd_tables = gen7_blt_cmds;
-			cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
+			cmd_tables = gen7_blt_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table);
 		}
 
-		if (IS_HASWELL(engine->i915)) {
+		if (IS_GEN(engine->i915, 9)) {
+			engine->reg_tables = gen9_blt_reg_tables;
+			engine->reg_table_count =
+				ARRAY_SIZE(gen9_blt_reg_tables);
+		} else if (IS_HASWELL(engine->i915)) {
 			engine->reg_tables = hsw_blt_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables);
 		} else {
 			engine->reg_tables = ivb_blt_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables);
 		}
-
-		engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
 		break;
 	case VIDEO_ENHANCEMENT_CLASS:
-		cmd_tables = hsw_vebox_cmds;
-		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
+		cmd_tables = hsw_vebox_cmd_table;
+		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table);
 		/* VECS can use the same length_mask function as VCS */
 		engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
 		break;
@@ -943,7 +1026,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 		return;
 	}
 
-	engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER;
+	engine->flags |= I915_ENGINE_USING_CMD_PARSER;
 }
 
 /**
@@ -955,7 +1038,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
  */
 void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine)
 {
-	if (!intel_engine_needs_cmd_parser(engine))
+	if (!intel_engine_using_cmd_parser(engine))
 		return;
 
 	fini_hash_table(engine);
@@ -1029,22 +1112,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr)
 }
 
 static const struct drm_i915_reg_descriptor *
-find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr)
+find_reg(const struct intel_engine_cs *engine, u32 addr)
 {
 	const struct drm_i915_reg_table *table = engine->reg_tables;
+	const struct drm_i915_reg_descriptor *reg = NULL;
 	int count = engine->reg_table_count;
 
-	for (; count > 0; ++table, --count) {
-		if (!table->master || is_master) {
-			const struct drm_i915_reg_descriptor *reg;
+	for (; !reg && (count > 0); ++table, --count)
+		reg = __find_reg(table->regs, table->num_regs, addr);
 
-			reg = __find_reg(table->regs, table->num_regs, addr);
-			if (reg != NULL)
-				return reg;
-		}
-	}
-
-	return NULL;
+	return reg;
 }
 
 /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
@@ -1128,8 +1205,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 
 static bool check_cmd(const struct intel_engine_cs *engine,
 		      const struct drm_i915_cmd_descriptor *desc,
-		      const u32 *cmd, u32 length,
-		      const bool is_master)
+		      const u32 *cmd, u32 length)
 {
 	if (desc->flags & CMD_DESC_SKIP)
 		return true;
@@ -1139,12 +1215,6 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 		return false;
 	}
 
-	if ((desc->flags & CMD_DESC_MASTER) && !is_master) {
-		DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n",
-				 *cmd);
-		return false;
-	}
-
 	if (desc->flags & CMD_DESC_REGISTER) {
 		/*
 		 * Get the distance between individual register offset
@@ -1158,7 +1228,7 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 		     offset += step) {
 			const u32 reg_addr = cmd[offset] & desc->reg.mask;
 			const struct drm_i915_reg_descriptor *reg =
-				find_reg(engine, is_master, reg_addr);
+				find_reg(engine, reg_addr);
 
 			if (!reg) {
 				DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n",
@@ -1236,16 +1306,112 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 	return true;
 }
 
+static int check_bbstart(const struct i915_gem_context *ctx,
+			 u32 *cmd, u32 offset, u32 length,
+			 u32 batch_len,
+			 u64 batch_start,
+			 u64 shadow_batch_start)
+{
+	u64 jump_offset, jump_target;
+	u32 target_cmd_offset, target_cmd_index;
+
+	/* For igt compatibility on older platforms */
+	if (CMDPARSER_USES_GGTT(ctx->i915)) {
+		DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n");
+		return -EACCES;
+	}
+
+	if (length != 3) {
+		DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n",
+			  length);
+		return -EINVAL;
+	}
+
+	jump_target = *(u64*)(cmd+1);
+	jump_offset = jump_target - batch_start;
+
+	/*
+	 * Any underflow of jump_target is guaranteed to be outside the range
+	 * of a u32, so >= test catches both too large and too small
+	 */
+	if (jump_offset >= batch_len) {
+		DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n",
+			  jump_target);
+		return -EINVAL;
+	}
+
+	/*
+	 * This cannot overflow a u32 because we already checked jump_offset
+	 * is within the BB, and the batch_len is a u32
+	 */
+	target_cmd_offset = lower_32_bits(jump_offset);
+	target_cmd_index = target_cmd_offset / sizeof(u32);
+
+	*(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset;
+
+	if (target_cmd_index == offset)
+		return 0;
+
+	if (ctx->jump_whitelist_cmds <= target_cmd_index) {
+		DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n");
+		return -EINVAL;
+	} else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) {
+		DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n",
+			  jump_target);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
+{
+	const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32));
+	const u32 exact_size = BITS_TO_LONGS(batch_cmds);
+	u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds));
+	unsigned long *next_whitelist;
+
+	if (CMDPARSER_USES_GGTT(ctx->i915))
+		return;
+
+	if (batch_cmds <= ctx->jump_whitelist_cmds) {
+		bitmap_zero(ctx->jump_whitelist, batch_cmds);
+		return;
+	}
+
+again:
+	next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL);
+	if (next_whitelist) {
+		kfree(ctx->jump_whitelist);
+		ctx->jump_whitelist = next_whitelist;
+		ctx->jump_whitelist_cmds =
+			next_size * BITS_PER_BYTE * sizeof(long);
+		return;
+	}
+
+	if (next_size > exact_size) {
+		next_size = exact_size;
+		goto again;
+	}
+
+	DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n");
+	bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds);
+
+	return;
+}
+
 #define LENGTH_BIAS 2
 
 /**
  * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
+ * @ctx: the context in which the batch is to execute
  * @engine: the engine on which the batch is to execute
  * @batch_obj: the batch buffer in question
- * @shadow_batch_obj: copy of the batch buffer in question
+ * @batch_start: Canonical base address of batch
  * @batch_start_offset: byte offset in the batch at which execution starts
  * @batch_len: length of the commands in batch_obj
- * @is_master: is the submitting process the drm master?
+ * @shadow_batch_obj: copy of the batch buffer in question
+ * @shadow_batch_start: Canonical base address of shadow_batch_obj
  *
  * Parses the specified batch buffer looking for privilege violations as
  * described in the overview.
@@ -1253,14 +1419,17 @@ static bool check_cmd(const struct intel_engine_cs *engine,
  * Return: non-zero if the parser finds violations or otherwise fails; -EACCES
  * if the batch appears legal but should use hardware parsing
  */
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+
+int intel_engine_cmd_parser(struct i915_gem_context *ctx,
+			    struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
-			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 batch_start,
 			    u32 batch_start_offset,
 			    u32 batch_len,
-			    bool is_master)
+			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 shadow_batch_start)
 {
-	u32 *cmd, *batch_end;
+	u32 *cmd, *batch_end, offset = 0;
 	struct drm_i915_cmd_descriptor default_desc = noop_desc;
 	const struct drm_i915_cmd_descriptor *desc = &default_desc;
 	bool needs_clflush_after = false;
@@ -1274,6 +1443,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		return PTR_ERR(cmd);
 	}
 
+	init_whitelist(ctx, batch_len);
+
 	/*
 	 * We use the batch length as size because the shadow object is as
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
@@ -1283,31 +1454,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 	do {
 		u32 length;
 
-		if (*cmd == MI_BATCH_BUFFER_END) {
-			if (needs_clflush_after) {
-				void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
-				drm_clflush_virt_range(ptr,
-						       (void *)(cmd + 1) - ptr);
-			}
+		if (*cmd == MI_BATCH_BUFFER_END)
 			break;
-		}
 
 		desc = find_cmd(engine, *cmd, desc, &default_desc);
 		if (!desc) {
 			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
 					 *cmd);
 			ret = -EINVAL;
-			break;
-		}
-
-		/*
-		 * If the batch buffer contains a chained batch, return an
-		 * error that tells the caller to abort and dispatch the
-		 * workload as a non-secure batch.
-		 */
-		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
-			ret = -EACCES;
-			break;
+			goto err;
 		}
 
 		if (desc->flags & CMD_DESC_FIXED)
@@ -1321,22 +1476,43 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 					 length,
 					 batch_end - cmd);
 			ret = -EINVAL;
-			break;
+			goto err;
 		}
 
-		if (!check_cmd(engine, desc, cmd, length, is_master)) {
+		if (!check_cmd(engine, desc, cmd, length)) {
 			ret = -EACCES;
+			goto err;
+		}
+
+		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+			ret = check_bbstart(ctx, cmd, offset, length,
+					    batch_len, batch_start,
+					    shadow_batch_start);
+
+			if (ret)
+				goto err;
 			break;
 		}
 
+		if (ctx->jump_whitelist_cmds > offset)
+			set_bit(offset, ctx->jump_whitelist);
+
 		cmd += length;
+		offset += length;
 		if  (cmd >= batch_end) {
 			DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
 			ret = -EINVAL;
-			break;
+			goto err;
 		}
 	} while (1);
 
+	if (needs_clflush_after) {
+		void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
+
+		drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr);
+	}
+
+err:
 	i915_gem_object_unpin_map(shadow_batch_obj);
 	return ret;
 }
@@ -1357,7 +1533,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
 
 	/* If the command parser is not enabled, report 0 - unsupported */
 	for_each_uabi_engine(engine, dev_priv) {
-		if (intel_engine_needs_cmd_parser(engine)) {
+		if (intel_engine_using_cmd_parser(engine)) {
 			active = true;
 			break;
 		}
@@ -1382,6 +1558,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
 	 *    the parser enabled.
 	 * 9. Don't whitelist or handle oacontrol specially, as ownership
 	 *    for oacontrol state is moving to i915-perf.
+	 * 10. Support for Gen9 BCS Parsing
 	 */
-	return 9;
+	return 10;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ccb5b566795f..87e05ca3646a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -63,6 +63,7 @@
 #include "gem/i915_gem_ioctls.h"
 #include "gt/intel_gt.h"
 #include "gt/intel_gt_pm.h"
+#include "gt/intel_rc6.h"
 
 #include "i915_debugfs.h"
 #include "i915_drv.h"
@@ -1819,6 +1820,8 @@ static int i915_drm_resume(struct drm_device *dev)
 
 	disable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
+	intel_rc6_ctx_wa_resume(&dev_priv->gt.rc6);
+
 	intel_gt_sanitize(&dev_priv->gt, true);
 
 	ret = i915_ggtt_enable_hw(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0567fe67e8c5..e29bc137e7ba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1614,9 +1614,16 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 #define VEBOX_MASK(dev_priv) \
 	ENGINE_INSTANCES_MASK(dev_priv, VECS0, I915_MAX_VECS)
 
+/*
+ * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution
+ * All later gens can run the final buffer from the ppgtt
+ */
+#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN(dev_priv, 7)
+
 #define HAS_LLC(dev_priv)	(INTEL_INFO(dev_priv)->has_llc)
 #define HAS_SNOOP(dev_priv)	(INTEL_INFO(dev_priv)->has_snoop)
 #define HAS_EDRAM(dev_priv)	((dev_priv)->edram_size_mb)
+#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6)
 #define HAS_WT(dev_priv)	((IS_HASWELL(dev_priv) || \
 				 IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv))
 
@@ -1649,10 +1656,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 /* Early gen2 have a totally busted CS tlb and require pinned batches. */
 #define HAS_BROKEN_CS_TLB(dev_priv)	(IS_I830(dev_priv) || IS_I845G(dev_priv))
 
+#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv)	\
+	(IS_BROADWELL(dev_priv) || IS_GEN(dev_priv, 9))
+
 /* WaRsDisableCoarsePowerGating:skl,cnl */
 #define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \
-	(IS_CANNONLAKE(dev_priv) || \
-	 IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv))
+	(IS_CANNONLAKE(dev_priv) || IS_GEN(dev_priv, 9))
 
 #define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4)
 #define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \
@@ -1834,6 +1843,14 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj,
 			   unsigned long flags);
 #define I915_GEM_OBJECT_UNBIND_ACTIVE BIT(0)
 
+struct i915_vma * __must_check
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+		    struct i915_address_space *vm,
+		    const struct i915_ggtt_view *view,
+		    u64 size,
+		    u64 alignment,
+		    u64 flags);
+
 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv);
 
 static inline int __must_check
@@ -1939,12 +1956,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
 int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv);
 void intel_engine_init_cmd_parser(struct intel_engine_cs *engine);
 void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine);
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+int intel_engine_cmd_parser(struct i915_gem_context *cxt,
+			    struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
-			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 user_batch_start,
 			    u32 batch_start_offset,
 			    u32 batch_len,
-			    bool is_master);
+			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 shadow_batch_start);
 
 /* intel_device_info.c */
 static inline struct intel_device_info *
@@ -2026,4 +2045,10 @@ i915_coherent_map_type(struct drm_i915_private *i915)
 	return HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC;
 }
 
+static inline bool intel_guc_submission_is_enabled(struct intel_guc *guc)
+{
+	return intel_guc_is_submission_supported(guc) &&
+		intel_guc_is_running(guc);
+}
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ac23322fb382..b9eb6b3149b7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -893,6 +893,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
 {
 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct i915_address_space *vm = &dev_priv->ggtt.vm;
+
+	return i915_gem_object_pin(obj, vm, view, size, alignment,
+				   flags | PIN_GLOBAL);
+}
+
+struct i915_vma *
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+		    struct i915_address_space *vm,
+		    const struct i915_ggtt_view *view,
+		    u64 size,
+		    u64 alignment,
+		    u64 flags)
+{
+	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct i915_vma *vma;
 	int ret;
 
@@ -958,7 +972,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
 			return ERR_PTR(ret);
 	}
 
-	ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
+	ret = i915_vma_pin(vma, size, alignment, flags);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 88179202c556..6239a9adbf14 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2609,8 +2609,6 @@ static int init_aliasing_ppgtt(struct i915_ggtt *ggtt)
 	GEM_BUG_ON(ggtt->vm.vma_ops.unbind_vma != ggtt_unbind_vma);
 	ggtt->vm.vma_ops.unbind_vma = aliasing_gtt_unbind_vma;
 
-	ppgtt->vm.total = ggtt->vm.total;
-
 	return 0;
 
 err_ppgtt:
diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c
index ad33fbe90a28..cf8a8c3ef047 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -63,7 +63,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
 		value = !!(i915->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
 		break;
 	case I915_PARAM_HAS_SECURE_BATCHES:
-		value = capable(CAP_SYS_ADMIN);
+		value = HAS_SECURE_BATCHES(i915) && capable(CAP_SYS_ADMIN);
 		break;
 	case I915_PARAM_CMD_PARSER_VERSION:
 		value = i915_cmd_parser_get_version(i915);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e8b67f5e521d..3c85cb0ee99f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1029,9 +1029,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 		for_each_sgt_daddr(dma, iter, vma->pages) {
 			void __iomem *s;
 
-			s = io_mapping_map_atomic_wc(&mem->iomap, dma);
+			s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
 			ret = compress_page(compress, (void __force *)s, dst);
-			io_mapping_unmap_atomic(s);
+			io_mapping_unmap(s);
 			if (ret)
 				break;
 		}
@@ -1043,9 +1043,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 
 			drm_clflush_pages(&page, 1);
 
-			s = kmap_atomic(page);
+			s = kmap(page);
 			ret = compress_page(compress, s, dst);
-			kunmap_atomic(s);
+			kunmap(s);
 
 			drm_clflush_pages(&page, 1);
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index a8c2318d3d5e..65d7c2e599de 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1870,7 +1870,7 @@ alloc_oa_config_buffer(struct i915_perf_stream *stream,
 	config_length += num_lri_dwords(oa_config->mux_regs_len);
 	config_length += num_lri_dwords(oa_config->b_counter_regs_len);
 	config_length += num_lri_dwords(oa_config->flex_regs_len);
-	config_length++; /* MI_BATCH_BUFFER_END */
+	config_length += 3; /* MI_BATCH_BUFFER_START */
 	config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
 
 	obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
@@ -1895,7 +1895,12 @@ alloc_oa_config_buffer(struct i915_perf_stream *stream,
 			     oa_config->flex_regs,
 			     oa_config->flex_regs_len);
 
-	*cs++ = MI_BATCH_BUFFER_END;
+	/* Jump into the active wait. */
+	*cs++ = (INTEL_GEN(stream->perf->i915) < 8 ?
+		 MI_BATCH_BUFFER_START :
+		 MI_BATCH_BUFFER_START_GEN8);
+	*cs++ = i915_ggtt_offset(stream->noa_wait);
+	*cs++ = 0;
 
 	i915_gem_object_flush_map(obj);
 	i915_gem_object_unpin_map(obj);
@@ -3307,15 +3312,6 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
 		}
 	}
 
-	if (props->hold_preemption) {
-		if (!props->single_context) {
-			DRM_DEBUG("preemption disable with no context\n");
-			ret = -EINVAL;
-			goto err;
-		}
-		privileged_op = true;
-	}
-
 	/*
 	 * On Haswell the OA unit supports clock gating off for a specific
 	 * context and in this mode there's no visibility of metrics for the
@@ -3335,12 +3331,21 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	 * doesn't request global stream access (i.e. query based sampling
 	 * using MI_RECORD_PERF_COUNT.
 	 */
-	if (IS_HASWELL(perf->i915) && specific_ctx && !props->hold_preemption)
+	if (IS_HASWELL(perf->i915) && specific_ctx)
 		privileged_op = false;
 	else if (IS_GEN(perf->i915, 12) && specific_ctx &&
 		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
 		privileged_op = false;
 
+	if (props->hold_preemption) {
+		if (!props->single_context) {
+			DRM_DEBUG("preemption disable with no context\n");
+			ret = -EINVAL;
+			goto err;
+		}
+		privileged_op = true;
+	}
+
 	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 	 * we check a dev.i915.perf_stream_paranoid sysctl option
 	 * to determine if it's ok to access system wide OA counters
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 05395015d1f2..2814218c5ba1 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -190,7 +190,7 @@ static u64 get_rc6(struct intel_gt *gt)
 	val = 0;
 	if (intel_gt_pm_get_if_awake(gt)) {
 		val = __get_rc6(gt);
-		intel_gt_pm_put(gt);
+		intel_gt_pm_put_async(gt);
 	}
 
 	spin_lock_irqsave(&pmu->lock, flags);
@@ -343,7 +343,7 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
 
 skip:
 		spin_unlock_irqrestore(&engine->uncore->lock, flags);
-		intel_engine_pm_put(engine);
+		intel_engine_pm_put_async(engine);
 	}
 }
 
@@ -368,7 +368,7 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
 		if (intel_gt_pm_get_if_awake(gt)) {
 			val = intel_uncore_read_notrace(uncore, GEN6_RPSTAT1);
 			val = intel_get_cagf(rps, val);
-			intel_gt_pm_put(gt);
+			intel_gt_pm_put_async(gt);
 		}
 
 		add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
@@ -878,8 +878,8 @@ create_event_attributes(struct i915_pmu *pmu)
 		const char *name;
 		const char *unit;
 	} events[] = {
-		__event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "MHz"),
-		__event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"),
+		__event(I915_PMU_ACTUAL_FREQUENCY, "actual-frequency", "M"),
+		__event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "M"),
 		__event(I915_PMU_INTERRUPTS, "interrupts", NULL),
 		__event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
 	};
diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index c27cfef9281c..ef25ce6e395e 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -103,15 +103,18 @@ query_engine_info(struct drm_i915_private *i915,
 	struct drm_i915_engine_info __user *info_ptr;
 	struct drm_i915_query_engine_info query;
 	struct drm_i915_engine_info info = { };
+	unsigned int num_uabi_engines = 0;
 	struct intel_engine_cs *engine;
 	int len, ret;
 
 	if (query_item->flags)
 		return -EINVAL;
 
+	for_each_uabi_engine(engine, i915)
+		num_uabi_engines++;
+
 	len = sizeof(struct drm_i915_query_engine_info) +
-	      RUNTIME_INFO(i915)->num_engines *
-	      sizeof(struct drm_i915_engine_info);
+	      num_uabi_engines * sizeof(struct drm_i915_engine_info);
 
 	ret = copy_query_item(&query, sizeof(query), len, query_item);
 	if (ret != 0)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 53c280c4e741..73079b503724 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -474,6 +474,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   ECOCHK_PPGTT_WT_HSW		(0x2 << 3)
 #define   ECOCHK_PPGTT_WB_HSW		(0x3 << 3)
 
+#define GEN8_RC6_CTX_INFO		_MMIO(0x8504)
+
 #define GAC_ECO_BITS			_MMIO(0x14090)
 #define   ECOBITS_SNB_BIT		(1 << 13)
 #define   ECOBITS_PPGTT_CACHE64B	(3 << 8)
@@ -560,6 +562,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
  */
 #define BCS_SWCTRL _MMIO(0x22200)
 
+/* There are 16 GPR registers */
+#define BCS_GPR(n)	_MMIO(0x22600 + (n) * 8)
+#define BCS_GPR_UDW(n)	_MMIO(0x22600 + (n) * 8 + 4)
+
 #define GPGPU_THREADS_DISPATCHED        _MMIO(0x2290)
 #define GPGPU_THREADS_DISPATCHED_UDW	_MMIO(0x2290 + 4)
 #define HS_INVOCATION_COUNT             _MMIO(0x2300)
@@ -7353,6 +7359,10 @@ enum {
 
 #define DMC_DEBUG3		_MMIO(0x101090)
 
+/* Display Internal Timeout Register */
+#define RM_TIMEOUT		_MMIO(0x42060)
+#define  MMIO_TIMEOUT_US(us)	((us) << 0)
+
 /* interrupts */
 #define DE_MASTER_IRQ_CONTROL   (1 << 31)
 #define DE_SPRITEB_FLIP_DONE    (1 << 29)
@@ -9661,7 +9671,7 @@ enum skl_power_gate {
 #define  TRANS_DDI_EDP_INPUT_A_ONOFF	(4 << 12)
 #define  TRANS_DDI_EDP_INPUT_B_ONOFF	(5 << 12)
 #define  TRANS_DDI_EDP_INPUT_C_ONOFF	(6 << 12)
-#define  TRANS_DDI_MST_TRANSPORT_SELECT_MASK	REG_GENMASK(12, 10)
+#define  TRANS_DDI_MST_TRANSPORT_SELECT_MASK	REG_GENMASK(11, 10)
 #define  TRANS_DDI_MST_TRANSPORT_SELECT(trans)	\
 	REG_FIELD_PREP(TRANS_DDI_MST_TRANSPORT_SELECT_MASK, trans)
 #define  TRANS_DDI_HDCP_SIGNALLING	(1 << 9)
diff --git a/drivers/gpu/drm/i915/intel_pch.c b/drivers/gpu/drm/i915/intel_pch.c
index 000ba43e2c02..8fd92b9130a7 100644
--- a/drivers/gpu/drm/i915/intel_pch.c
+++ b/drivers/gpu/drm/i915/intel_pch.c
@@ -62,7 +62,6 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id)
 		/* KBP is SPT compatible */
 		return PCH_SPT;
 	case INTEL_PCH_CNP_DEVICE_ID_TYPE:
-	case INTEL_PCH_CNP2_DEVICE_ID_TYPE:
 		DRM_DEBUG_KMS("Found Cannon Lake PCH (CNP)\n");
 		WARN_ON(!IS_CANNONLAKE(dev_priv) && !IS_COFFEELAKE(dev_priv));
 		return PCH_CNP;
@@ -76,6 +75,11 @@ intel_pch_type(const struct drm_i915_private *dev_priv, unsigned short id)
 		WARN_ON(!IS_COFFEELAKE(dev_priv));
 		/* CometPoint is CNP Compatible */
 		return PCH_CNP;
+	case INTEL_PCH_CMP_V_DEVICE_ID_TYPE:
+		DRM_DEBUG_KMS("Found Comet Lake V PCH (CMP-V)\n");
+		WARN_ON(!IS_COFFEELAKE(dev_priv));
+		/* Comet Lake V PCH is based on KBP, which is SPT compatible */
+		return PCH_SPT;
 	case INTEL_PCH_ICP_DEVICE_ID_TYPE:
 		DRM_DEBUG_KMS("Found Ice Lake PCH\n");
 		WARN_ON(!IS_ICELAKE(dev_priv));
diff --git a/drivers/gpu/drm/i915/intel_pch.h b/drivers/gpu/drm/i915/intel_pch.h
index 1115c6a0522c..d26c25dd8d54 100644
--- a/drivers/gpu/drm/i915/intel_pch.h
+++ b/drivers/gpu/drm/i915/intel_pch.h
@@ -40,10 +40,10 @@ enum intel_pch {
 #define INTEL_PCH_SPT_LP_DEVICE_ID_TYPE		0x9D00
 #define INTEL_PCH_KBP_DEVICE_ID_TYPE		0xA280
 #define INTEL_PCH_CNP_DEVICE_ID_TYPE		0xA300
-#define INTEL_PCH_CNP2_DEVICE_ID_TYPE		0xA380
 #define INTEL_PCH_CNP_LP_DEVICE_ID_TYPE		0x9D80
 #define INTEL_PCH_CMP_DEVICE_ID_TYPE		0x0280
 #define INTEL_PCH_CMP2_DEVICE_ID_TYPE		0x0680
+#define INTEL_PCH_CMP_V_DEVICE_ID_TYPE		0xA380
 #define INTEL_PCH_ICP_DEVICE_ID_TYPE		0x3480
 #define INTEL_PCH_MCC_DEVICE_ID_TYPE		0x4B00
 #define INTEL_PCH_TGP_DEVICE_ID_TYPE		0xA080
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 5d2b460d3ee5..809bff955b5a 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -107,6 +107,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
 	 */
 	I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
 		   PWM1_GATING_DIS | PWM2_GATING_DIS);
+
+	/*
+	 * Lower the display internal timeout.
+	 * This is needed to avoid any hard hangs when DSI port PLL
+	 * is off and a MMIO access is attempted by any privilege
+	 * application, using batch buffers or any other means.
+	 */
+	I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950));
 }
 
 static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index 868cc78048d0..59aa1b6f1827 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -54,7 +54,8 @@ int __intel_wakeref_get_first(struct intel_wakeref *wf)
 
 static void ____intel_wakeref_put_last(struct intel_wakeref *wf)
 {
-	if (!atomic_dec_and_test(&wf->count))
+	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
+	if (unlikely(!atomic_dec_and_test(&wf->count)))
 		goto unlock;
 
 	/* ops->put() must reschedule its own release on error/deferral */
@@ -67,13 +68,12 @@ unlock:
 	mutex_unlock(&wf->mutex);
 }
 
-void __intel_wakeref_put_last(struct intel_wakeref *wf)
+void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags)
 {
 	INTEL_WAKEREF_BUG_ON(work_pending(&wf->work));
 
 	/* Assume we are not in process context and so cannot sleep. */
-	if (wf->ops->flags & INTEL_WAKEREF_PUT_ASYNC ||
-	    !mutex_trylock(&wf->mutex)) {
+	if (flags & INTEL_WAKEREF_PUT_ASYNC || !mutex_trylock(&wf->mutex)) {
 		schedule_work(&wf->work);
 		return;
 	}
@@ -109,8 +109,17 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 
 int intel_wakeref_wait_for_idle(struct intel_wakeref *wf)
 {
-	return wait_var_event_killable(&wf->wakeref,
-				       !intel_wakeref_is_active(wf));
+	int err;
+
+	might_sleep();
+
+	err = wait_var_event_killable(&wf->wakeref,
+				      !intel_wakeref_is_active(wf));
+	if (err)
+		return err;
+
+	intel_wakeref_unlock_wait(wf);
+	return 0;
 }
 
 static void wakeref_auto_timeout(struct timer_list *t)
diff --git a/drivers/gpu/drm/i915/intel_wakeref.h b/drivers/gpu/drm/i915/intel_wakeref.h
index 5f0c972a80fb..da6e8fd506e6 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.h
+++ b/drivers/gpu/drm/i915/intel_wakeref.h
@@ -9,6 +9,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bits.h>
+#include <linux/lockdep.h>
 #include <linux/mutex.h>
 #include <linux/refcount.h>
 #include <linux/stackdepot.h>
@@ -29,9 +30,6 @@ typedef depot_stack_handle_t intel_wakeref_t;
 struct intel_wakeref_ops {
 	int (*get)(struct intel_wakeref *wf);
 	int (*put)(struct intel_wakeref *wf);
-
-	unsigned long flags;
-#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
 };
 
 struct intel_wakeref {
@@ -57,7 +55,7 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 } while (0)
 
 int __intel_wakeref_get_first(struct intel_wakeref *wf);
-void __intel_wakeref_put_last(struct intel_wakeref *wf);
+void __intel_wakeref_put_last(struct intel_wakeref *wf, unsigned long flags);
 
 /**
  * intel_wakeref_get: Acquire the wakeref
@@ -100,10 +98,9 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
 }
 
 /**
- * intel_wakeref_put: Release the wakeref
- * @i915: the drm_i915_private device
+ * intel_wakeref_put_flags: Release the wakeref
  * @wf: the wakeref
- * @fn: callback for releasing the wakeref, called only on final release.
+ * @flags: control flags
  *
  * Release our hold on the wakeref. When there are no more users,
  * the runtime pm wakeref will be released after the @fn callback is called
@@ -116,11 +113,25 @@ intel_wakeref_get_if_active(struct intel_wakeref *wf)
  * code otherwise.
  */
 static inline void
-intel_wakeref_put(struct intel_wakeref *wf)
+__intel_wakeref_put(struct intel_wakeref *wf, unsigned long flags)
+#define INTEL_WAKEREF_PUT_ASYNC BIT(0)
 {
 	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count) <= 0);
 	if (unlikely(!atomic_add_unless(&wf->count, -1, 1)))
-		__intel_wakeref_put_last(wf);
+		__intel_wakeref_put_last(wf, flags);
+}
+
+static inline void
+intel_wakeref_put(struct intel_wakeref *wf)
+{
+	might_sleep();
+	__intel_wakeref_put(wf, 0);
+}
+
+static inline void
+intel_wakeref_put_async(struct intel_wakeref *wf)
+{
+	__intel_wakeref_put(wf, INTEL_WAKEREF_PUT_ASYNC);
 }
 
 /**
@@ -152,6 +163,21 @@ intel_wakeref_unlock(struct intel_wakeref *wf)
 }
 
 /**
+ * intel_wakeref_unlock_wait: Wait until the active callback is complete
+ * @wf: the wakeref
+ *
+ * Waits for the active callback (under the @wf->mutex or another CPU) is
+ * complete.
+ */
+static inline void
+intel_wakeref_unlock_wait(struct intel_wakeref *wf)
+{
+	mutex_lock(&wf->mutex);
+	mutex_unlock(&wf->mutex);
+	flush_work(&wf->work);
+}
+
+/**
  * intel_wakeref_is_active: Query whether the wakeref is currently held
  * @wf: the wakeref
  *
@@ -170,6 +196,7 @@ intel_wakeref_is_active(const struct intel_wakeref *wf)
 static inline void
 __intel_wakeref_defer_park(struct intel_wakeref *wf)
 {
+	lockdep_assert_held(&wf->mutex);
 	INTEL_WAKEREF_BUG_ON(atomic_read(&wf->count));
 	atomic_set_release(&wf->count, 1);
 }