From b7ea04d299c78b6cf96ab281a1683ff62a74f969 Mon Sep 17 00:00:00 2001
From: Sean Paul <seanpaul@chromium.org>
Date: Thu, 29 Nov 2018 10:04:17 -0500
Subject: drm: Add DRM_MODESET_LOCK_BEGIN/END helpers

This patch adds a couple of helpers to remove the boilerplate involved
in grabbing all of the modeset locks.

I've also converted the obvious cases in drm core to use the helpers.

The only remaining instance of drm_modeset_lock_all_ctx() is in
drm_framebuffer. It's complicated by the state clear that occurs on
deadlock. ATM, there's no way to inject code in the deadlock path with
the helpers, so it's unfit for conversion.

Changes in v2:
- Relocate ret argument to the end of the list (Daniel)
- Incorporate Daniel's doc suggestions (Daniel)

Suggested-by: Daniel Vetter <daniel@ffwll.ch>
Reviewed-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Sean Paul <seanpaul@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20181129150423.239081-4-sean@poorly.run
---
 include/drm/drm_modeset_lock.h | 59 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_modeset_lock.h b/include/drm/drm_modeset_lock.h
index a685d1bb21f2..a308f2d6496f 100644
--- a/include/drm/drm_modeset_lock.h
+++ b/include/drm/drm_modeset_lock.h
@@ -130,4 +130,63 @@ void drm_warn_on_modeset_not_all_locked(struct drm_device *dev);
 int drm_modeset_lock_all_ctx(struct drm_device *dev,
 			     struct drm_modeset_acquire_ctx *ctx);
 
+/**
+ * DRM_MODESET_LOCK_ALL_BEGIN - Helper to acquire modeset locks
+ * @dev: drm device
+ * @ctx: local modeset acquire context, will be dereferenced
+ * @flags: DRM_MODESET_ACQUIRE_* flags to pass to drm_modeset_acquire_init()
+ * @ret: local ret/err/etc variable to track error status
+ *
+ * Use these macros to simplify grabbing all modeset locks using a local
+ * context. This has the advantage of reducing boilerplate, but also properly
+ * checking return values where appropriate.
+ *
+ * Any code run between BEGIN and END will be holding the modeset locks.
+ *
+ * This must be paired with DRM_MODESET_LOCK_ALL_END(). We will jump back and
+ * forth between the labels on deadlock and error conditions.
+ *
+ * Drivers can acquire additional modeset locks. If any lock acquisition
+ * fails, the control flow needs to jump to DRM_MODESET_LOCK_ALL_END() with
+ * the @ret parameter containing the return value of drm_modeset_lock().
+ *
+ * Returns:
+ * The only possible value of ret immediately after DRM_MODESET_LOCK_ALL_BEGIN()
+ * is 0, so no error checking is necessary
+ */
+#define DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, flags, ret)		\
+	drm_modeset_acquire_init(&ctx, flags);				\
+modeset_lock_retry:							\
+	ret = drm_modeset_lock_all_ctx(dev, &ctx);			\
+	if (ret)							\
+		goto modeset_lock_fail;
+
+/**
+ * DRM_MODESET_LOCK_ALL_END - Helper to release and cleanup modeset locks
+ * @ctx: local modeset acquire context, will be dereferenced
+ * @ret: local ret/err/etc variable to track error status
+ *
+ * The other side of DRM_MODESET_LOCK_ALL_BEGIN(). It will bounce back to BEGIN
+ * if ret is -EDEADLK.
+ *
+ * It's important that you use the same ret variable for begin and end so
+ * deadlock conditions are properly handled.
+ *
+ * Returns:
+ * ret will be untouched unless it is -EDEADLK on entry. That means that if you
+ * successfully acquire the locks, ret will be whatever your code sets it to. If
+ * there is a deadlock or other failure with acquire or backoff, ret will be set
+ * to that failure. In both of these cases the code between BEGIN/END will not
+ * be run, so the failure will reflect the inability to grab the locks.
+ */
+#define DRM_MODESET_LOCK_ALL_END(ctx, ret)				\
+modeset_lock_fail:							\
+	if (ret == -EDEADLK) {						\
+		ret = drm_modeset_backoff(&ctx);			\
+		if (!ret)						\
+			goto modeset_lock_retry;			\
+	}								\
+	drm_modeset_drop_locks(&ctx);					\
+	drm_modeset_acquire_fini(&ctx);
+
 #endif /* DRM_MODESET_LOCK_H_ */
-- 
cgit v1.2.3


From 1d8224e790c7f9d0091a299b985c76ba0b229f43 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Wed, 28 Nov 2018 11:07:28 +0100
Subject: drm: Fix up drm_atomic_state_helper.[hc] extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I've misplaced two functions by accident:
- drm_atomic_helper_duplicate_state is really part of the
  resume/suspend/shutdown device-wide helpers.
- drm_atomic_helper_legacy_gamma_set is part of the legacy ioctl
  compat helpers.

Move them both back.

Fixes: 9ef8a9dc4b21 ("drm: Extract drm_atomic_state_helper.[hc]")
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Sean Paul <sean@poorly.run>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181128100728.4674-1-daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_atomic_helper.c       | 157 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_atomic_state_helper.c | 157 ------------------------------
 include/drm/drm_atomic_helper.h           |   7 ++
 include/drm/drm_atomic_state_helper.h     |   7 --
 4 files changed, 164 insertions(+), 164 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 0ee83efeb94a..8150fa8387d5 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -3138,6 +3138,93 @@ void drm_atomic_helper_shutdown(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_atomic_helper_shutdown);
 
+/**
+ * drm_atomic_helper_duplicate_state - duplicate an atomic state object
+ * @dev: DRM device
+ * @ctx: lock acquisition context
+ *
+ * Makes a copy of the current atomic state by looping over all objects and
+ * duplicating their respective states. This is used for example by suspend/
+ * resume support code to save the state prior to suspend such that it can
+ * be restored upon resume.
+ *
+ * Note that this treats atomic state as persistent between save and restore.
+ * Drivers must make sure that this is possible and won't result in confusion
+ * or erroneous behaviour.
+ *
+ * Note that if callers haven't already acquired all modeset locks this might
+ * return -EDEADLK, which must be handled by calling drm_modeset_backoff().
+ *
+ * Returns:
+ * A pointer to the copy of the atomic state object on success or an
+ * ERR_PTR()-encoded error code on failure.
+ *
+ * See also:
+ * drm_atomic_helper_suspend(), drm_atomic_helper_resume()
+ */
+struct drm_atomic_state *
+drm_atomic_helper_duplicate_state(struct drm_device *dev,
+				  struct drm_modeset_acquire_ctx *ctx)
+{
+	struct drm_atomic_state *state;
+	struct drm_connector *conn;
+	struct drm_connector_list_iter conn_iter;
+	struct drm_plane *plane;
+	struct drm_crtc *crtc;
+	int err = 0;
+
+	state = drm_atomic_state_alloc(dev);
+	if (!state)
+		return ERR_PTR(-ENOMEM);
+
+	state->acquire_ctx = ctx;
+
+	drm_for_each_crtc(crtc, dev) {
+		struct drm_crtc_state *crtc_state;
+
+		crtc_state = drm_atomic_get_crtc_state(state, crtc);
+		if (IS_ERR(crtc_state)) {
+			err = PTR_ERR(crtc_state);
+			goto free;
+		}
+	}
+
+	drm_for_each_plane(plane, dev) {
+		struct drm_plane_state *plane_state;
+
+		plane_state = drm_atomic_get_plane_state(state, plane);
+		if (IS_ERR(plane_state)) {
+			err = PTR_ERR(plane_state);
+			goto free;
+		}
+	}
+
+	drm_connector_list_iter_begin(dev, &conn_iter);
+	drm_for_each_connector_iter(conn, &conn_iter) {
+		struct drm_connector_state *conn_state;
+
+		conn_state = drm_atomic_get_connector_state(state, conn);
+		if (IS_ERR(conn_state)) {
+			err = PTR_ERR(conn_state);
+			drm_connector_list_iter_end(&conn_iter);
+			goto free;
+		}
+	}
+	drm_connector_list_iter_end(&conn_iter);
+
+	/* clear the acquire context so that it isn't accidentally reused */
+	state->acquire_ctx = NULL;
+
+free:
+	if (err < 0) {
+		drm_atomic_state_put(state);
+		state = ERR_PTR(err);
+	}
+
+	return state;
+}
+EXPORT_SYMBOL(drm_atomic_helper_duplicate_state);
+
 /**
  * drm_atomic_helper_suspend - subsystem-level suspend helper
  * @dev: DRM device
@@ -3407,3 +3494,73 @@ fail:
 	return ret;
 }
 EXPORT_SYMBOL(drm_atomic_helper_page_flip_target);
+
+/**
+ * drm_atomic_helper_legacy_gamma_set - set the legacy gamma correction table
+ * @crtc: CRTC object
+ * @red: red correction table
+ * @green: green correction table
+ * @blue: green correction table
+ * @size: size of the tables
+ * @ctx: lock acquire context
+ *
+ * Implements support for legacy gamma correction table for drivers
+ * that support color management through the DEGAMMA_LUT/GAMMA_LUT
+ * properties. See drm_crtc_enable_color_mgmt() and the containing chapter for
+ * how the atomic color management and gamma tables work.
+ */
+int drm_atomic_helper_legacy_gamma_set(struct drm_crtc *crtc,
+				       u16 *red, u16 *green, u16 *blue,
+				       uint32_t size,
+				       struct drm_modeset_acquire_ctx *ctx)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_atomic_state *state;
+	struct drm_crtc_state *crtc_state;
+	struct drm_property_blob *blob = NULL;
+	struct drm_color_lut *blob_data;
+	int i, ret = 0;
+	bool replaced;
+
+	state = drm_atomic_state_alloc(crtc->dev);
+	if (!state)
+		return -ENOMEM;
+
+	blob = drm_property_create_blob(dev,
+					sizeof(struct drm_color_lut) * size,
+					NULL);
+	if (IS_ERR(blob)) {
+		ret = PTR_ERR(blob);
+		blob = NULL;
+		goto fail;
+	}
+
+	/* Prepare GAMMA_LUT with the legacy values. */
+	blob_data = blob->data;
+	for (i = 0; i < size; i++) {
+		blob_data[i].red = red[i];
+		blob_data[i].green = green[i];
+		blob_data[i].blue = blue[i];
+	}
+
+	state->acquire_ctx = ctx;
+	crtc_state = drm_atomic_get_crtc_state(state, crtc);
+	if (IS_ERR(crtc_state)) {
+		ret = PTR_ERR(crtc_state);
+		goto fail;
+	}
+
+	/* Reset DEGAMMA_LUT and CTM properties. */
+	replaced  = drm_property_replace_blob(&crtc_state->degamma_lut, NULL);
+	replaced |= drm_property_replace_blob(&crtc_state->ctm, NULL);
+	replaced |= drm_property_replace_blob(&crtc_state->gamma_lut, blob);
+	crtc_state->color_mgmt_changed |= replaced;
+
+	ret = drm_atomic_commit(state);
+
+fail:
+	drm_atomic_state_put(state);
+	drm_property_blob_put(blob);
+	return ret;
+}
+EXPORT_SYMBOL(drm_atomic_helper_legacy_gamma_set);
diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 3ba996069d69..60bd7d708e35 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -393,93 +393,6 @@ drm_atomic_helper_connector_duplicate_state(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_atomic_helper_connector_duplicate_state);
 
-/**
- * drm_atomic_helper_duplicate_state - duplicate an atomic state object
- * @dev: DRM device
- * @ctx: lock acquisition context
- *
- * Makes a copy of the current atomic state by looping over all objects and
- * duplicating their respective states. This is used for example by suspend/
- * resume support code to save the state prior to suspend such that it can
- * be restored upon resume.
- *
- * Note that this treats atomic state as persistent between save and restore.
- * Drivers must make sure that this is possible and won't result in confusion
- * or erroneous behaviour.
- *
- * Note that if callers haven't already acquired all modeset locks this might
- * return -EDEADLK, which must be handled by calling drm_modeset_backoff().
- *
- * Returns:
- * A pointer to the copy of the atomic state object on success or an
- * ERR_PTR()-encoded error code on failure.
- *
- * See also:
- * drm_atomic_helper_suspend(), drm_atomic_helper_resume()
- */
-struct drm_atomic_state *
-drm_atomic_helper_duplicate_state(struct drm_device *dev,
-				  struct drm_modeset_acquire_ctx *ctx)
-{
-	struct drm_atomic_state *state;
-	struct drm_connector *conn;
-	struct drm_connector_list_iter conn_iter;
-	struct drm_plane *plane;
-	struct drm_crtc *crtc;
-	int err = 0;
-
-	state = drm_atomic_state_alloc(dev);
-	if (!state)
-		return ERR_PTR(-ENOMEM);
-
-	state->acquire_ctx = ctx;
-
-	drm_for_each_crtc(crtc, dev) {
-		struct drm_crtc_state *crtc_state;
-
-		crtc_state = drm_atomic_get_crtc_state(state, crtc);
-		if (IS_ERR(crtc_state)) {
-			err = PTR_ERR(crtc_state);
-			goto free;
-		}
-	}
-
-	drm_for_each_plane(plane, dev) {
-		struct drm_plane_state *plane_state;
-
-		plane_state = drm_atomic_get_plane_state(state, plane);
-		if (IS_ERR(plane_state)) {
-			err = PTR_ERR(plane_state);
-			goto free;
-		}
-	}
-
-	drm_connector_list_iter_begin(dev, &conn_iter);
-	drm_for_each_connector_iter(conn, &conn_iter) {
-		struct drm_connector_state *conn_state;
-
-		conn_state = drm_atomic_get_connector_state(state, conn);
-		if (IS_ERR(conn_state)) {
-			err = PTR_ERR(conn_state);
-			drm_connector_list_iter_end(&conn_iter);
-			goto free;
-		}
-	}
-	drm_connector_list_iter_end(&conn_iter);
-
-	/* clear the acquire context so that it isn't accidentally reused */
-	state->acquire_ctx = NULL;
-
-free:
-	if (err < 0) {
-		drm_atomic_state_put(state);
-		state = ERR_PTR(err);
-	}
-
-	return state;
-}
-EXPORT_SYMBOL(drm_atomic_helper_duplicate_state);
-
 /**
  * __drm_atomic_helper_connector_destroy_state - release connector state
  * @state: connector state object to release
@@ -515,76 +428,6 @@ void drm_atomic_helper_connector_destroy_state(struct drm_connector *connector,
 }
 EXPORT_SYMBOL(drm_atomic_helper_connector_destroy_state);
 
-/**
- * drm_atomic_helper_legacy_gamma_set - set the legacy gamma correction table
- * @crtc: CRTC object
- * @red: red correction table
- * @green: green correction table
- * @blue: green correction table
- * @size: size of the tables
- * @ctx: lock acquire context
- *
- * Implements support for legacy gamma correction table for drivers
- * that support color management through the DEGAMMA_LUT/GAMMA_LUT
- * properties. See drm_crtc_enable_color_mgmt() and the containing chapter for
- * how the atomic color management and gamma tables work.
- */
-int drm_atomic_helper_legacy_gamma_set(struct drm_crtc *crtc,
-				       u16 *red, u16 *green, u16 *blue,
-				       uint32_t size,
-				       struct drm_modeset_acquire_ctx *ctx)
-{
-	struct drm_device *dev = crtc->dev;
-	struct drm_atomic_state *state;
-	struct drm_crtc_state *crtc_state;
-	struct drm_property_blob *blob = NULL;
-	struct drm_color_lut *blob_data;
-	int i, ret = 0;
-	bool replaced;
-
-	state = drm_atomic_state_alloc(crtc->dev);
-	if (!state)
-		return -ENOMEM;
-
-	blob = drm_property_create_blob(dev,
-					sizeof(struct drm_color_lut) * size,
-					NULL);
-	if (IS_ERR(blob)) {
-		ret = PTR_ERR(blob);
-		blob = NULL;
-		goto fail;
-	}
-
-	/* Prepare GAMMA_LUT with the legacy values. */
-	blob_data = blob->data;
-	for (i = 0; i < size; i++) {
-		blob_data[i].red = red[i];
-		blob_data[i].green = green[i];
-		blob_data[i].blue = blue[i];
-	}
-
-	state->acquire_ctx = ctx;
-	crtc_state = drm_atomic_get_crtc_state(state, crtc);
-	if (IS_ERR(crtc_state)) {
-		ret = PTR_ERR(crtc_state);
-		goto fail;
-	}
-
-	/* Reset DEGAMMA_LUT and CTM properties. */
-	replaced  = drm_property_replace_blob(&crtc_state->degamma_lut, NULL);
-	replaced |= drm_property_replace_blob(&crtc_state->ctm, NULL);
-	replaced |= drm_property_replace_blob(&crtc_state->gamma_lut, blob);
-	crtc_state->color_mgmt_changed |= replaced;
-
-	ret = drm_atomic_commit(state);
-
-fail:
-	drm_atomic_state_put(state);
-	drm_property_blob_put(blob);
-	return ret;
-}
-EXPORT_SYMBOL(drm_atomic_helper_legacy_gamma_set);
-
 /**
  * __drm_atomic_helper_private_duplicate_state - copy atomic private state
  * @obj: CRTC object
diff --git a/include/drm/drm_atomic_helper.h b/include/drm/drm_atomic_helper.h
index 25ca0097563e..58214be3bf3d 100644
--- a/include/drm/drm_atomic_helper.h
+++ b/include/drm/drm_atomic_helper.h
@@ -127,6 +127,9 @@ int __drm_atomic_helper_set_config(struct drm_mode_set *set,
 int drm_atomic_helper_disable_all(struct drm_device *dev,
 				  struct drm_modeset_acquire_ctx *ctx);
 void drm_atomic_helper_shutdown(struct drm_device *dev);
+struct drm_atomic_state *
+drm_atomic_helper_duplicate_state(struct drm_device *dev,
+				  struct drm_modeset_acquire_ctx *ctx);
 struct drm_atomic_state *drm_atomic_helper_suspend(struct drm_device *dev);
 int drm_atomic_helper_commit_duplicated_state(struct drm_atomic_state *state,
 					      struct drm_modeset_acquire_ctx *ctx);
@@ -145,6 +148,10 @@ int drm_atomic_helper_page_flip_target(
 				uint32_t flags,
 				uint32_t target,
 				struct drm_modeset_acquire_ctx *ctx);
+int drm_atomic_helper_legacy_gamma_set(struct drm_crtc *crtc,
+				       u16 *red, u16 *green, u16 *blue,
+				       uint32_t size,
+				       struct drm_modeset_acquire_ctx *ctx);
 
 /**
  * drm_atomic_crtc_for_each_plane - iterate over planes currently attached to CRTC
diff --git a/include/drm/drm_atomic_state_helper.h b/include/drm/drm_atomic_state_helper.h
index 5b82ccfdb502..66c92cbd8e16 100644
--- a/include/drm/drm_atomic_state_helper.h
+++ b/include/drm/drm_atomic_state_helper.h
@@ -65,16 +65,9 @@ __drm_atomic_helper_connector_duplicate_state(struct drm_connector *connector,
 					   struct drm_connector_state *state);
 struct drm_connector_state *
 drm_atomic_helper_connector_duplicate_state(struct drm_connector *connector);
-struct drm_atomic_state *
-drm_atomic_helper_duplicate_state(struct drm_device *dev,
-				  struct drm_modeset_acquire_ctx *ctx);
 void
 __drm_atomic_helper_connector_destroy_state(struct drm_connector_state *state);
 void drm_atomic_helper_connector_destroy_state(struct drm_connector *connector,
 					  struct drm_connector_state *state);
-int drm_atomic_helper_legacy_gamma_set(struct drm_crtc *crtc,
-				       u16 *red, u16 *green, u16 *blue,
-				       uint32_t size,
-				       struct drm_modeset_acquire_ctx *ctx);
 void __drm_atomic_helper_private_obj_duplicate_state(struct drm_private_obj *obj,
 						     struct drm_private_state *state);
-- 
cgit v1.2.3


From 1584f16ca96ef124aad79efa3303cff5f3530e2c Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 28 Nov 2018 15:09:25 -0800
Subject: drm/v3d: Add support for submitting jobs to the TFU.

The TFU can copy from raster, UIF, and SAND input images to UIF output
images, with optional mipmap generation.  This will certainly be
useful for media EGL image input, but is also useful immediately for
mipmap generation without bogging the V3D core down.

For now we only run the queue 1 job deep, and don't have any hang
recovery (though I don't think we should need it, with TFU).  Queuing
multiple jobs in the HW will require synchronizing the YUV coefficient
regs updates since they don't get FIFOed with the job.

v2: Change the ioctl to IOW instead of IOWR, always set COEF0, explain
    why TFU is AUTH, clarify the syncing docs, drop the unused TFU
    interrupt regs (you're expected to use the hub's), don't take
    &bo->base for NULL bos.
v3: Fix a little whitespace alignment (noticed by checkpatch), rebase
    on drm_sched_job_cleanup() changes.

Signed-off-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Dave Emett <david.emett@broadcom.com> (v2)
Link: https://patchwork.freedesktop.org/patch/264607/
---
 drivers/gpu/drm/v3d/v3d_drv.c   |  15 +++-
 drivers/gpu/drm/v3d/v3d_drv.h   |  32 ++++++--
 drivers/gpu/drm/v3d/v3d_gem.c   | 178 +++++++++++++++++++++++++++++++++++-----
 drivers/gpu/drm/v3d/v3d_irq.c   |  12 ++-
 drivers/gpu/drm/v3d/v3d_regs.h  |  49 +++++++++++
 drivers/gpu/drm/v3d/v3d_sched.c | 147 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/v3d/v3d_trace.h |  20 +++++
 include/uapi/drm/v3d_drm.h      |  25 ++++++
 8 files changed, 427 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
index 2a85fa68ffea..f0afcec72c34 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -112,10 +112,15 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
 		return 0;
 	}
 
-	/* Any params that aren't just register reads would go here. */
 
-	DRM_DEBUG("Unknown parameter %d\n", args->param);
-	return -EINVAL;
+	switch (args->param) {
+	case DRM_V3D_PARAM_SUPPORTS_TFU:
+		args->value = 1;
+		return 0;
+	default:
+		DRM_DEBUG("Unknown parameter %d\n", args->param);
+		return -EINVAL;
+	}
 }
 
 static int
@@ -170,7 +175,8 @@ static const struct file_operations v3d_drm_fops = {
 /* DRM_AUTH is required on SUBMIT_CL for now, while we don't have GMP
  * protection between clients.  Note that render nodes would be be
  * able to submit CLs that could access BOs from clients authenticated
- * with the master node.
+ * with the master node.  The TFU doesn't use the GMP, so it would
+ * need to stay DRM_AUTH until we do buffer size/offset validation.
  */
 static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CL, v3d_submit_cl_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
@@ -179,6 +185,7 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(V3D_MMAP_BO, v3d_mmap_bo_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
 };
 
 static const struct vm_operations_struct v3d_vm_ops = {
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index cbe5be0c47eb..dcb772a19191 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -7,19 +7,18 @@
 #include <drm/drm_encoder.h>
 #include <drm/drm_gem.h>
 #include <drm/gpu_scheduler.h>
+#include "uapi/drm/v3d_drm.h"
 
 #define GMP_GRANULARITY (128 * 1024)
 
-/* Enum for each of the V3D queues.  We maintain various queue
- * tracking as an array because at some point we'll want to support
- * the TFU (texture formatting unit) as another queue.
- */
+/* Enum for each of the V3D queues. */
 enum v3d_queue {
 	V3D_BIN,
 	V3D_RENDER,
+	V3D_TFU,
 };
 
-#define V3D_MAX_QUEUES (V3D_RENDER + 1)
+#define V3D_MAX_QUEUES (V3D_TFU + 1)
 
 struct v3d_queue_state {
 	struct drm_gpu_scheduler sched;
@@ -68,6 +67,7 @@ struct v3d_dev {
 
 	struct v3d_exec_info *bin_job;
 	struct v3d_exec_info *render_job;
+	struct v3d_tfu_job *tfu_job;
 
 	struct v3d_queue_state queue[V3D_MAX_QUEUES];
 
@@ -218,6 +218,25 @@ struct v3d_exec_info {
 	u32 qma, qms, qts;
 };
 
+struct v3d_tfu_job {
+	struct drm_sched_job base;
+
+	struct drm_v3d_submit_tfu args;
+
+	/* An optional fence userspace can pass in for the job to depend on. */
+	struct dma_fence *in_fence;
+
+	/* v3d fence to be signaled by IRQ handler when the job is complete. */
+	struct dma_fence *done_fence;
+
+	struct v3d_dev *v3d;
+
+	struct kref refcount;
+
+	/* This is the array of BOs that were looked up at the start of exec. */
+	struct v3d_bo *bo[4];
+};
+
 /**
  * _wait_for - magic (register) wait macro
  *
@@ -281,9 +300,12 @@ int v3d_gem_init(struct drm_device *dev);
 void v3d_gem_destroy(struct drm_device *dev);
 int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
+int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
 int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 void v3d_exec_put(struct v3d_exec_info *exec);
+void v3d_tfu_job_put(struct v3d_tfu_job *exec);
 void v3d_reset(struct v3d_dev *v3d);
 void v3d_invalidate_caches(struct v3d_dev *v3d);
 void v3d_flush_caches(struct v3d_dev *v3d);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 1e8947c7d954..6abe2fa43306 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -207,26 +207,27 @@ v3d_flush_caches(struct v3d_dev *v3d)
 }
 
 static void
-v3d_attach_object_fences(struct v3d_exec_info *exec)
+v3d_attach_object_fences(struct v3d_bo **bos, int bo_count,
+			 struct dma_fence *fence)
 {
-	struct dma_fence *out_fence = exec->render_done_fence;
 	int i;
 
-	for (i = 0; i < exec->bo_count; i++) {
+	for (i = 0; i < bo_count; i++) {
 		/* XXX: Use shared fences for read-only objects. */
-		reservation_object_add_excl_fence(exec->bo[i]->resv, out_fence);
+		reservation_object_add_excl_fence(bos[i]->resv, fence);
 	}
 }
 
 static void
 v3d_unlock_bo_reservations(struct drm_device *dev,
-			   struct v3d_exec_info *exec,
+			   struct v3d_bo **bos,
+			   int bo_count,
 			   struct ww_acquire_ctx *acquire_ctx)
 {
 	int i;
 
-	for (i = 0; i < exec->bo_count; i++)
-		ww_mutex_unlock(&exec->bo[i]->resv->lock);
+	for (i = 0; i < bo_count; i++)
+		ww_mutex_unlock(&bos[i]->resv->lock);
 
 	ww_acquire_fini(acquire_ctx);
 }
@@ -240,7 +241,8 @@ v3d_unlock_bo_reservations(struct drm_device *dev,
  */
 static int
 v3d_lock_bo_reservations(struct drm_device *dev,
-			 struct v3d_exec_info *exec,
+			 struct v3d_bo **bos,
+			 int bo_count,
 			 struct ww_acquire_ctx *acquire_ctx)
 {
 	int contended_lock = -1;
@@ -250,7 +252,7 @@ v3d_lock_bo_reservations(struct drm_device *dev,
 
 retry:
 	if (contended_lock != -1) {
-		struct v3d_bo *bo = exec->bo[contended_lock];
+		struct v3d_bo *bo = bos[contended_lock];
 
 		ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock,
 						       acquire_ctx);
@@ -260,20 +262,20 @@ retry:
 		}
 	}
 
-	for (i = 0; i < exec->bo_count; i++) {
+	for (i = 0; i < bo_count; i++) {
 		if (i == contended_lock)
 			continue;
 
-		ret = ww_mutex_lock_interruptible(&exec->bo[i]->resv->lock,
+		ret = ww_mutex_lock_interruptible(&bos[i]->resv->lock,
 						  acquire_ctx);
 		if (ret) {
 			int j;
 
 			for (j = 0; j < i; j++)
-				ww_mutex_unlock(&exec->bo[j]->resv->lock);
+				ww_mutex_unlock(&bos[j]->resv->lock);
 
 			if (contended_lock != -1 && contended_lock >= i) {
-				struct v3d_bo *bo = exec->bo[contended_lock];
+				struct v3d_bo *bo = bos[contended_lock];
 
 				ww_mutex_unlock(&bo->resv->lock);
 			}
@@ -293,10 +295,11 @@ retry:
 	/* Reserve space for our shared (read-only) fence references,
 	 * before we commit the CL to the hardware.
 	 */
-	for (i = 0; i < exec->bo_count; i++) {
-		ret = reservation_object_reserve_shared(exec->bo[i]->resv, 1);
+	for (i = 0; i < bo_count; i++) {
+		ret = reservation_object_reserve_shared(bos[i]->resv, 1);
 		if (ret) {
-			v3d_unlock_bo_reservations(dev, exec, acquire_ctx);
+			v3d_unlock_bo_reservations(dev, bos, bo_count,
+						   acquire_ctx);
 			return ret;
 		}
 	}
@@ -419,6 +422,33 @@ void v3d_exec_put(struct v3d_exec_info *exec)
 	kref_put(&exec->refcount, v3d_exec_cleanup);
 }
 
+static void
+v3d_tfu_job_cleanup(struct kref *ref)
+{
+	struct v3d_tfu_job *job = container_of(ref, struct v3d_tfu_job,
+					       refcount);
+	struct v3d_dev *v3d = job->v3d;
+	unsigned int i;
+
+	dma_fence_put(job->in_fence);
+	dma_fence_put(job->done_fence);
+
+	for (i = 0; i < ARRAY_SIZE(job->bo); i++) {
+		if (job->bo[i])
+			drm_gem_object_put_unlocked(&job->bo[i]->base);
+	}
+
+	pm_runtime_mark_last_busy(v3d->dev);
+	pm_runtime_put_autosuspend(v3d->dev);
+
+	kfree(job);
+}
+
+void v3d_tfu_job_put(struct v3d_tfu_job *job)
+{
+	kref_put(&job->refcount, v3d_tfu_job_cleanup);
+}
+
 int
 v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
 		  struct drm_file *file_priv)
@@ -536,7 +566,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
-	ret = v3d_lock_bo_reservations(dev, exec, &acquire_ctx);
+	ret = v3d_lock_bo_reservations(dev, exec->bo, exec->bo_count,
+				       &acquire_ctx);
 	if (ret)
 		goto fail;
 
@@ -570,9 +601,10 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 				  &v3d_priv->sched_entity[V3D_RENDER]);
 	mutex_unlock(&v3d->sched_lock);
 
-	v3d_attach_object_fences(exec);
+	v3d_attach_object_fences(exec->bo, exec->bo_count,
+				 exec->render_done_fence);
 
-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
+	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 
 	/* Update the return sync object for the */
 	sync_out = drm_syncobj_find(file_priv, args->out_sync);
@@ -588,13 +620,119 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 
 fail_unreserve:
 	mutex_unlock(&v3d->sched_lock);
-	v3d_unlock_bo_reservations(dev, exec, &acquire_ctx);
+	v3d_unlock_bo_reservations(dev, exec->bo, exec->bo_count, &acquire_ctx);
 fail:
 	v3d_exec_put(exec);
 
 	return ret;
 }
 
+/**
+ * v3d_submit_tfu_ioctl() - Submits a TFU (texture formatting) job to the V3D.
+ * @dev: DRM device
+ * @data: ioctl argument
+ * @file_priv: DRM file for this fd
+ *
+ * Userspace provides the register setup for the TFU, which we don't
+ * need to validate since the TFU is behind the MMU.
+ */
+int
+v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
+		     struct drm_file *file_priv)
+{
+	struct v3d_dev *v3d = to_v3d_dev(dev);
+	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
+	struct drm_v3d_submit_tfu *args = data;
+	struct v3d_tfu_job *job;
+	struct ww_acquire_ctx acquire_ctx;
+	struct drm_syncobj *sync_out;
+	struct dma_fence *sched_done_fence;
+	int ret = 0;
+	int bo_count;
+
+	job = kcalloc(1, sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return -ENOMEM;
+
+	ret = pm_runtime_get_sync(v3d->dev);
+	if (ret < 0) {
+		kfree(job);
+		return ret;
+	}
+
+	kref_init(&job->refcount);
+
+	ret = drm_syncobj_find_fence(file_priv, args->in_sync,
+				     0, 0, &job->in_fence);
+	if (ret == -EINVAL)
+		goto fail;
+
+	job->args = *args;
+	job->v3d = v3d;
+
+	spin_lock(&file_priv->table_lock);
+	for (bo_count = 0; bo_count < ARRAY_SIZE(job->bo); bo_count++) {
+		struct drm_gem_object *bo;
+
+		if (!args->bo_handles[bo_count])
+			break;
+
+		bo = idr_find(&file_priv->object_idr,
+			      args->bo_handles[bo_count]);
+		if (!bo) {
+			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
+				  bo_count, args->bo_handles[bo_count]);
+			ret = -ENOENT;
+			spin_unlock(&file_priv->table_lock);
+			goto fail;
+		}
+		drm_gem_object_get(bo);
+		job->bo[bo_count] = to_v3d_bo(bo);
+	}
+	spin_unlock(&file_priv->table_lock);
+
+	ret = v3d_lock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+	if (ret)
+		goto fail;
+
+	mutex_lock(&v3d->sched_lock);
+	ret = drm_sched_job_init(&job->base,
+				 &v3d_priv->sched_entity[V3D_TFU],
+				 v3d_priv);
+	if (ret)
+		goto fail_unreserve;
+
+	sched_done_fence = dma_fence_get(&job->base.s_fence->finished);
+
+	kref_get(&job->refcount); /* put by scheduler job completion */
+	drm_sched_entity_push_job(&job->base, &v3d_priv->sched_entity[V3D_TFU]);
+	mutex_unlock(&v3d->sched_lock);
+
+	v3d_attach_object_fences(job->bo, bo_count, sched_done_fence);
+
+	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+
+	/* Update the return sync object */
+	sync_out = drm_syncobj_find(file_priv, args->out_sync);
+	if (sync_out) {
+		drm_syncobj_replace_fence(sync_out, 0, sched_done_fence);
+		drm_syncobj_put(sync_out);
+	}
+	dma_fence_put(sched_done_fence);
+
+	v3d_tfu_job_put(job);
+
+	return 0;
+
+fail_unreserve:
+	mutex_unlock(&v3d->sched_lock);
+	v3d_unlock_bo_reservations(dev, job->bo, bo_count, &acquire_ctx);
+fail:
+	v3d_tfu_job_put(job);
+
+	return ret;
+}
+
 int
 v3d_gem_init(struct drm_device *dev)
 {
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c
index e07514eb11b5..dd7a7b0bd5a1 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -4,8 +4,8 @@
 /**
  * DOC: Interrupt management for the V3D engine
  *
- * When we take a binning or rendering flush done interrupt, we need
- * to signal the fence for that job so that the scheduler can queue up
+ * When we take a bin, render, or TFU done interrupt, we need to
+ * signal the fence for that job so that the scheduler can queue up
  * the next one and unblock any waiters.
  *
  * When we take the binner out of memory interrupt, we need to
@@ -23,7 +23,8 @@
 
 #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV |	\
 			    V3D_HUB_INT_MMU_PTI |	\
-			    V3D_HUB_INT_MMU_CAP))
+			    V3D_HUB_INT_MMU_CAP |	\
+			    V3D_HUB_INT_TFUC))
 
 static void
 v3d_overflow_mem_work(struct work_struct *work)
@@ -117,6 +118,11 @@ v3d_hub_irq(int irq, void *arg)
 	/* Acknowledge the interrupts we're handling here. */
 	V3D_WRITE(V3D_HUB_INT_CLR, intsts);
 
+	if (intsts & V3D_HUB_INT_TFUC) {
+		dma_fence_signal(v3d->tfu_job->done_fence);
+		status = IRQ_HANDLED;
+	}
+
 	if (intsts & (V3D_HUB_INT_MMU_WRV |
 		      V3D_HUB_INT_MMU_PTI |
 		      V3D_HUB_INT_MMU_CAP)) {
diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
index c3a5e4e44f73..6ccdee9d47bd 100644
--- a/drivers/gpu/drm/v3d/v3d_regs.h
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
@@ -86,6 +86,55 @@
 # define V3D_TOP_GR_BRIDGE_SW_INIT_1                   0x0000c
 # define V3D_TOP_GR_BRIDGE_SW_INIT_1_V3D_CLK_108_SW_INIT BIT(0)
 
+#define V3D_TFU_CS                                     0x00400
+/* Stops current job, empties input fifo. */
+# define V3D_TFU_CS_TFURST                             BIT(31)
+# define V3D_TFU_CS_CVTCT_MASK                         V3D_MASK(23, 16)
+# define V3D_TFU_CS_CVTCT_SHIFT                        16
+# define V3D_TFU_CS_NFREE_MASK                         V3D_MASK(13, 8)
+# define V3D_TFU_CS_NFREE_SHIFT                        8
+# define V3D_TFU_CS_BUSY                               BIT(0)
+
+#define V3D_TFU_SU                                     0x00404
+/* Interrupt when FINTTHR input slots are free (0 = disabled) */
+# define V3D_TFU_SU_FINTTHR_MASK                       V3D_MASK(13, 8)
+# define V3D_TFU_SU_FINTTHR_SHIFT                      8
+/* Skips resetting the CRC at the start of CRC generation. */
+# define V3D_TFU_SU_CRCCHAIN                           BIT(4)
+/* skips writes, computes CRC of the image.  miplevels must be 0. */
+# define V3D_TFU_SU_CRC                                BIT(3)
+# define V3D_TFU_SU_THROTTLE_MASK                      V3D_MASK(1, 0)
+# define V3D_TFU_SU_THROTTLE_SHIFT                     0
+
+#define V3D_TFU_ICFG                                   0x00408
+/* Interrupt when the conversion is complete. */
+# define V3D_TFU_ICFG_IOC                              BIT(0)
+
+/* Input Image Address */
+#define V3D_TFU_IIA                                    0x0040c
+/* Input Chroma Address */
+#define V3D_TFU_ICA                                    0x00410
+/* Input Image Stride */
+#define V3D_TFU_IIS                                    0x00414
+/* Input Image U-Plane Address */
+#define V3D_TFU_IUA                                    0x00418
+/* Output Image Address */
+#define V3D_TFU_IOA                                    0x0041c
+/* Image Output Size */
+#define V3D_TFU_IOS                                    0x00420
+/* TFU YUV Coefficient 0 */
+#define V3D_TFU_COEF0                                  0x00424
+/* Use these regs instead of the defaults. */
+# define V3D_TFU_COEF0_USECOEF                         BIT(31)
+/* TFU YUV Coefficient 1 */
+#define V3D_TFU_COEF1                                  0x00428
+/* TFU YUV Coefficient 2 */
+#define V3D_TFU_COEF2                                  0x0042c
+/* TFU YUV Coefficient 3 */
+#define V3D_TFU_COEF3                                  0x00430
+
+#define V3D_TFU_CRC                                    0x00434
+
 /* Per-MMU registers. */
 
 #define V3D_MMUC_CONTROL                               0x01000
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index c66d0ce21435..f7508e907536 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -30,6 +30,12 @@ to_v3d_job(struct drm_sched_job *sched_job)
 	return container_of(sched_job, struct v3d_job, base);
 }
 
+static struct v3d_tfu_job *
+to_tfu_job(struct drm_sched_job *sched_job)
+{
+	return container_of(sched_job, struct v3d_tfu_job, base);
+}
+
 static void
 v3d_job_free(struct drm_sched_job *sched_job)
 {
@@ -40,6 +46,16 @@ v3d_job_free(struct drm_sched_job *sched_job)
 	v3d_exec_put(job->exec);
 }
 
+static void
+v3d_tfu_job_free(struct drm_sched_job *sched_job)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+
+	drm_sched_job_cleanup(sched_job);
+
+	v3d_tfu_job_put(job);
+}
+
 /**
  * Returns the fences that the bin or render job depends on, one by one.
  * v3d_job_run() won't be called until all of them have been signaled.
@@ -78,6 +94,27 @@ v3d_job_dependency(struct drm_sched_job *sched_job,
 	return fence;
 }
 
+/**
+ * Returns the fences that the TFU job depends on, one by one.
+ * v3d_tfu_job_run() won't be called until all of them have been
+ * signaled.
+ */
+static struct dma_fence *
+v3d_tfu_job_dependency(struct drm_sched_job *sched_job,
+		       struct drm_sched_entity *s_entity)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+	struct dma_fence *fence;
+
+	fence = job->in_fence;
+	if (fence) {
+		job->in_fence = NULL;
+		return fence;
+	}
+
+	return NULL;
+}
+
 static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_job *job = to_v3d_job(sched_job);
@@ -149,28 +186,47 @@ static struct dma_fence *v3d_job_run(struct drm_sched_job *sched_job)
 	return fence;
 }
 
-static void
-v3d_job_timedout(struct drm_sched_job *sched_job)
+static struct dma_fence *
+v3d_tfu_job_run(struct drm_sched_job *sched_job)
 {
-	struct v3d_job *job = to_v3d_job(sched_job);
-	struct v3d_exec_info *exec = job->exec;
-	struct v3d_dev *v3d = exec->v3d;
-	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
-	enum v3d_queue q;
-	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
-	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+	struct v3d_dev *v3d = job->v3d;
+	struct drm_device *dev = &v3d->drm;
+	struct dma_fence *fence;
 
-	/* If the current address or return address have changed, then
-	 * the GPU has probably made progress and we should delay the
-	 * reset.  This could fail if the GPU got in an infinite loop
-	 * in the CL, but that is pretty unlikely outside of an i-g-t
-	 * testcase.
-	 */
-	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
-		job->timedout_ctca = ctca;
-		job->timedout_ctra = ctra;
-		return;
+	fence = v3d_fence_create(v3d, V3D_TFU);
+	if (IS_ERR(fence))
+		return NULL;
+
+	v3d->tfu_job = job;
+	if (job->done_fence)
+		dma_fence_put(job->done_fence);
+	job->done_fence = dma_fence_get(fence);
+
+	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
+
+	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
+	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
+	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
+	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
+	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
+	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
+	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
+	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
+		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
+		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
+		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
 	}
+	/* ICFG kicks off the job. */
+	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
+
+	return fence;
+}
+
+static void
+v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
+{
+	enum v3d_queue q;
 
 	mutex_lock(&v3d->reset_lock);
 
@@ -195,6 +251,39 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
 	mutex_unlock(&v3d->reset_lock);
 }
 
+static void
+v3d_job_timedout(struct drm_sched_job *sched_job)
+{
+	struct v3d_job *job = to_v3d_job(sched_job);
+	struct v3d_exec_info *exec = job->exec;
+	struct v3d_dev *v3d = exec->v3d;
+	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
+	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
+	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+
+	/* If the current address or return address have changed, then
+	 * the GPU has probably made progress and we should delay the
+	 * reset.  This could fail if the GPU got in an infinite loop
+	 * in the CL, but that is pretty unlikely outside of an i-g-t
+	 * testcase.
+	 */
+	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
+		job->timedout_ctca = ctca;
+		job->timedout_ctra = ctra;
+		return;
+	}
+
+	v3d_gpu_reset_for_timeout(v3d, sched_job);
+}
+
+static void
+v3d_tfu_job_timedout(struct drm_sched_job *sched_job)
+{
+	struct v3d_tfu_job *job = to_tfu_job(sched_job);
+
+	v3d_gpu_reset_for_timeout(job->v3d, sched_job);
+}
+
 static const struct drm_sched_backend_ops v3d_sched_ops = {
 	.dependency = v3d_job_dependency,
 	.run_job = v3d_job_run,
@@ -202,6 +291,13 @@ static const struct drm_sched_backend_ops v3d_sched_ops = {
 	.free_job = v3d_job_free
 };
 
+static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
+	.dependency = v3d_tfu_job_dependency,
+	.run_job = v3d_tfu_job_run,
+	.timedout_job = v3d_tfu_job_timedout,
+	.free_job = v3d_tfu_job_free
+};
+
 int
 v3d_sched_init(struct v3d_dev *v3d)
 {
@@ -232,6 +328,19 @@ v3d_sched_init(struct v3d_dev *v3d)
 		return ret;
 	}
 
+	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
+			     &v3d_tfu_sched_ops,
+			     hw_jobs_limit, job_hang_limit,
+			     msecs_to_jiffies(hang_limit_ms),
+			     "v3d_tfu");
+	if (ret) {
+		dev_err(v3d->dev, "Failed to create TFU scheduler: %d.",
+			ret);
+		drm_sched_fini(&v3d->queue[V3D_RENDER].sched);
+		drm_sched_fini(&v3d->queue[V3D_BIN].sched);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/v3d/v3d_trace.h b/drivers/gpu/drm/v3d/v3d_trace.h
index 85dd351e1e09..f54ed9cd3444 100644
--- a/drivers/gpu/drm/v3d/v3d_trace.h
+++ b/drivers/gpu/drm/v3d/v3d_trace.h
@@ -42,6 +42,26 @@ TRACE_EVENT(v3d_submit_cl,
 		      __entry->ctnqea)
 );
 
+TRACE_EVENT(v3d_submit_tfu,
+	    TP_PROTO(struct drm_device *dev,
+		     uint64_t seqno),
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu",
+		      __entry->dev,
+		      __entry->seqno)
+);
+
 TRACE_EVENT(v3d_reset_begin,
 	    TP_PROTO(struct drm_device *dev),
 	    TP_ARGS(dev),
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
index b1e5de076b0f..35c7d813c66e 100644
--- a/include/uapi/drm/v3d_drm.h
+++ b/include/uapi/drm/v3d_drm.h
@@ -36,6 +36,7 @@ extern "C" {
 #define DRM_V3D_MMAP_BO                           0x03
 #define DRM_V3D_GET_PARAM                         0x04
 #define DRM_V3D_GET_BO_OFFSET                     0x05
+#define DRM_V3D_SUBMIT_TFU                        0x06
 
 #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
 #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
@@ -43,6 +44,7 @@ extern "C" {
 #define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
 #define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
 #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
+#define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
 
 /**
  * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
@@ -179,6 +181,7 @@ enum drm_v3d_param {
 	DRM_V3D_PARAM_V3D_CORE0_IDENT0,
 	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
 	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
+	DRM_V3D_PARAM_SUPPORTS_TFU,
 };
 
 struct drm_v3d_get_param {
@@ -197,6 +200,28 @@ struct drm_v3d_get_bo_offset {
 	__u32 offset;
 };
 
+struct drm_v3d_submit_tfu {
+	__u32 icfg;
+	__u32 iia;
+	__u32 iis;
+	__u32 ica;
+	__u32 iua;
+	__u32 ioa;
+	__u32 ios;
+	__u32 coef[4];
+	/* First handle is the output BO, following are other inputs.
+	 * 0 for unused.
+	 */
+	__u32 bo_handles[4];
+	/* sync object to block on before running the TFU job.  Each TFU
+	 * job will execute in the order submitted to its FD.  Synchronization
+	 * against rendering jobs requires using sync objects.
+	 */
+	__u32 in_sync;
+	/* Sync object to signal when the TFU job is done. */
+	__u32 out_sync;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
cgit v1.2.3


From 078dec3326e2244c62e8a8d970ba24359e3464be Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Mon, 3 Dec 2018 13:36:14 +0100
Subject: dma-buf: add dma_fence_get_stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract of useful code from the timeline work. This provides a function
to return a stub or dummy fence which is always signaled.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Link: https://patchwork.freedesktop.org/patch/265248/
---
 drivers/dma-buf/dma-fence.c | 36 +++++++++++++++++++++++++++++++++++-
 include/linux/dma-fence.h   |  1 +
 2 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 1551ca7df394..136ec04d683f 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -30,13 +30,16 @@
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_emit);
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_enable_signal);
 
+static DEFINE_SPINLOCK(dma_fence_stub_lock);
+static struct dma_fence dma_fence_stub;
+
 /*
  * fence context counter: each execution context should have its own
  * fence context, this allows checking if fences belong to the same
  * context or not. One device can have multiple separate contexts,
  * and they're used if some engine can run independently of another.
  */
-static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(0);
+static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1);
 
 /**
  * DOC: DMA fences overview
@@ -68,6 +71,37 @@ static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(0);
  *   &dma_buf.resv pointer.
  */
 
+static const char *dma_fence_stub_get_name(struct dma_fence *fence)
+{
+        return "stub";
+}
+
+static const struct dma_fence_ops dma_fence_stub_ops = {
+	.get_driver_name = dma_fence_stub_get_name,
+	.get_timeline_name = dma_fence_stub_get_name,
+};
+
+/**
+ * dma_fence_get_stub - return a signaled fence
+ *
+ * Return a stub fence which is already signaled.
+ */
+struct dma_fence *dma_fence_get_stub(void)
+{
+	spin_lock(&dma_fence_stub_lock);
+	if (!dma_fence_stub.ops) {
+		dma_fence_init(&dma_fence_stub,
+			       &dma_fence_stub_ops,
+			       &dma_fence_stub_lock,
+			       0, 0);
+		dma_fence_signal_locked(&dma_fence_stub);
+	}
+	spin_unlock(&dma_fence_stub_lock);
+
+	return dma_fence_get(&dma_fence_stub);
+}
+EXPORT_SYMBOL(dma_fence_get_stub);
+
 /**
  * dma_fence_context_alloc - allocate an array of fence contexts
  * @num: amount of contexts to allocate
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 02dba8cd033d..999e4b104410 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -541,6 +541,7 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr)
 	return ret < 0 ? ret : 0;
 }
 
+struct dma_fence *dma_fence_get_stub(void);
 u64 dma_fence_context_alloc(unsigned num);
 
 #define DMA_FENCE_TRACE(f, fmt, args...) \
-- 
cgit v1.2.3


From 0b258ed1a219a9776e8f6967eb34837ae0332e64 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Wed, 14 Nov 2018 14:24:27 +0100
Subject: drm: revert "expand replace_fence to support timeline point v2"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 9a09a42369a4a37a959c051d8e1a1f948c1529a4.

The whole interface isn't thought through. Since this function can't
fail we actually can't allocate an object to store the sync point.

Sorry, I should have taken the lead on this from the very beginning and
reviewed it more thoughtfully. Going to propose a new interface as a
follow up change.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Link: https://patchwork.freedesktop.org/patch/265580/
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  2 +-
 drivers/gpu/drm/drm_syncobj.c              | 14 ++++++--------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  2 +-
 drivers/gpu/drm/v3d/v3d_gem.c              |  3 +--
 drivers/gpu/drm/vc4/vc4_gem.c              |  2 +-
 include/drm/drm_syncobj.h                  |  2 +-
 6 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 024dfbd87f11..dc54e9efd910 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1193,7 +1193,7 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
 	int i;
 
 	for (i = 0; i < p->num_post_dep_syncobjs; ++i)
-		drm_syncobj_replace_fence(p->post_dep_syncobjs[i], 0, p->fence);
+		drm_syncobj_replace_fence(p->post_dep_syncobjs[i], p->fence);
 }
 
 static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 5c5ba1f14307..db30a0e89db8 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -140,13 +140,11 @@ void drm_syncobj_remove_callback(struct drm_syncobj *syncobj,
 /**
  * drm_syncobj_replace_fence - replace fence in a sync object.
  * @syncobj: Sync object to replace fence in
- * @point: timeline point
  * @fence: fence to install in sync file.
  *
- * This replaces the fence on a sync object, or a timeline point fence.
+ * This replaces the fence on a sync object.
  */
 void drm_syncobj_replace_fence(struct drm_syncobj *syncobj,
-			       u64 point,
 			       struct dma_fence *fence)
 {
 	struct dma_fence *old_fence;
@@ -184,7 +182,7 @@ static void drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
 {
 	struct dma_fence *fence = dma_fence_get_stub();
 
-	drm_syncobj_replace_fence(syncobj, 0, fence);
+	drm_syncobj_replace_fence(syncobj, fence);
 	dma_fence_put(fence);
 }
 
@@ -233,7 +231,7 @@ void drm_syncobj_free(struct kref *kref)
 	struct drm_syncobj *syncobj = container_of(kref,
 						   struct drm_syncobj,
 						   refcount);
-	drm_syncobj_replace_fence(syncobj, 0, NULL);
+	drm_syncobj_replace_fence(syncobj, NULL);
 	kfree(syncobj);
 }
 EXPORT_SYMBOL(drm_syncobj_free);
@@ -267,7 +265,7 @@ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 		drm_syncobj_assign_null_handle(syncobj);
 
 	if (fence)
-		drm_syncobj_replace_fence(syncobj, 0, fence);
+		drm_syncobj_replace_fence(syncobj, fence);
 
 	*out_syncobj = syncobj;
 	return 0;
@@ -452,7 +450,7 @@ static int drm_syncobj_import_sync_file_fence(struct drm_file *file_private,
 		return -ENOENT;
 	}
 
-	drm_syncobj_replace_fence(syncobj, 0, fence);
+	drm_syncobj_replace_fence(syncobj, fence);
 	dma_fence_put(fence);
 	drm_syncobj_put(syncobj);
 	return 0;
@@ -923,7 +921,7 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void *data,
 		return ret;
 
 	for (i = 0; i < args->count_handles; i++)
-		drm_syncobj_replace_fence(syncobjs[i], 0, NULL);
+		drm_syncobj_replace_fence(syncobjs[i], NULL);
 
 	drm_syncobj_array_free(syncobjs, args->count_handles);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d4fac09095f8..10a4afb4f235 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2191,7 +2191,7 @@ signal_fence_array(struct i915_execbuffer *eb,
 		if (!(flags & I915_EXEC_FENCE_SIGNAL))
 			continue;
 
-		drm_syncobj_replace_fence(syncobj, 0, fence);
+		drm_syncobj_replace_fence(syncobj, fence);
 	}
 }
 
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index f565b197cba9..cb99e53f7607 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -609,8 +609,7 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
 	/* Update the return sync object for the */
 	sync_out = drm_syncobj_find(file_priv, args->out_sync);
 	if (sync_out) {
-		drm_syncobj_replace_fence(sync_out, 0,
-					  exec->render_done_fence);
+		drm_syncobj_replace_fence(sync_out, exec->render_done_fence);
 		drm_syncobj_put(sync_out);
 	}
 
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 41881ce4132d..aea2b8dfec17 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -681,7 +681,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
 	exec->fence = &fence->base;
 
 	if (out_sync)
-		drm_syncobj_replace_fence(out_sync, 0, exec->fence);
+		drm_syncobj_replace_fence(out_sync, exec->fence);
 
 	vc4_update_bo_seqnos(exec, seqno);
 
diff --git a/include/drm/drm_syncobj.h b/include/drm/drm_syncobj.h
index 2eda44def639..b1fe921f8e8f 100644
--- a/include/drm/drm_syncobj.h
+++ b/include/drm/drm_syncobj.h
@@ -131,7 +131,7 @@ drm_syncobj_fence_get(struct drm_syncobj *syncobj)
 
 struct drm_syncobj *drm_syncobj_find(struct drm_file *file_private,
 				     u32 handle);
-void drm_syncobj_replace_fence(struct drm_syncobj *syncobj, u64 point,
+void drm_syncobj_replace_fence(struct drm_syncobj *syncobj,
 			       struct dma_fence *fence);
 int drm_syncobj_find_fence(struct drm_file *file_private,
 			   u32 handle, u64 point, u64 flags,
-- 
cgit v1.2.3