1 files changed, 70 insertions, 11 deletions
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 638540943c61..2107b0daf8ef 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -146,7 +146,7 @@ vc4_save_hang_state(struct drm_device *dev)
 	struct vc4_exec_info *exec[2];
 	struct vc4_bo *bo;
 	unsigned long irqflags;
-	unsigned int i, j, unref_list_count, prev_idx;
+	unsigned int i, j, k, unref_list_count;
 
 	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
 	if (!kernel_state)
@@ -182,7 +182,7 @@ vc4_save_hang_state(struct drm_device *dev)
 		return;
 	}
 
-	prev_idx = 0;
+	k = 0;
 	for (i = 0; i < 2; i++) {
 		if (!exec[i])
 			continue;
@@ -197,7 +197,7 @@ vc4_save_hang_state(struct drm_device *dev)
 			WARN_ON(!refcount_read(&bo->usecnt));
 			refcount_inc(&bo->usecnt);
 			drm_gem_object_get(&exec[i]->bo[j]->base);
-			kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
+			kernel_state->bo[k++] = &exec[i]->bo[j]->base;
 		}
 
 		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
@@ -205,12 +205,12 @@ vc4_save_hang_state(struct drm_device *dev)
 			 * because they are naturally unpurgeable.
 			 */
 			drm_gem_object_get(&bo->base.base);
-			kernel_state->bo[j + prev_idx] = &bo->base.base;
-			j++;
+			kernel_state->bo[k++] = &bo->base.base;
 		}
-		prev_idx = j + 1;
 	}
 
+	WARN_ON_ONCE(k != state->bo_count);
+
 	if (exec[0])
 		state->start_bin = exec[0]->ct0ca;
 	if (exec[1])
@@ -436,6 +436,19 @@ vc4_flush_caches(struct drm_device *dev)
 		  VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC));
 }
 
+static void
+vc4_flush_texture_caches(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	V3D_WRITE(V3D_L2CACTL,
+		  V3D_L2CACTL_L2CCLR);
+
+	V3D_WRITE(V3D_SLCACTL,
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC));
+}
+
 /* Sets the registers for the next job to be actually be executed in
  * the hardware.
  *
@@ -454,14 +467,30 @@ again:
 
 	vc4_flush_caches(dev);
 
+	/* Only start the perfmon if it was not already started by a previous
+	 * job.
+	 */
+	if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
+		vc4_perfmon_start(vc4, exec->perfmon);
+
 	/* Either put the job in the binner if it uses the binner, or
 	 * immediately move it to the to-be-rendered queue.
 	 */
 	if (exec->ct0ca != exec->ct0ea) {
 		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
 	} else {
+		struct vc4_exec_info *next;
+
 		vc4_move_job_to_render(dev, exec);
-		goto again;
+		next = vc4_first_bin_job(vc4);
+
+		/* We can't start the next bin job if the previous job had a
+		 * different perfmon instance attached to it. The same goes
+		 * if one of them had a perfmon attached to it and the other
+		 * one doesn't.
+		 */
+		if (next && next->perfmon == exec->perfmon)
+			goto again;
 	}
 }
 
@@ -474,6 +503,14 @@ vc4_submit_next_render_job(struct drm_device *dev)
 	if (!exec)
 		return;
 
+	/* A previous RCL may have written to one of our textures, and
+	 * our full cache flush at bin time may have occurred before
+	 * that RCL completed.  Flush the texture cache now, but not
+	 * the instructions or uniforms (since we don't write those
+	 * from an RCL).
+	 */
+	vc4_flush_texture_caches(dev);
+
 	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
 }
 
@@ -621,6 +658,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
 		 struct ww_acquire_ctx *acquire_ctx)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *renderjob;
 	uint64_t seqno;
 	unsigned long irqflags;
 	struct vc4_fence *fence;
@@ -646,11 +684,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
 
 	list_add_tail(&exec->head, &vc4->bin_job_list);
 
-	/* If no job was executing, kick ours off.  Otherwise, it'll
-	 * get started when the previous job's flush done interrupt
-	 * occurs.
+	/* If no bin job was executing and if the render job (if any) has the
+	 * same perfmon as our job attached to it (or if both jobs don't have
+	 * perfmon activated), then kick ours off.  Otherwise, it'll get
+	 * started when the previous job's flush/render done interrupt occurs.
 	 */
-	if (vc4_first_bin_job(vc4) == exec) {
+	renderjob = vc4_first_render_job(vc4);
+	if (vc4_first_bin_job(vc4) == exec &&
+	    (!renderjob || renderjob->perfmon == exec->perfmon)) {
 		vc4_submit_next_bin_job(dev);
 		vc4_queue_hangcheck(dev);
 	}
@@ -915,6 +956,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 	vc4->bin_alloc_used &= ~exec->bin_slots;
 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 
+	/* Release the reference we had on the perf monitor. */
+	vc4_perfmon_put(exec->perfmon);
+
 	mutex_lock(&vc4->power_lock);
 	if (--vc4->power_refcount == 0) {
 		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1067,6 +1111,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_file *vc4file = file_priv->driver_priv;
 	struct drm_vc4_submit_cl *args = data;
 	struct vc4_exec_info *exec;
 	struct ww_acquire_ctx acquire_ctx;
@@ -1080,6 +1125,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	if (args->pad2 != 0) {
+		DRM_DEBUG("->pad2 must be set to zero\n");
+		return -EINVAL;
+	}
+
 	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
 	if (!exec) {
 		DRM_ERROR("malloc failure on exec struct\n");
@@ -1105,6 +1155,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
+	if (args->perfmonid) {
+		exec->perfmon = vc4_perfmon_find(vc4file,
+						 args->perfmonid);
+		if (!exec->perfmon) {
+			ret = -ENOENT;
+			goto fail;
+		}
+	}
+
 	if (exec->args->bin_cl_size != 0) {
 		ret = vc4_get_bcl(dev, exec);
 		if (ret)