Diffstat (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem_execbuffer.c | 640 |
1 file changed, 358 insertions(+), 282 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e9503f6d1100..4c2016237d61 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
+#include <drm/drm_syncobj.h>
 #include <drm/i915_drm.h>
 
 #include "i915_drv.h"
@@ -191,6 +192,8 @@ struct i915_execbuffer {
 	struct drm_file *file; /** per-file lookup tables and limits */
 	struct drm_i915_gem_execbuffer2 *args; /** ioctl parameters */
 	struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
+	struct i915_vma **vma;
+	unsigned int *flags;
 
 	struct intel_engine_cs *engine; /** engine to queue the request to */
 	struct i915_gem_context *ctx; /** context for building the request */
@@ -244,13 +247,7 @@ struct i915_execbuffer {
 	struct hlist_head *buckets; /** ht for relocation handles */
 };
 
-/*
- * As an alternative to creating a hashtable of handle-to-vma for a batch,
- * we used the last available reserved field in the execobject[] and stash
- * a link from the execobj to its vma.
- */
-#define __exec_to_vma(ee) (ee)->rsvd2
-#define exec_to_vma(ee) u64_to_ptr(struct i915_vma, __exec_to_vma(ee))
+#define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])
 
 /*
  * Used to convert any address to canonical form.
@@ -319,85 +316,82 @@ static int eb_create(struct i915_execbuffer *eb)
 
 static bool
 eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
-		 const struct i915_vma *vma)
+		 const struct i915_vma *vma,
+		 unsigned int flags)
 {
-	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
-		return true;
-
 	if (vma->node.size < entry->pad_to_size)
 		return true;
 
 	if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
 		return true;
 
-	if (entry->flags & EXEC_OBJECT_PINNED &&
+	if (flags & EXEC_OBJECT_PINNED &&
 	    vma->node.start != entry->offset)
 		return true;
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	if (flags & __EXEC_OBJECT_NEEDS_BIAS &&
 	    vma->node.start < BATCH_OFFSET_BIAS)
 		return true;
 
-	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
+	if (!(flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
 	    (vma->node.start + vma->node.size - 1) >> 32)
 		return true;
 
 	return false;
}
 
-static inline void
+static inline bool
 eb_pin_vma(struct i915_execbuffer *eb,
-	   struct drm_i915_gem_exec_object2 *entry,
+	   const struct drm_i915_gem_exec_object2 *entry,
 	   struct i915_vma *vma)
 {
-	u64 flags;
+	unsigned int exec_flags = *vma->exec_flags;
+	u64 pin_flags;
 
 	if (vma->node.size)
-		flags = vma->node.start;
+		pin_flags = vma->node.start;
 	else
-		flags = entry->offset & PIN_OFFSET_MASK;
+		pin_flags = entry->offset & PIN_OFFSET_MASK;
 
-	flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_GTT))
-		flags |= PIN_GLOBAL;
+	pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_GTT))
+		pin_flags |= PIN_GLOBAL;
 
-	if (unlikely(i915_vma_pin(vma, 0, 0, flags)))
-		return;
+	if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags)))
+		return false;
 
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) {
 		if (unlikely(i915_vma_get_fence(vma))) {
 			i915_vma_unpin(vma);
-			return;
+			return false;
 		}
 
 		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+			exec_flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
+	*vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN;
+	return !eb_vma_misplaced(entry, vma, exec_flags);
 }
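Note: with the execobject array now shadowed by two parallel arrays (eb->vma[] and eb->flags[]), the old rsvd2 back-pointer trick is gone; exec_entry() instead recovers the array index by pointer subtraction, since each vma stores a pointer into eb->flags[]. A standalone sketch of that back-mapping, using simplified stand-in types rather than the real kernel structs:

#include <assert.h>
#include <stdio.h>

struct exec_object { unsigned long long offset; unsigned int flags; };
struct vma { unsigned int *exec_flags; }; /* points into eb.flags[] */

struct eb {
	struct exec_object exec[4];
	struct vma *vma[4];
	unsigned int flags[4];
};

/* same shape as the macro introduced by this patch */
#define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])

int main(void)
{
	struct eb eb = {0};
	struct vma v;

	eb.vma[2] = &v;
	eb.flags[2] = 0x2;		/* some EXEC_OBJECT_* bit */
	v.exec_flags = &eb.flags[2];	/* stashed at eb_add_vma() time */

	/* pointer subtraction recovers index 2, hence the right entry */
	assert(exec_entry(&eb, &v) == &eb.exec[2]);
	printf("entry index = %td\n", v.exec_flags - eb.flags);
	return 0;
}

Keeping the hot per-object state in a packed unsigned int array, rather than strewn across the large execobject structs, is the struct-of-arrays payoff: the reservation loops above touch only eb->flags[].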
-static inline void
-__eb_unreserve_vma(struct i915_vma *vma,
-		   const struct drm_i915_gem_exec_object2 *entry)
+static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags)
 {
-	GEM_BUG_ON(!(entry->flags & __EXEC_OBJECT_HAS_PIN));
+	GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN));
 
-	if (unlikely(entry->flags & __EXEC_OBJECT_HAS_FENCE))
+	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE))
 		i915_vma_unpin_fence(vma);
 
 	__i915_vma_unpin(vma);
 }
 
 static inline void
-eb_unreserve_vma(struct i915_vma *vma,
-		 struct drm_i915_gem_exec_object2 *entry)
+eb_unreserve_vma(struct i915_vma *vma, unsigned int *flags)
 {
-	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
+	if (!(*flags & __EXEC_OBJECT_HAS_PIN))
 		return;
 
-	__eb_unreserve_vma(vma, entry);
-	entry->flags &= ~__EXEC_OBJECT_RESERVED;
+	__eb_unreserve_vma(vma, *flags);
+	*flags &= ~__EXEC_OBJECT_RESERVED;
 }
 
 static int
@@ -427,7 +421,7 @@ eb_validate_vma(struct i915_execbuffer *eb,
 		entry->pad_to_size = 0;
 	}
 
-	if (unlikely(vma->exec_entry)) {
+	if (unlikely(vma->exec_flags)) {
 		DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
 			  entry->handle, (int)(entry - eb->exec));
 		return -EINVAL;
@@ -440,14 +434,25 @@ eb_validate_vma(struct i915_execbuffer *eb,
 	 */
 	entry->offset = gen8_noncanonical_addr(entry->offset);
 
+	if (!eb->reloc_cache.has_fence) {
+		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
+	} else {
+		if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
+		     eb->reloc_cache.needs_unfenced) &&
+		    i915_gem_object_is_tiled(vma->obj))
+			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
+	}
+
+	if (!(entry->flags & EXEC_OBJECT_PINNED))
+		entry->flags |= eb->context_flags;
+
 	return 0;
 }
 
 static int
-eb_add_vma(struct i915_execbuffer *eb,
-	   struct drm_i915_gem_exec_object2 *entry,
-	   struct i915_vma *vma)
+eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma)
 {
+	struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
 	int err;
 
 	GEM_BUG_ON(i915_vma_is_closed(vma));
@@ -468,40 +473,28 @@ eb_add_vma(struct i915_execbuffer *eb,
 	if (entry->relocation_count)
 		list_add_tail(&vma->reloc_link, &eb->relocs);
 
-	if (!eb->reloc_cache.has_fence) {
-		entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
-	} else {
-		if ((entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
-		     eb->reloc_cache.needs_unfenced) &&
-		    i915_gem_object_is_tiled(vma->obj))
-			entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
-	}
-
-	if (!(entry->flags & EXEC_OBJECT_PINNED))
-		entry->flags |= eb->context_flags;
-
 	/*
 	 * Stash a pointer from the vma to execobj, so we can query its flags,
 	 * size, alignment etc as provided by the user. Also we stash a pointer
 	 * to the vma inside the execobj so that we can use a direct lookup
 	 * to find the right target VMA when doing relocations.
 	 */
-	vma->exec_entry = entry;
-	__exec_to_vma(entry) = (uintptr_t)vma;
+	eb->vma[i] = vma;
+	eb->flags[i] = entry->flags;
+	vma->exec_flags = &eb->flags[i];
 
 	err = 0;
-	eb_pin_vma(eb, entry, vma);
-	if (eb_vma_misplaced(entry, vma)) {
-		eb_unreserve_vma(vma, entry);
-
-		list_add_tail(&vma->exec_link, &eb->unbound);
-		if (drm_mm_node_allocated(&vma->node))
-			err = i915_vma_unbind(vma);
-	} else {
+	if (eb_pin_vma(eb, entry, vma)) {
 		if (entry->offset != vma->node.start) {
 			entry->offset = vma->node.start | UPDATE;
 			eb->args->flags |= __EXEC_HAS_RELOC;
 		}
+	} else {
+		eb_unreserve_vma(vma, vma->exec_flags);
+
+		list_add_tail(&vma->exec_link, &eb->unbound);
+		if (drm_mm_node_allocated(&vma->node))
+			err = i915_vma_unbind(vma);
 	}
 	return err;
 }
 
@@ -526,32 +519,35 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 
 static int eb_reserve_vma(const struct i915_execbuffer *eb,
 			  struct i915_vma *vma)
 {
-	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-	u64 flags;
+	struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
+	unsigned int exec_flags = *vma->exec_flags;
+	u64 pin_flags;
 	int err;
 
-	flags = PIN_USER | PIN_NONBLOCK;
-	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
-		flags |= PIN_GLOBAL;
+	pin_flags = PIN_USER | PIN_NONBLOCK;
+	if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
+		pin_flags |= PIN_GLOBAL;
 
 	/*
 	 * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
 	 * limit address to the first 4GBs for unflagged objects.
 	 */
-	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
-		flags |= PIN_ZONE_4G;
+	if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+		pin_flags |= PIN_ZONE_4G;
 
-	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
-		flags |= PIN_MAPPABLE;
+	if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
+		pin_flags |= PIN_MAPPABLE;
 
-	if (entry->flags & EXEC_OBJECT_PINNED) {
-		flags |= entry->offset | PIN_OFFSET_FIXED;
-		flags &= ~PIN_NONBLOCK; /* force overlapping PINNED checks */
-	} else if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS) {
-		flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+	if (exec_flags & EXEC_OBJECT_PINNED) {
+		pin_flags |= entry->offset | PIN_OFFSET_FIXED;
+		pin_flags &= ~PIN_NONBLOCK; /* force overlapping checks */
+	} else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS) {
+		pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
 	}
 
-	err = i915_vma_pin(vma, entry->pad_to_size, entry->alignment, flags);
+	err = i915_vma_pin(vma,
+			   entry->pad_to_size, entry->alignment,
+			   pin_flags);
 	if (err)
 		return err;
 
@@ -560,7 +556,7 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 		eb->args->flags |= __EXEC_HAS_RELOC;
 	}
 
-	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+	if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) {
 		err = i915_vma_get_fence(vma);
 		if (unlikely(err)) {
 			i915_vma_unpin(vma);
@@ -568,11 +564,11 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 		}
 
 		if (i915_vma_pin_fence(vma))
-			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
+			exec_flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
 
-	entry->flags |= __EXEC_OBJECT_HAS_PIN;
-	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
+	*vma->exec_flags = exec_flags | __EXEC_OBJECT_HAS_PIN;
+	GEM_BUG_ON(eb_vma_misplaced(entry, vma, exec_flags));
 
 	return 0;
 }
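Note: both eb_pin_vma() and eb_reserve_vma() fold a page-aligned target address and the PIN_* control bits into a single u64 for i915_vma_pin(): because the offset is page aligned, its low bits are free to carry flags, and PIN_OFFSET_MASK splits them apart again. A minimal userspace illustration of the idiom (the flag bit positions here are illustrative, not the kernel's actual values):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE        4096ull
#define PIN_OFFSET_MASK  (-(uint64_t)PAGE_SIZE)  /* high bits = offset */
#define PIN_OFFSET_FIXED (1ull << 0)             /* illustrative bit */
#define PIN_GLOBAL       (1ull << 1)             /* illustrative bit */

int main(void)
{
	uint64_t offset = 0x12345000ull;	/* page-aligned GTT address */
	uint64_t pin_flags = offset | PIN_OFFSET_FIXED | PIN_GLOBAL;

	/* the callee can split the two halves back apart losslessly */
	assert((pin_flags & PIN_OFFSET_MASK) == offset);
	assert((pin_flags & ~PIN_OFFSET_MASK) ==
	       (PIN_OFFSET_FIXED | PIN_GLOBAL));
	return 0;
}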
@@ -614,18 +610,18 @@ static int eb_reserve(struct i915_execbuffer *eb)
 		INIT_LIST_HEAD(&eb->unbound);
 		INIT_LIST_HEAD(&last);
 		for (i = 0; i < count; i++) {
-			struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
+			unsigned int flags = eb->flags[i];
+			struct i915_vma *vma = eb->vma[i];
 
-			if (entry->flags & EXEC_OBJECT_PINNED &&
-			    entry->flags & __EXEC_OBJECT_HAS_PIN)
+			if (flags & EXEC_OBJECT_PINNED &&
+			    flags & __EXEC_OBJECT_HAS_PIN)
 				continue;
 
-			vma = exec_to_vma(entry);
-			eb_unreserve_vma(vma, entry);
+			eb_unreserve_vma(vma, &eb->flags[i]);
 
-			if (entry->flags & EXEC_OBJECT_PINNED)
+			if (flags & EXEC_OBJECT_PINNED)
 				list_add(&vma->exec_link, &eb->unbound);
-			else if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
+			else if (flags & __EXEC_OBJECT_NEEDS_MAP)
 				list_add_tail(&vma->exec_link, &eb->unbound);
 			else
 				list_add_tail(&vma->exec_link, &last);
@@ -649,19 +645,6 @@ static int eb_reserve(struct i915_execbuffer *eb)
 	} while (1);
 }
 
-static inline struct hlist_head *
-ht_head(const struct i915_gem_context_vma_lut *lut, u32 handle)
-{
-	return &lut->ht[hash_32(handle, lut->ht_bits)];
-}
-
-static inline bool
-ht_needs_resize(const struct i915_gem_context_vma_lut *lut)
-{
-	return (4*lut->ht_count > 3*lut->ht_size ||
-		4*lut->ht_count + 1 < lut->ht_size);
-}
-
 static unsigned int eb_batch_index(const struct i915_execbuffer *eb)
 {
 	if (eb->args->flags & I915_EXEC_BATCH_FIRST)
@@ -675,16 +658,10 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	struct i915_gem_context *ctx;
 
 	ctx = i915_gem_context_lookup(eb->file->driver_priv, eb->args->rsvd1);
-	if (unlikely(IS_ERR(ctx)))
-		return PTR_ERR(ctx);
-
-	if (unlikely(i915_gem_context_is_banned(ctx))) {
-		DRM_DEBUG("Context %u tried to submit while banned\n",
-			  ctx->user_handle);
-		return -EIO;
-	}
+	if (unlikely(!ctx))
+		return -ENOENT;
 
-	eb->ctx = i915_gem_context_get(ctx);
+	eb->ctx = ctx;
 	eb->vm = ctx->ppgtt ? &ctx->ppgtt->base : &eb->i915->ggtt.base;
 
 	eb->context_flags = 0;
@@ -696,132 +673,74 @@ static int eb_select_context(struct i915_execbuffer *eb)
 
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
-#define INTERMEDIATE BIT(0)
-	const unsigned int count = eb->buffer_count;
-	struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
-	struct i915_vma *vma;
-	struct idr *idr;
+	struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
+	struct drm_i915_gem_object *uninitialized_var(obj);
 	unsigned int i;
-	int slow_pass = -1;
 	int err;
 
-	INIT_LIST_HEAD(&eb->relocs);
-	INIT_LIST_HEAD(&eb->unbound);
-
-	if (unlikely(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS))
-		flush_work(&lut->resize);
-	GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS);
-
-	for (i = 0; i < count; i++) {
-		__exec_to_vma(&eb->exec[i]) = 0;
-
-		hlist_for_each_entry(vma,
-				     ht_head(lut, eb->exec[i].handle),
-				     ctx_node) {
-			if (vma->ctx_handle != eb->exec[i].handle)
-				continue;
-
-			err = eb_add_vma(eb, &eb->exec[i], vma);
-			if (unlikely(err))
-				return err;
-
-			goto next_vma;
-		}
+	if (unlikely(i915_gem_context_is_closed(eb->ctx)))
+		return -ENOENT;
 
-		if (slow_pass < 0)
-			slow_pass = i;
-next_vma: ;
-	}
+	if (unlikely(i915_gem_context_is_banned(eb->ctx)))
+		return -EIO;
 
-	if (slow_pass < 0)
-		goto out;
+	INIT_LIST_HEAD(&eb->relocs);
+	INIT_LIST_HEAD(&eb->unbound);
 
-	spin_lock(&eb->file->table_lock);
-	/*
-	 * Grab a reference to the object and release the lock so we can lookup
-	 * or create the VMA without using GFP_ATOMIC
-	 */
-	idr = &eb->file->object_idr;
-	for (i = slow_pass; i < count; i++) {
-		struct drm_i915_gem_object *obj;
+	for (i = 0; i < eb->buffer_count; i++) {
+		u32 handle = eb->exec[i].handle;
+		struct i915_lut_handle *lut;
+		struct i915_vma *vma;
 
-		if (__exec_to_vma(&eb->exec[i]))
-			continue;
+		vma = radix_tree_lookup(handles_vma, handle);
+		if (likely(vma))
+			goto add_vma;
 
-		obj = to_intel_bo(idr_find(idr, eb->exec[i].handle));
+		obj = i915_gem_object_lookup(eb->file, handle);
 		if (unlikely(!obj)) {
-			spin_unlock(&eb->file->table_lock);
-			DRM_DEBUG("Invalid object handle %d at index %d\n",
-				  eb->exec[i].handle, i);
 			err = -ENOENT;
-			goto err;
+			goto err_vma;
 		}
 
-		__exec_to_vma(&eb->exec[i]) = INTERMEDIATE | (uintptr_t)obj;
-	}
-	spin_unlock(&eb->file->table_lock);
-
-	for (i = slow_pass; i < count; i++) {
-		struct drm_i915_gem_object *obj;
-
-		if (!(__exec_to_vma(&eb->exec[i]) & INTERMEDIATE))
-			continue;
-
-		/*
-		 * NOTE: We can leak any vmas created here when something fails
-		 * later on. But that's no issue since vma_unbind can deal with
-		 * vmas which are not actually bound. And since only
-		 * lookup_or_create exists as an interface to get at the vma
-		 * from the (obj, vm) we don't run the risk of creating
-		 * duplicated vmas for the same vm.
-		 */
-		obj = u64_to_ptr(typeof(*obj),
-				 __exec_to_vma(&eb->exec[i]) & ~INTERMEDIATE);
 		vma = i915_vma_instance(obj, eb->vm, NULL);
 		if (unlikely(IS_ERR(vma))) {
-			DRM_DEBUG("Failed to lookup VMA\n");
 			err = PTR_ERR(vma);
-			goto err;
+			goto err_obj;
 		}
 
-		/* First come, first served */
-		if (!vma->ctx) {
-			vma->ctx = eb->ctx;
-			vma->ctx_handle = eb->exec[i].handle;
-			hlist_add_head(&vma->ctx_node,
-				       ht_head(lut, eb->exec[i].handle));
-			lut->ht_count++;
-			lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
-			if (i915_vma_is_ggtt(vma)) {
-				GEM_BUG_ON(obj->vma_hashed);
-				obj->vma_hashed = vma;
-			}
+		lut = kmem_cache_alloc(eb->i915->luts, GFP_KERNEL);
+		if (unlikely(!lut)) {
+			err = -ENOMEM;
+			goto err_obj;
+		}
 
-			i915_vma_get(vma);
+		err = radix_tree_insert(handles_vma, handle, vma);
+		if (unlikely(err)) {
+			kfree(lut);
+			goto err_obj;
 		}
 
-		err = eb_add_vma(eb, &eb->exec[i], vma);
-		if (unlikely(err))
-			goto err;
+		list_add(&lut->obj_link, &obj->lut_list);
+		list_add(&lut->ctx_link, &eb->ctx->handles_list);
+		lut->ctx = eb->ctx;
+		lut->handle = handle;
 
-		/* Only after we validated the user didn't use our bits */
-		if (vma->ctx != eb->ctx) {
-			i915_vma_get(vma);
-			eb->exec[i].flags |= __EXEC_OBJECT_HAS_REF;
-		}
-	}
+		/* transfer ref to ctx */
+		obj = NULL;
 
-	if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
-		if (ht_needs_resize(lut))
-			queue_work(system_highpri_wq, &lut->resize);
-		else
-			lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
+add_vma:
+		err = eb_add_vma(eb, i, vma);
+		if (unlikely(err))
+			goto err_obj;
+
+		GEM_BUG_ON(vma != eb->vma[i]);
+		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
 	}
 
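Note: the two-pass idr walk is replaced by a per-context radix tree of handle→vma, with an i915_lut_handle recording the binding on both the object's and the context's lists. The slow path uses the "transfer ref to ctx" pattern: once the cache owns the object reference, the local pointer is cleared so the shared error path cannot drop a reference it no longer owns. A toy userspace sketch of that pattern (an array stands in for the radix tree; the helpers, error handling and the post-lookup failure stand-in are local simplifications):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct obj { int refcount; };

static struct obj *obj_get(void)	/* new object with one reference */
{
	struct obj *o = calloc(1, sizeof(*o));
	if (o)
		o->refcount = 1;
	return o;
}

static void obj_put(struct obj *o)	/* tolerates NULL */
{
	if (o && --o->refcount == 0)
		free(o);
}

#define CACHE_SIZE 64
static struct obj *cache[CACHE_SIZE];	/* toy stand-in for the radix tree */

static struct obj *lookup_or_create(unsigned int handle, int *err)
{
	struct obj *obj = NULL;		/* local reference, if any */
	struct obj *entry;

	entry = cache[handle % CACHE_SIZE];
	if (entry)
		goto add;		/* fast path: hit, no new reference */

	obj = obj_get();		/* slow path: we hold a local ref */
	if (!obj) {
		*err = -ENOMEM;
		goto err_obj;
	}

	cache[handle % CACHE_SIZE] = entry = obj; /* cache owns the ref now... */
	obj = NULL;				  /* ...so forget our copy */

add:
	if (entry->refcount < 1) {	/* stand-in for eb_add_vma() failing */
		*err = -EINVAL;
		goto err_obj;
	}
	return entry;

err_obj:
	obj_put(obj);	/* drops a reference only if ownership never moved */
	return NULL;
}

int main(void)
{
	int err = 0;
	printf("first:  %p\n", (void *)lookup_or_create(3, &err));
	printf("second: %p\n", (void *)lookup_or_create(3, &err));
	return 0;
}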
-out:
 	/* take note of the batch buffer before we might reorder the lists */
 	i = eb_batch_index(eb);
-	eb->batch = exec_to_vma(&eb->exec[i]);
+	eb->batch = eb->vma[i];
+	GEM_BUG_ON(eb->batch->exec_flags != &eb->flags[i]);
 
 	/*
 	 * SNA is doing fancy tricks with compressing batch buffers, which leads
@@ -832,22 +751,20 @@ out:
 	 * Note that actual hangs have only been observed on gen7, but for
 	 * paranoia do it everywhere.
 	 */
-	if (!(eb->exec[i].flags & EXEC_OBJECT_PINNED))
-		eb->exec[i].flags |= __EXEC_OBJECT_NEEDS_BIAS;
+	if (!(eb->flags[i] & EXEC_OBJECT_PINNED))
+		eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS;
 	if (eb->reloc_cache.has_fence)
-		eb->exec[i].flags |= EXEC_OBJECT_NEEDS_FENCE;
+		eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE;
 
 	eb->args->flags |= __EXEC_VALIDATED;
 	return eb_reserve(eb);
 
-err:
-	for (i = slow_pass; i < count; i++) {
-		if (__exec_to_vma(&eb->exec[i]) & INTERMEDIATE)
-			__exec_to_vma(&eb->exec[i]) = 0;
-	}
-	lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS;
+err_obj:
+	if (obj)
+		i915_gem_object_put(obj);
+err_vma:
+	eb->vma[i] = NULL;
 	return err;
-#undef INTERMEDIATE
 }
 
 static struct i915_vma *
@@ -856,7 +773,7 @@ eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
 	if (eb->lut_size < 0) {
 		if (handle >= -eb->lut_size)
 			return NULL;
-		return exec_to_vma(&eb->exec[handle]);
+		return eb->vma[handle];
 	} else {
 		struct hlist_head *head;
 		struct i915_vma *vma;
@@ -876,24 +793,21 @@ static void eb_release_vmas(const struct i915_execbuffer *eb)
 	unsigned int i;
 
 	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		struct i915_vma *vma = eb->vma[i];
+		unsigned int flags = eb->flags[i];
 
 		if (!vma)
-			continue;
+			break;
 
-		GEM_BUG_ON(vma->exec_entry != entry);
-		vma->exec_entry = NULL;
-		__exec_to_vma(entry) = 0;
+		GEM_BUG_ON(vma->exec_flags != &eb->flags[i]);
+		vma->exec_flags = NULL;
+		eb->vma[i] = NULL;
 
-		if (entry->flags & __EXEC_OBJECT_HAS_PIN)
-			__eb_unreserve_vma(vma, entry);
+		if (flags & __EXEC_OBJECT_HAS_PIN)
+			__eb_unreserve_vma(vma, flags);
 
-		if (entry->flags & __EXEC_OBJECT_HAS_REF)
+		if (flags & __EXEC_OBJECT_HAS_REF)
 			i915_vma_put(vma);
-
-		entry->flags &=
-			~(__EXEC_OBJECT_RESERVED | __EXEC_OBJECT_HAS_REF);
 	}
 }
 
@@ -1266,7 +1180,9 @@ relocate_entry(struct i915_vma *vma,
 
 	if (!eb->reloc_cache.vaddr &&
 	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
-	     !reservation_object_test_signaled_rcu(vma->resv, true))) {
+	     !reservation_object_test_signaled_rcu(vma->resv, true)) &&
+	    __intel_engine_can_store_dword(eb->reloc_cache.gen,
+					   eb->engine->class)) {
 		const unsigned int gen = eb->reloc_cache.gen;
 		unsigned int len;
 		u32 *batch;
@@ -1276,10 +1192,8 @@ relocate_entry(struct i915_vma *vma,
 			len = offset & 7 ? 8 : 5;
 		else if (gen >= 4)
 			len = 4;
-		else if (gen >= 3)
+		else
 			len = 3;
-		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
-			goto repeat;
 
 		batch = reloc_gpu(eb, vma, len);
 		if (IS_ERR(batch))
@@ -1382,7 +1296,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	}
 
 	if (reloc->write_domain) {
-		target->exec_entry->flags |= EXEC_OBJECT_WRITE;
+		*target->exec_flags |= EXEC_OBJECT_WRITE;
 
 		/*
 		 * Sandybridge PPGTT errata: We need a global gtt mapping
@@ -1432,9 +1346,9 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	 * patching using the GPU (though that should be serialised by the
 	 * timeline). To be completely sure, and since we are required to
 	 * do relocations we are already stalling, disable the user's opt
-	 * of our synchronisation.
+	 * out of our synchronisation.
 	 */
-	vma->exec_entry->flags &= ~EXEC_OBJECT_ASYNC;
+	*vma->exec_flags &= ~EXEC_OBJECT_ASYNC;
 
 	/* and update the user's relocation entry */
 	return relocate_entry(vma, reloc, eb, target);
@@ -1445,7 +1359,7 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
 	struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
 	struct drm_i915_gem_relocation_entry __user *urelocs;
-	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
 	unsigned int remain;
 
 	urelocs = u64_to_user_ptr(entry->relocs_ptr);
@@ -1528,7 +1442,7 @@ out:
 static int
 eb_relocate_vma_slow(struct i915_execbuffer *eb, struct i915_vma *vma)
 {
-	const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	const struct drm_i915_gem_exec_object2 *entry = exec_entry(eb, vma);
 	struct drm_i915_gem_relocation_entry *relocs =
 		u64_to_ptr(typeof(*relocs), entry->relocs_ptr);
 	unsigned int i;
@@ -1732,6 +1646,8 @@ repeat:
 	if (err)
 		goto err;
 
+	GEM_BUG_ON(!eb->batch);
+
 	list_for_each_entry(vma, &eb->relocs, reloc_link) {
 		if (!have_copy) {
 			pagefault_disable();
@@ -1825,11 +1741,11 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
 	int err;
 
 	for (i = 0; i < count; i++) {
-		struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		unsigned int flags = eb->flags[i];
+		struct i915_vma *vma = eb->vma[i];
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		if (entry->flags & EXEC_OBJECT_CAPTURE) {
+		if (flags & EXEC_OBJECT_CAPTURE) {
 			struct i915_gem_capture_list *capture;
 
 			capture = kmalloc(sizeof(*capture), GFP_KERNEL);
@@ -1837,35 +1753,47 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
 				return -ENOMEM;
 
 			capture->next = eb->request->capture_list;
-			capture->vma = vma;
+			capture->vma = eb->vma[i];
 			eb->request->capture_list = capture;
 		}
 
-		if (unlikely(obj->cache_dirty && !obj->cache_coherent)) {
+		/*
+		 * If the GPU is not _reading_ through the CPU cache, we need
+		 * to make sure that any writes (both previous GPU writes from
+		 * before a change in snooping levels and normal CPU writes)
+		 * caught in that cache are flushed to main memory.
+		 *
		 * We want to say
+		 *   obj->cache_dirty &&
+		 *   !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)
+		 * but gcc's optimiser doesn't handle that as well and emits
+		 * two jumps instead of one. Maybe one day...
+		 */
+		if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) {
 			if (i915_gem_clflush_object(obj, 0))
-				entry->flags &= ~EXEC_OBJECT_ASYNC;
+				flags &= ~EXEC_OBJECT_ASYNC;
 		}
 
-		if (entry->flags & EXEC_OBJECT_ASYNC)
-			goto skip_flushes;
+		if (flags & EXEC_OBJECT_ASYNC)
+			continue;
 
 		err = i915_gem_request_await_object
-			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
+			(eb->request, obj, flags & EXEC_OBJECT_WRITE);
 		if (err)
 			return err;
-
-skip_flushes:
-		i915_vma_move_to_active(vma, eb->request, entry->flags);
-		__eb_unreserve_vma(vma, entry);
-		vma->exec_entry = NULL;
 	}
 
 	for (i = 0; i < count; i++) {
-		const struct drm_i915_gem_exec_object2 *entry = &eb->exec[i];
-		struct i915_vma *vma = exec_to_vma(entry);
+		unsigned int flags = eb->flags[i];
+		struct i915_vma *vma = eb->vma[i];
 
-		eb_export_fence(vma, eb->request, entry->flags);
-		if (unlikely(entry->flags & __EXEC_OBJECT_HAS_REF))
+		i915_vma_move_to_active(vma, eb->request, flags);
+		eb_export_fence(vma, eb->request, flags);
+
+		__eb_unreserve_vma(vma, flags);
+		vma->exec_flags = NULL;
+
+		if (unlikely(flags & __EXEC_OBJECT_HAS_REF))
 			i915_vma_put(vma);
 	}
 	eb->exec = NULL;
 
@@ -1883,8 +1811,10 @@ static bool i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec)
 		return false;
 
 	/* Kernel clipping was a DRI1 misfeature */
-	if (exec->num_cliprects || exec->cliprects_ptr)
-		return false;
+	if (!(exec->flags & I915_EXEC_FENCE_ARRAY)) {
+		if (exec->num_cliprects || exec->cliprects_ptr)
+			return false;
+	}
 
 	if (exec->DR4 == 0xffffffff) {
 		DRM_DEBUG("UXA submitting garbage DR4, fixing up\n");
@@ -1992,11 +1922,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 	if (IS_ERR(vma))
 		goto out;
 
-	vma->exec_entry =
-		memset(&eb->exec[eb->buffer_count++],
-		       0, sizeof(*vma->exec_entry));
-	vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
-	__exec_to_vma(vma->exec_entry) = (uintptr_t)i915_vma_get(vma);
+	eb->vma[eb->buffer_count] = i915_vma_get(vma);
+	eb->flags[eb->buffer_count] =
+		__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
+	vma->exec_flags = &eb->flags[eb->buffer_count];
+	eb->buffer_count++;
 
 out:
 	i915_gem_object_unpin_pages(shadow_batch_obj);
@@ -2115,11 +2045,129 @@ eb_select_engine(struct drm_i915_private *dev_priv,
 	return engine;
 }
 
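Note: the helpers added next implement the new I915_EXEC_FENCE_ARRAY ABI, which reuses the dead DRI1 cliprects fields (cliprects_ptr/num_cliprects) to pass an array of {syncobj handle, flags} pairs. From userspace the call might be prepared as in this sketch; the structs are trimmed, the handles are hypothetical, and the flag bit values mirror the i915 uapi of this era but should be taken from <drm/i915_drm.h> in real code:

#include <stdint.h>
#include <string.h>

struct drm_i915_gem_exec_fence {
	uint32_t handle;	/* drm_syncobj handle */
	uint32_t flags;		/* wait before and/or signal after */
};

#define I915_EXEC_FENCE_WAIT	(1u << 0)
#define I915_EXEC_FENCE_SIGNAL	(1u << 1)
#define I915_EXEC_FENCE_ARRAY	(1ull << 19)	/* execbuffer2 flag */

struct execbuffer2_args {	/* trimmed drm_i915_gem_execbuffer2 */
	uint64_t cliprects_ptr;
	uint32_t num_cliprects;
	uint64_t flags;
};

int main(void)
{
	struct drm_i915_gem_exec_fence fences[2];
	struct execbuffer2_args args;

	fences[0].handle = 1;	/* hypothetical syncobj: wait on it */
	fences[0].flags = I915_EXEC_FENCE_WAIT;
	fences[1].handle = 2;	/* hypothetical syncobj: signal it */
	fences[1].flags = I915_EXEC_FENCE_SIGNAL;

	memset(&args, 0, sizeof(args));
	args.cliprects_ptr = (uintptr_t)fences;	/* reused cliprects slot */
	args.num_cliprects = 2;			/* now the fence count */
	args.flags = I915_EXEC_FENCE_ARRAY;

	/* DRM_IOCTL_I915_GEM_EXECBUFFER2 would be issued here */
	return 0;
}

This is also why i915_gem_check_execbuffer() above only rejects non-zero cliprects when I915_EXEC_FENCE_ARRAY is not set.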
+static void
+__free_fence_array(struct drm_syncobj **fences, unsigned int n)
+{
+	while (n--)
+		drm_syncobj_put(ptr_mask_bits(fences[n], 2));
+	kvfree(fences);
+}
+
+static struct drm_syncobj **
+get_fence_array(struct drm_i915_gem_execbuffer2 *args,
+		struct drm_file *file)
+{
+	const unsigned int nfences = args->num_cliprects;
+	struct drm_i915_gem_exec_fence __user *user;
+	struct drm_syncobj **fences;
+	unsigned int n;
+	int err;
+
+	if (!(args->flags & I915_EXEC_FENCE_ARRAY))
+		return NULL;
+
+	if (nfences > SIZE_MAX / sizeof(*fences))
+		return ERR_PTR(-EINVAL);
+
+	user = u64_to_user_ptr(args->cliprects_ptr);
+	if (!access_ok(VERIFY_READ, user, nfences * 2 * sizeof(u32)))
+		return ERR_PTR(-EFAULT);
+
+	fences = kvmalloc_array(args->num_cliprects, sizeof(*fences),
+				__GFP_NOWARN | GFP_TEMPORARY);
+	if (!fences)
+		return ERR_PTR(-ENOMEM);
+
+	for (n = 0; n < nfences; n++) {
+		struct drm_i915_gem_exec_fence fence;
+		struct drm_syncobj *syncobj;
+
+		if (__copy_from_user(&fence, user++, sizeof(fence))) {
+			err = -EFAULT;
+			goto err;
+		}
+
+		syncobj = drm_syncobj_find(file, fence.handle);
+		if (!syncobj) {
+			DRM_DEBUG("Invalid syncobj handle provided\n");
+			err = -ENOENT;
+			goto err;
+		}
+
+		fences[n] = ptr_pack_bits(syncobj, fence.flags, 2);
+	}
+
+	return fences;
+
+err:
+	__free_fence_array(fences, n);
+	return ERR_PTR(err);
+}
+
+static void
+put_fence_array(struct drm_i915_gem_execbuffer2 *args,
+		struct drm_syncobj **fences)
+{
+	if (fences)
+		__free_fence_array(fences, args->num_cliprects);
+}
+
+static int
+await_fence_array(struct i915_execbuffer *eb,
+		  struct drm_syncobj **fences)
+{
+	const unsigned int nfences = eb->args->num_cliprects;
+	unsigned int n;
+	int err;
+
+	for (n = 0; n < nfences; n++) {
+		struct drm_syncobj *syncobj;
+		struct dma_fence *fence;
+		unsigned int flags;
+
+		syncobj = ptr_unpack_bits(fences[n], &flags, 2);
+		if (!(flags & I915_EXEC_FENCE_WAIT))
+			continue;
+
+		fence = drm_syncobj_fence_get(syncobj);
+		if (!fence)
+			return -EINVAL;
+
+		err = i915_gem_request_await_dma_fence(eb->request, fence);
+		dma_fence_put(fence);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static void
+signal_fence_array(struct i915_execbuffer *eb,
+		   struct drm_syncobj **fences)
+{
+	const unsigned int nfences = eb->args->num_cliprects;
+	struct dma_fence * const fence = &eb->request->fence;
+	unsigned int n;
+
+	for (n = 0; n < nfences; n++) {
+		struct drm_syncobj *syncobj;
+		unsigned int flags;
+
+		syncobj = ptr_unpack_bits(fences[n], &flags, 2);
+		if (!(flags & I915_EXEC_FENCE_SIGNAL))
+			continue;
+
+		drm_syncobj_replace_fence(syncobj, fence);
+	}
+}
+
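Note: get_fence_array() packs the two per-fence flag bits (I915_EXEC_FENCE_WAIT/SIGNAL) into the low bits of the syncobj pointer itself, which works because kernel allocations are aligned to well over 4 bytes; ptr_unpack_bits()/ptr_mask_bits() recover them later, so no second array is needed. Simplified local stand-ins for those i915 macros, exercised in userspace:

#include <assert.h>
#include <stdint.h>

static void *ptr_pack_bits(void *ptr, unsigned int bits, unsigned int n)
{
	/* alignment guarantees the low n bits of ptr are zero */
	assert(((uintptr_t)ptr & ((1u << n) - 1)) == 0);
	return (void *)((uintptr_t)ptr | bits);
}

static void *ptr_unpack_bits(void *packed, unsigned int *bits, unsigned int n)
{
	*bits = (uintptr_t)packed & ((1u << n) - 1);
	return (void *)((uintptr_t)packed & ~(uintptr_t)((1u << n) - 1));
}

int main(void)
{
	static int syncobj;	/* stand-in object, suitably aligned */
	unsigned int flags;
	void *p = ptr_pack_bits(&syncobj, 0x2 /* e.g. FENCE_SIGNAL */, 2);

	assert(ptr_unpack_bits(p, &flags, 2) == (void *)&syncobj);
	assert(flags == 0x2);
	return 0;
}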
 static int
 i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_file *file,
 		       struct drm_i915_gem_execbuffer2 *args,
-		       struct drm_i915_gem_exec_object2 *exec)
+		       struct drm_i915_gem_exec_object2 *exec,
+		       struct drm_syncobj **fences)
 {
 	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
@@ -2135,8 +2183,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.args = args;
 	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
 		args->flags |= __EXEC_HAS_RELOC;
+
 	eb.exec = exec;
-	eb.ctx = NULL;
+	eb.vma = (struct i915_vma **)(exec + args->buffer_count + 1);
+	eb.vma[0] = NULL;
+	eb.flags = (unsigned int *)(eb.vma + args->buffer_count + 1);
+
 	eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
 	if (USES_FULL_PPGTT(eb.i915))
 		eb.invalid_flags |= EXEC_OBJECT_NEEDS_GTT;
@@ -2194,6 +2246,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 
 	GEM_BUG_ON(!eb.lut_size);
 
+	err = eb_select_context(&eb);
+	if (unlikely(err))
+		goto err_destroy;
+
 	/*
 	 * Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
@@ -2202,14 +2258,11 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 * 100ms.
 	 */
 	intel_runtime_pm_get(eb.i915);
+
 	err = i915_mutex_lock_interruptible(dev);
 	if (err)
 		goto err_rpm;
 
-	err = eb_select_context(&eb);
-	if (unlikely(err))
-		goto err_unlock;
-
 	err = eb_relocate(&eb);
 	if (err) {
 		/*
@@ -2223,7 +2276,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}
 
-	if (unlikely(eb.batch->exec_entry->flags & EXEC_OBJECT_WRITE)) {
+	if (unlikely(*eb.batch->exec_flags & EXEC_OBJECT_WRITE)) {
 		DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
 		err = -EINVAL;
 		goto err_vma;
@@ -2305,6 +2358,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_request;
 	}
 
+	if (fences) {
+		err = await_fence_array(&eb, fences);
+		if (err)
+			goto err_request;
+	}
+
 	if (out_fence_fd != -1) {
 		out_fence = sync_file_create(&eb.request->fence);
 		if (!out_fence) {
@@ -2328,6 +2387,9 @@ err_request:
 	__i915_add_request(eb.request, err == 0);
 	add_to_client(eb.request, file);
 
+	if (fences)
+		signal_fence_array(&eb, fences);
+
 	if (out_fence) {
 		if (err == 0) {
 			fd_install(out_fence_fd, out_fence->file);
@@ -2345,11 +2407,11 @@ err_batch_unpin:
 err_vma:
 	if (eb.exec)
 		eb_release_vmas(&eb);
-	i915_gem_context_put(eb.ctx);
-err_unlock:
 	mutex_unlock(&dev->struct_mutex);
 err_rpm:
 	intel_runtime_pm_put(eb.i915);
+	i915_gem_context_put(eb.ctx);
+err_destroy:
 	eb_destroy(&eb);
 err_out_fence:
 	if (out_fence_fd != -1)
@@ -2367,7 +2429,9 @@ int
 i915_gem_execbuffer(struct drm_device *dev, void *data,
 		    struct drm_file *file)
 {
-	const size_t sz = sizeof(struct drm_i915_gem_exec_object2);
+	const size_t sz = (sizeof(struct drm_i915_gem_exec_object2) +
+			   sizeof(struct i915_vma *) +
+			   sizeof(unsigned int));
 	struct drm_i915_gem_execbuffer *args = data;
 	struct drm_i915_gem_execbuffer2 exec2;
 	struct drm_i915_gem_exec_object *exec_list = NULL;
@@ -2429,7 +2493,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data,
 		exec2_list[i].flags = 0;
 	}
 
-	err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list);
+	err = i915_gem_do_execbuffer(dev, file, &exec2, exec2_list, NULL);
 	if (exec2.flags & __EXEC_HAS_RELOC) {
 		struct drm_i915_gem_exec_object __user *user_exec_list =
 			u64_to_user_ptr(args->buffers_ptr);
@@ -2458,9 +2522,12 @@ int
 i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		     struct drm_file *file)
 {
-	const size_t sz = sizeof(struct drm_i915_gem_exec_object2);
+	const size_t sz = (sizeof(struct drm_i915_gem_exec_object2) +
+			   sizeof(struct i915_vma *) +
+			   sizeof(unsigned int));
 	struct drm_i915_gem_execbuffer2 *args = data;
 	struct drm_i915_gem_exec_object2 *exec2_list;
+	struct drm_syncobj **fences = NULL;
 	int err;
 
 	if (args->buffer_count < 1 || args->buffer_count > SIZE_MAX / sz - 1) {
@@ -2487,7 +2554,15 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EFAULT;
 	}
 
-	err = i915_gem_do_execbuffer(dev, file, args, exec2_list);
+	if (args->flags & I915_EXEC_FENCE_ARRAY) {
+		fences = get_fence_array(args, file);
+		if (IS_ERR(fences)) {
+			kvfree(exec2_list);
+			return PTR_ERR(fences);
+		}
+	}
+
+	err = i915_gem_do_execbuffer(dev, file, args, exec2_list, fences);
 
 	/*
 	 * Now that we have begun execution of the batchbuffer, we ignore
@@ -2517,6 +2592,7 @@ end_user:
 	}
 
 	args->flags &= ~__I915_EXEC_UNKNOWN_FLAGS;
+	put_fence_array(args, fences);
 	kvfree(exec2_list);
 	return err;
 }
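Note: the enlarged per-buffer size sz and the eb.vma/eb.flags setup in i915_gem_do_execbuffer() imply that a single allocation now carries three consecutive arrays: exec2[count+1], then vma[count+1], then flags[count+1], with the spare slot available for the shadow batch appended by eb_parse() and vma[0] primed as a NULL sentinel for the release path. A sketch of that carving with simplified types; it assumes, as holds for the real structs here, that the exec-object element size keeps the trailing arrays sufficiently aligned:

#include <stdio.h>
#include <stdlib.h>

struct exec_object2 { unsigned long long offset; unsigned int flags; };
struct vma;	/* opaque; we only store pointers to it */

int main(void)
{
	unsigned int count = 8;	/* args->buffer_count */
	size_t sz = sizeof(struct exec_object2) +
		    sizeof(struct vma *) +
		    sizeof(unsigned int);

	/* one buffer sized for count+1 elements of all three arrays */
	struct exec_object2 *exec = calloc(count + 1, sz);
	if (!exec)
		return 1;

	/* carve the kernel-private arrays out of the same allocation */
	struct vma **vma = (struct vma **)(exec + count + 1);
	unsigned int *flags = (unsigned int *)(vma + count + 1);

	vma[0] = NULL;	/* sentinel: release loop stops at first hole */
	printf("exec=%p vma=%p flags=%p\n",
	       (void *)exec, (void *)vma, (void *)flags);
	free(exec);
	return 0;
}

Packing all three arrays into the one kvmalloc keeps the copy-from-user buffer and the execbuf working state cache-adjacent, and a single kvfree() releases everything.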