Diffstat (limited to 'drivers/gpu/drm')
459 files changed, 15013 insertions, 7491 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index a52795d9b458..c04f44a90392 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -35,41 +35,50 @@ #include "acp_gfx_if.h" -#define ACP_TILE_ON_MASK 0x03 -#define ACP_TILE_OFF_MASK 0x02 -#define ACP_TILE_ON_RETAIN_REG_MASK 0x1f -#define ACP_TILE_OFF_RETAIN_REG_MASK 0x20 - -#define ACP_TILE_P1_MASK 0x3e -#define ACP_TILE_P2_MASK 0x3d -#define ACP_TILE_DSP0_MASK 0x3b -#define ACP_TILE_DSP1_MASK 0x37 - -#define ACP_TILE_DSP2_MASK 0x2f - -#define ACP_DMA_REGS_END 0x146c0 -#define ACP_I2S_PLAY_REGS_START 0x14840 -#define ACP_I2S_PLAY_REGS_END 0x148b4 -#define ACP_I2S_CAP_REGS_START 0x148b8 -#define ACP_I2S_CAP_REGS_END 0x1496c - -#define ACP_I2S_COMP1_CAP_REG_OFFSET 0xac -#define ACP_I2S_COMP2_CAP_REG_OFFSET 0xa8 -#define ACP_I2S_COMP1_PLAY_REG_OFFSET 0x6c -#define ACP_I2S_COMP2_PLAY_REG_OFFSET 0x68 - -#define mmACP_PGFSM_RETAIN_REG 0x51c9 -#define mmACP_PGFSM_CONFIG_REG 0x51ca -#define mmACP_PGFSM_READ_REG_0 0x51cc - -#define mmACP_MEM_SHUT_DOWN_REQ_LO 0x51f8 -#define mmACP_MEM_SHUT_DOWN_REQ_HI 0x51f9 -#define mmACP_MEM_SHUT_DOWN_STS_LO 0x51fa -#define mmACP_MEM_SHUT_DOWN_STS_HI 0x51fb - -#define ACP_TIMEOUT_LOOP 0x000000FF -#define ACP_DEVS 3 -#define ACP_SRC_ID 162 +#define ACP_TILE_ON_MASK 0x03 +#define ACP_TILE_OFF_MASK 0x02 +#define ACP_TILE_ON_RETAIN_REG_MASK 0x1f +#define ACP_TILE_OFF_RETAIN_REG_MASK 0x20 + +#define ACP_TILE_P1_MASK 0x3e +#define ACP_TILE_P2_MASK 0x3d +#define ACP_TILE_DSP0_MASK 0x3b +#define ACP_TILE_DSP1_MASK 0x37 + +#define ACP_TILE_DSP2_MASK 0x2f + +#define ACP_DMA_REGS_END 0x146c0 +#define ACP_I2S_PLAY_REGS_START 0x14840 +#define ACP_I2S_PLAY_REGS_END 0x148b4 +#define ACP_I2S_CAP_REGS_START 0x148b8 +#define ACP_I2S_CAP_REGS_END 0x1496c + +#define ACP_I2S_COMP1_CAP_REG_OFFSET 0xac +#define ACP_I2S_COMP2_CAP_REG_OFFSET 0xa8 +#define ACP_I2S_COMP1_PLAY_REG_OFFSET 0x6c +#define ACP_I2S_COMP2_PLAY_REG_OFFSET 0x68 + +#define mmACP_PGFSM_RETAIN_REG 0x51c9 +#define mmACP_PGFSM_CONFIG_REG 0x51ca +#define mmACP_PGFSM_READ_REG_0 0x51cc + +#define mmACP_MEM_SHUT_DOWN_REQ_LO 0x51f8 +#define mmACP_MEM_SHUT_DOWN_REQ_HI 0x51f9 +#define mmACP_MEM_SHUT_DOWN_STS_LO 0x51fa +#define mmACP_MEM_SHUT_DOWN_STS_HI 0x51fb + +#define mmACP_CONTROL 0x5131 +#define mmACP_STATUS 0x5133 +#define mmACP_SOFT_RESET 0x5134 +#define ACP_CONTROL__ClkEn_MASK 0x1 +#define ACP_SOFT_RESET__SoftResetAud_MASK 0x100 +#define ACP_SOFT_RESET__SoftResetAudDone_MASK 0x1000000 +#define ACP_CLOCK_EN_TIME_OUT_VALUE 0x000000FF +#define ACP_SOFT_RESET_DONE_TIME_OUT_VALUE 0x000000FF + +#define ACP_TIMEOUT_LOOP 0x000000FF +#define ACP_DEVS 3 +#define ACP_SRC_ID 162 enum { ACP_TILE_P1 = 0, @@ -260,6 +269,8 @@ static int acp_hw_init(void *handle) { int r, i; uint64_t acp_base; + u32 val = 0; + u32 count = 0; struct device *dev; struct i2s_platform_data *i2s_pdata; @@ -371,6 +382,8 @@ static int acp_hw_init(void *handle) adev->acp.acp_cell[0].name = "acp_audio_dma"; adev->acp.acp_cell[0].num_resources = 4; adev->acp.acp_cell[0].resources = &adev->acp.acp_res[0]; + adev->acp.acp_cell[0].platform_data = &adev->asic_type; + adev->acp.acp_cell[0].pdata_size = sizeof(adev->asic_type); adev->acp.acp_cell[1].name = "designware-i2s"; adev->acp.acp_cell[1].num_resources = 1; @@ -400,6 +413,46 @@ static int acp_hw_init(void *handle) } } + /* Assert Soft reset of ACP */ + val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); + + val |= ACP_SOFT_RESET__SoftResetAud_MASK; + 
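The hunk above opens a sequence that acp_hw_init() and acp_hw_fini() repeat four times in this patch: write a control bit, then poll a status register with a bounded retry budget and a 100us delay per iteration. Below is a userspace mock of that poll idiom; the register file and the helper name reg_poll() are hypothetical stand-ins for cgs_read_register()/cgs_write_register().

#include <stdint.h>
#include <stdio.h>

/* mock register file standing in for the cgs accessors */
static uint32_t regs[0x6000];

static uint32_t reg_read(uint32_t reg)            { return regs[reg]; }
static void reg_write(uint32_t reg, uint32_t val) { regs[reg] = val; }

#define ACP_SOFT_RESET           0x5134
#define SOFT_RESET_AUD_MASK      0x100
#define SOFT_RESET_AUD_DONE_MASK 0x1000000
#define TIMEOUT_TRIES            0xff

/* bounded poll: wait until (reg & mask) == expect or the budget runs out */
static int reg_poll(uint32_t reg, uint32_t mask, uint32_t expect, uint32_t tries)
{
	while (tries--) {
		if ((reg_read(reg) & mask) == expect)
			return 0;
		/* the driver udelay(100)s here between reads */
	}
	return -1;	/* the driver returns -ETIMEDOUT */
}

int main(void)
{
	/* assert soft reset, then pretend the hardware completed it */
	reg_write(ACP_SOFT_RESET, reg_read(ACP_SOFT_RESET) | SOFT_RESET_AUD_MASK);
	regs[ACP_SOFT_RESET] |= SOFT_RESET_AUD_DONE_MASK;

	printf("reset done: %d\n",
	       reg_poll(ACP_SOFT_RESET, SOFT_RESET_AUD_DONE_MASK,
			SOFT_RESET_AUD_DONE_MASK, TIMEOUT_TRIES));
	return 0;
}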
cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val); + + count = ACP_SOFT_RESET_DONE_TIME_OUT_VALUE; + while (true) { + val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); + if (ACP_SOFT_RESET__SoftResetAudDone_MASK == + (val & ACP_SOFT_RESET__SoftResetAudDone_MASK)) + break; + if (--count == 0) { + dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); + return -ETIMEDOUT; + } + udelay(100); + } + /* Enable clock to ACP and wait until the clock is enabled */ + val = cgs_read_register(adev->acp.cgs_device, mmACP_CONTROL); + val = val | ACP_CONTROL__ClkEn_MASK; + cgs_write_register(adev->acp.cgs_device, mmACP_CONTROL, val); + + count = ACP_CLOCK_EN_TIME_OUT_VALUE; + + while (true) { + val = cgs_read_register(adev->acp.cgs_device, mmACP_STATUS); + if (val & (u32) 0x1) + break; + if (--count == 0) { + dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); + return -ETIMEDOUT; + } + udelay(100); + } + /* Deassert the SOFT RESET flags */ + val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); + val &= ~ACP_SOFT_RESET__SoftResetAud_MASK; + cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val); + return 0; } @@ -412,6 +465,8 @@ static int acp_hw_init(void *handle) static int acp_hw_fini(void *handle) { int i, ret; + u32 val = 0; + u32 count = 0; struct device *dev; struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -419,6 +474,42 @@ static int acp_hw_fini(void *handle) if (!adev->acp.acp_cell) return 0; + /* Assert Soft reset of ACP */ + val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); + + val |= ACP_SOFT_RESET__SoftResetAud_MASK; + cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val); + + count = ACP_SOFT_RESET_DONE_TIME_OUT_VALUE; + while (true) { + val = cgs_read_register(adev->acp.cgs_device, mmACP_SOFT_RESET); + if (ACP_SOFT_RESET__SoftResetAudDone_MASK == + (val & ACP_SOFT_RESET__SoftResetAudDone_MASK)) + break; + if (--count == 0) { + dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); + return -ETIMEDOUT; + } + udelay(100); + } + /* Disable ACP clock */ + val = cgs_read_register(adev->acp.cgs_device, mmACP_CONTROL); + val &= ~ACP_CONTROL__ClkEn_MASK; + cgs_write_register(adev->acp.cgs_device, mmACP_CONTROL, val); + + count = ACP_CLOCK_EN_TIME_OUT_VALUE; + + while (true) { + val = cgs_read_register(adev->acp.cgs_device, mmACP_STATUS); + if (val & (u32) 0x1) + break; + if (--count == 0) { + dev_err(&adev->pdev->dev, "Failed to reset ACP\n"); + return -ETIMEDOUT; + } + udelay(100); + } + if (adev->acp.acp_genpd) { for (i = 0; i < ACP_DEVS ; i++) { dev = get_mfd_cell_dev(adev->acp.acp_cell[i].name, i); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index f66d33e4baca..f450b69323fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -1766,34 +1766,32 @@ bool amdgpu_atombios_scratch_need_asic_init(struct amdgpu_device *adev) return true; } -/* Atom needs data in little endian format - * so swap as appropriate when copying data to - * or from atom. Note that atom operates on - * dw units. +/* Atom needs data in little endian format so swap as appropriate when copying + * data to or from atom. Note that atom operates on dw units. + * + * Use to_le=true when sending data to atom and provide at least + * ALIGN(num_bytes,4) bytes in the dst buffer. + * + * Use to_le=false when receiving data from atom and provide ALIGN(num_bytes,4) + * byes in the src buffer. 
*/ void amdgpu_atombios_copy_swap(u8 *dst, u8 *src, u8 num_bytes, bool to_le) { #ifdef __BIG_ENDIAN - u8 src_tmp[20], dst_tmp[20]; /* used for byteswapping */ - u32 *dst32, *src32; + u32 src_tmp[5], dst_tmp[5]; int i; + u8 align_num_bytes = ALIGN(num_bytes, 4); - memcpy(src_tmp, src, num_bytes); - src32 = (u32 *)src_tmp; - dst32 = (u32 *)dst_tmp; if (to_le) { - for (i = 0; i < ((num_bytes + 3) / 4); i++) - dst32[i] = cpu_to_le32(src32[i]); - memcpy(dst, dst_tmp, num_bytes); + memcpy(src_tmp, src, num_bytes); + for (i = 0; i < align_num_bytes / 4; i++) + dst_tmp[i] = cpu_to_le32(src_tmp[i]); + memcpy(dst, dst_tmp, align_num_bytes); } else { - u8 dws = num_bytes & ~3; - for (i = 0; i < ((num_bytes + 3) / 4); i++) - dst32[i] = le32_to_cpu(src32[i]); - memcpy(dst, dst_tmp, dws); - if (num_bytes % 4) { - for (i = 0; i < (num_bytes % 4); i++) - dst[dws+i] = dst_tmp[dws+i]; - } + memcpy(src_tmp, src, align_num_bytes); + for (i = 0; i < align_num_bytes / 4; i++) + dst_tmp[i] = le32_to_cpu(src_tmp[i]); + memcpy(dst, dst_tmp, num_bytes); } #else memcpy(dst, src, num_bytes); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index f9ffe8ef0cd6..ff8efd0f8fd5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -71,19 +71,33 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev) struct atom_context *ctx = adev->mode_info.atom_context; int index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, vram_usagebyfirmware); + struct vram_usagebyfirmware_v2_1 * firmware_usage; + uint32_t start_addr, size; uint16_t data_offset; int usage_bytes = 0; if (amdgpu_atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) { - struct vram_usagebyfirmware_v2_1 *firmware_usage = - (struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset); - + firmware_usage = (struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset); DRM_DEBUG("atom firmware requested %08x %dkb fw %dkb drv\n", le32_to_cpu(firmware_usage->start_address_in_kb), le16_to_cpu(firmware_usage->used_by_firmware_in_kb), le16_to_cpu(firmware_usage->used_by_driver_in_kb)); - usage_bytes = le16_to_cpu(firmware_usage->used_by_driver_in_kb) * 1024; + start_addr = le32_to_cpu(firmware_usage->start_address_in_kb); + size = le16_to_cpu(firmware_usage->used_by_firmware_in_kb); + + if ((uint32_t)(start_addr & ATOM_VRAM_OPERATION_FLAGS_MASK) == + (uint32_t)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATION << + ATOM_VRAM_OPERATION_FLAGS_SHIFT)) { + /* Firmware request VRAM reservation for SR-IOV */ + adev->fw_vram_usage.start_offset = (start_addr & + (~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10; + adev->fw_vram_usage.size = size << 10; + /* Use the default scratch size */ + usage_bytes = 0; + } else { + usage_bytes = le16_to_cpu(firmware_usage->used_by_driver_in_kb) << 10; + } } ctx->scratch_size_bytes = 0; if (usage_bytes == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0b9332e65a4c..efcacb827de7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -546,7 +546,7 @@ int amdgpu_wb_get(struct amdgpu_device *adev, u32 *wb) if (offset < adev->wb.num_wb) { __set_bit(offset, adev->wb.used); - *wb = offset * 8; /* convert to dw offset */ + *wb = offset << 3; /* convert to dw offset */ return 0; } else { return -EINVAL; @@ -564,7 +564,7 @@ int amdgpu_wb_get(struct amdgpu_device *adev, u32 *wb) 
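The rewritten amdgpu_atombios_copy_swap() above drops the old partial-dword tail handling: it stages through a u32 buffer and always copies ALIGN(num_bytes, 4) bytes on the atom-facing side, which is why the new comment requires callers to size their buffers to the aligned length. A standalone sketch of the big-endian branch, with bswap32() standing in for the kernel's cpu_to_le32()/le32_to_cpu():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN4(x) (((x) + 3u) & ~3u)

/* stand-in for cpu_to_le32()/le32_to_cpu() on a big-endian host */
static uint32_t bswap32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0xff00) |
	       ((v << 8) & 0xff0000) | (v << 24);
}

static void copy_swap(uint8_t *dst, const uint8_t *src, uint8_t num_bytes,
		      int to_le)
{
	uint32_t src_tmp[5] = { 0 }, dst_tmp[5];	/* num_bytes <= 20 */
	uint8_t n = ALIGN4(num_bytes);
	int i;

	/* to_le: read num_bytes, write n; from atom: read n, write
	 * num_bytes. The side copied with length n is why callers must
	 * provide ALIGN(num_bytes, 4) bytes of buffer there.
	 */
	memcpy(src_tmp, src, to_le ? num_bytes : n);
	for (i = 0; i < n / 4; i++)
		dst_tmp[i] = bswap32(src_tmp[i]);
	memcpy(dst, dst_tmp, to_le ? n : num_bytes);
}

int main(void)
{
	uint8_t in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, out[8] = { 0 };
	int i;

	copy_swap(out, in, 6, 1);	/* dst sized for ALIGN(6, 4) == 8 */
	for (i = 0; i < 8; i++)
		printf("%02x ", out[i]);
	printf("\n");
	return 0;
}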
void amdgpu_wb_free(struct amdgpu_device *adev, u32 wb) { if (wb < adev->wb.num_wb) - __clear_bit(wb, adev->wb.used); + __clear_bit(wb >> 3, adev->wb.used); } /** @@ -744,27 +744,6 @@ bool amdgpu_need_post(struct amdgpu_device *adev) { uint32_t reg; - if (adev->has_hw_reset) { - adev->has_hw_reset = false; - return true; - } - - /* bios scratch used on CIK+ */ - if (adev->asic_type >= CHIP_BONAIRE) - return amdgpu_atombios_scratch_need_asic_init(adev); - - /* check MEM_SIZE for older asics */ - reg = amdgpu_asic_get_config_memsize(adev); - - if ((reg != 0) && (reg != 0xffffffff)) - return false; - - return true; - -} - -static bool amdgpu_vpost_needed(struct amdgpu_device *adev) -{ if (amdgpu_sriov_vf(adev)) return false; @@ -787,7 +766,23 @@ static bool amdgpu_vpost_needed(struct amdgpu_device *adev) return true; } } - return amdgpu_need_post(adev); + + if (adev->has_hw_reset) { + adev->has_hw_reset = false; + return true; + } + + /* bios scratch used on CIK+ */ + if (adev->asic_type >= CHIP_BONAIRE) + return amdgpu_atombios_scratch_need_asic_init(adev); + + /* check MEM_SIZE for older asics */ + reg = amdgpu_asic_get_config_memsize(adev); + + if ((reg != 0) && (reg != 0xffffffff)) + return false; + + return true; } /** @@ -1951,6 +1946,7 @@ static int amdgpu_sriov_reinit_late(struct amdgpu_device *adev) static enum amd_ip_block_type ip_order[] = { AMD_IP_BLOCK_TYPE_SMC, + AMD_IP_BLOCK_TYPE_PSP, AMD_IP_BLOCK_TYPE_DCE, AMD_IP_BLOCK_TYPE_GFX, AMD_IP_BLOCK_TYPE_SDMA, @@ -2036,12 +2032,17 @@ static int amdgpu_resume(struct amdgpu_device *adev) static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) { - if (adev->is_atom_fw) { - if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) - adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; - } else { - if (amdgpu_atombios_has_gpu_virtualization_table(adev)) - adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + if (amdgpu_sriov_vf(adev)) { + if (adev->is_atom_fw) { + if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) + adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + } else { + if (amdgpu_atombios_has_gpu_virtualization_table(adev)) + adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + } + + if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); } } @@ -2208,10 +2209,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_device_detect_sriov_bios(adev); /* Post card if necessary */ - if (amdgpu_vpost_needed(adev)) { + if (amdgpu_need_post(adev)) { if (!adev->bios) { dev_err(adev->dev, "no vBIOS found\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); r = -EINVAL; goto failed; } @@ -2219,7 +2219,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_atom_asic_init(adev->mode_info.atom_context); if (r) { dev_err(adev->dev, "gpu post error!\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0); goto failed; } } else { @@ -3023,7 +3022,6 @@ out: } } else { dev_err(adev->dev, "asic resume failed (%d).\n", r); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { if (adev->rings[i] && adev->rings[i]->sched.thread) { kthread_unpark(adev->rings[i]->sched.thread); @@ -3037,7 +3035,6 @@ out: if (r) { /* bad news, how to tell it to userspace ? 
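The amdgpu_wb_get()/amdgpu_wb_free() pair above is an encode/decode fix: _get hands out the slot index encoded as a dword offset (offset << 3, eight dwords per writeback slot), so _free must decode with wb >> 3 before clearing the bitmap bit. The old code cleared bit `wb` itself, i.e. the encoded value. A tiny standalone model of the round-trip invariant, with hypothetical names:

#include <stdio.h>

#define WB_SHIFT 3	/* one writeback slot == 8 dwords */

static unsigned int wb_encode(unsigned int slot) { return slot << WB_SHIFT; }
static unsigned int wb_decode(unsigned int wb)   { return wb >> WB_SHIFT; }

int main(void)
{
	unsigned int slot = 5;
	unsigned int wb = wb_encode(slot);	/* what _get hands out */

	/* _free must decode before touching the allocation bitmap;
	 * clearing bit `wb` directly would free the wrong slot
	 */
	printf("slot %u -> wb %u -> slot %u\n", slot, wb, wb_decode(wb));
	return wb_decode(wb) == slot ? 0 : 1;
}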
*/ dev_info(adev->dev, "GPU reset failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { dev_info(adev->dev, "GPU reset successed!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c index f4370081f6e6..fe818501c520 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c @@ -332,12 +332,13 @@ int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, adev->gart.pages[p] = pagelist[i]; #endif - if (adev->gart.ptr) { - r = amdgpu_gart_map(adev, offset, pages, dma_addr, flags, - adev->gart.ptr); - if (r) - return r; - } + if (!adev->gart.ptr) + return 0; + + r = amdgpu_gart_map(adev, offset, pages, dma_addr, flags, + adev->gart.ptr); + if (r) + return r; mb(); amdgpu_gart_flush_gpu_tlb(adev, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8b4ed8a98a18..ea25164e7f4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -369,6 +369,9 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, &bo->placement, page_align, !kernel, NULL, acc_size, sg, resv, &amdgpu_ttm_bo_destroy); + if (unlikely(r != 0)) + return r; + bytes_moved = atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved; if (adev->mc.visible_vram_size < adev->mc.real_vram_size && @@ -378,9 +381,6 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev, else amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0); - if (unlikely(r != 0)) - return r; - if (kernel) bo->tbo.priority = 1; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index e5ece1fae149..a98fbbb4739f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -136,7 +136,8 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring) if (ring->funcs->end_use) ring->funcs->end_use(ring); - amdgpu_ring_lru_touch(ring->adev, ring); + if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) + amdgpu_ring_lru_touch(ring->adev, ring); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 51eacefadea1..1f036af85ba6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -909,7 +909,8 @@ int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem) placement.busy_placement = &placements; placements.fpfn = 0; placements.lpfn = adev->mc.gart_size >> PAGE_SHIFT; - placements.flags = bo->mem.placement | TTM_PL_FLAG_TT; + placements.flags = (bo->mem.placement & ~TTM_PL_MASK_MEM) | + TTM_PL_FLAG_TT; r = ttm_bo_mem_space(bo, &placement, &tmp, true, false); if (unlikely(r)) @@ -1192,9 +1193,6 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, unsigned long num_pages = bo->mem.num_pages; struct drm_mm_node *node = bo->mem.mm_node; - if (bo->mem.start != AMDGPU_BO_INVALID_OFFSET) - return ttm_bo_eviction_valuable(bo, place); - switch (bo->mem.mem_type) { case TTM_PL_TT: return true; @@ -1209,7 +1207,7 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, num_pages -= node->size; ++node; } - break; + return false; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index b46280c1279f..2918de2f39ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ 
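The amdgpu_ttm_bind() change above is a flag-hygiene fix: bo->mem.placement still carries the buffer's current memory-type flag, so OR-ing TTM_PL_FLAG_TT on top without first clearing TTM_PL_MASK_MEM yields a placement naming two memory types at once. A standalone sketch of that discipline with illustrative bit values (the real definitions live in the TTM headers):

#include <assert.h>
#include <stdint.h>

/* illustrative stand-ins for the TTM placement bits */
#define PL_FLAG_SYSTEM	(1u << 0)
#define PL_FLAG_TT	(1u << 1)
#define PL_FLAG_VRAM	(1u << 2)
#define PL_MASK_MEM	(PL_FLAG_SYSTEM | PL_FLAG_TT | PL_FLAG_VRAM)
#define PL_FLAG_CACHED	(1u << 16)	/* caching bits must survive */

static uint32_t retarget_to_tt(uint32_t placement)
{
	/* clear the old memory type, keep everything else, set TT */
	return (placement & ~PL_MASK_MEM) | PL_FLAG_TT;
}

int main(void)
{
	uint32_t p = PL_FLAG_VRAM | PL_FLAG_CACHED;

	p = retarget_to_tt(p);
	assert(!(p & PL_FLAG_VRAM));	/* old memory type really gone */
	assert(p & PL_FLAG_TT);
	assert(p & PL_FLAG_CACHED);	/* caching attribute preserved */
	return 0;
}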
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -648,7 +648,7 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx) uint32_t allocated = 0; uint32_t tmp, handle = 0; uint32_t *size = &tmp; - int i, r, idx = 0; + int i, r = 0, idx = 0; p->job->vm = NULL; ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c index 746b81339835..7f7097931c6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c @@ -31,7 +31,12 @@ void amdgpu_vf_error_put(struct amdgpu_device *adev, uint64_t error_data) { int index; - uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); + uint16_t error_code; + + if (!amdgpu_sriov_vf(adev)) + return; + + error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); mutex_lock(&adev->virt.vf_errors.lock); index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index e97f80f86005..6738df836a70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -114,18 +114,19 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev) uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) { signed long r; + unsigned long flags; uint32_t val, seq; struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; BUG_ON(!ring->funcs->emit_rreg); - spin_lock(&kiq->ring_lock); + spin_lock_irqsave(&kiq->ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_rreg(ring, reg); amdgpu_fence_emit_polling(ring, &seq); amdgpu_ring_commit(ring); - spin_unlock(&kiq->ring_lock); + spin_unlock_irqrestore(&kiq->ring_lock, flags); r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); if (r < 1) { @@ -140,18 +141,19 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { signed long r; + unsigned long flags; uint32_t seq; struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; BUG_ON(!ring->funcs->emit_wreg); - spin_lock(&kiq->ring_lock); + spin_lock_irqsave(&kiq->ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); amdgpu_fence_emit_polling(ring, &seq); amdgpu_ring_commit(ring); - spin_unlock(&kiq->ring_lock); + spin_unlock_irqrestore(&kiq->ring_lock, flags); r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); if (r < 1) @@ -328,9 +330,11 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) sizeof(amdgim_vf2pf_info)); AMDGPU_FW_VRAM_VF2PF_READ(adev, driver_version, &str); +#ifdef MODULE if (THIS_MODULE->version != NULL) strcpy(str, THIS_MODULE->version); else +#endif strcpy(str, "N/A"); AMDGPU_FW_VRAM_VF2PF_WRITE(adev, driver_cert, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 010d14195a5e..c8c26f21993c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1244,7 +1244,7 @@ static void amdgpu_vm_invalidate_level(struct amdgpu_vm *vm, int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - int r; + int r = 0; spin_lock(&vm->status_lock); while (!list_empty(&vm->relocated)) { diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c index 
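The amdgpu_virt.c hunks above move the KIQ register accessors from spin_lock() to spin_lock_irqsave(): a plain spin_lock held while an interrupt handler on the same CPU tries to take the same lock deadlocks. A minimal kernel-context sketch of the pattern, with the ring submission elided:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(ring_lock);

static void kiq_submit(void)
{
	unsigned long flags;

	/* save and disable local interrupts for the critical section so
	 * an IRQ on this CPU cannot re-enter and spin on ring_lock
	 */
	spin_lock_irqsave(&ring_lock, flags);
	/* ... ring alloc / emit / commit would go here ... */
	spin_unlock_irqrestore(&ring_lock, flags);
}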
dea7c909ca5f..4e20d91d5d50 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c @@ -257,6 +257,9 @@ int psp_v10_0_cmd_submit(struct psp_context *psp, unsigned int psp_write_ptr_reg = 0; struct psp_gfx_rb_frame * write_frame = psp->km_ring.ring_mem; struct psp_ring *ring = &psp->km_ring; + struct psp_gfx_rb_frame *ring_buffer_start = ring->ring_mem; + struct psp_gfx_rb_frame *ring_buffer_end = ring_buffer_start + + ring->ring_size / sizeof(struct psp_gfx_rb_frame) - 1; struct amdgpu_device *adev = psp->adev; uint32_t ring_size_dw = ring->ring_size / 4; uint32_t rb_frame_size_dw = sizeof(struct psp_gfx_rb_frame) / 4; @@ -266,9 +269,16 @@ int psp_v10_0_cmd_submit(struct psp_context *psp, /* Update KM RB frame pointer to new frame */ if ((psp_write_ptr_reg % ring_size_dw) == 0) - write_frame = ring->ring_mem; + write_frame = ring_buffer_start; else - write_frame = ring->ring_mem + (psp_write_ptr_reg / rb_frame_size_dw); + write_frame = ring_buffer_start + (psp_write_ptr_reg / rb_frame_size_dw); + /* Check invalid write_frame ptr address */ + if ((write_frame < ring_buffer_start) || (ring_buffer_end < write_frame)) { + DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; write_frame = %p\n", + ring_buffer_start, ring_buffer_end, write_frame); + DRM_ERROR("write_frame is pointing to address out of bounds\n"); + return -EINVAL; + } /* Initialize KM RB frame */ memset(write_frame, 0, sizeof(struct psp_gfx_rb_frame)); diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c index cee5c396b277..c7bcfe8e286c 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c @@ -367,6 +367,9 @@ int psp_v3_1_cmd_submit(struct psp_context *psp, unsigned int psp_write_ptr_reg = 0; struct psp_gfx_rb_frame * write_frame = psp->km_ring.ring_mem; struct psp_ring *ring = &psp->km_ring; + struct psp_gfx_rb_frame *ring_buffer_start = ring->ring_mem; + struct psp_gfx_rb_frame *ring_buffer_end = ring_buffer_start + + ring->ring_size / sizeof(struct psp_gfx_rb_frame) - 1; struct amdgpu_device *adev = psp->adev; uint32_t ring_size_dw = ring->ring_size / 4; uint32_t rb_frame_size_dw = sizeof(struct psp_gfx_rb_frame) / 4; @@ -378,9 +381,16 @@ int psp_v3_1_cmd_submit(struct psp_context *psp, /* write_frame ptr increments by size of rb_frame in bytes */ /* psp_write_ptr_reg increments by size of rb_frame in DWORDs */ if ((psp_write_ptr_reg % ring_size_dw) == 0) - write_frame = ring->ring_mem; + write_frame = ring_buffer_start; else - write_frame = ring->ring_mem + (psp_write_ptr_reg / rb_frame_size_dw); + write_frame = ring_buffer_start + (psp_write_ptr_reg / rb_frame_size_dw); + /* Check invalid write_frame ptr address */ + if ((write_frame < ring_buffer_start) || (ring_buffer_end < write_frame)) { + DRM_ERROR("ring_buffer_start = %p; ring_buffer_end = %p; write_frame = %p\n", + ring_buffer_start, ring_buffer_end, write_frame); + DRM_ERROR("write_frame is pointing to address out of bounds\n"); + return -EINVAL; + } /* Initialize KM RB frame */ memset(write_frame, 0, sizeof(struct psp_gfx_rb_frame)); diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index 71299c67c517..2581543b35a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -565,11 +565,7 @@ static int uvd_v6_0_suspend(void *handle) if (r) return r; - /* Skip this for APU for now */ - if (!(adev->flags & AMD_IS_APU)) - r = amdgpu_uvd_suspend(adev); - - return r; + 
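psp_v10_0_cmd_submit() and psp_v3_1_cmd_submit() gain the same guard above: derive the first and last valid frame addresses from ring_mem and ring_size, then refuse a write pointer outside that range before memset()ing through it. A standalone model of the pointer arithmetic (the frame layout and sizes are illustrative):

#include <stdint.h>
#include <stdio.h>

struct frame { uint32_t data[16]; };	/* stand-in for psp_gfx_rb_frame */

static int check_write_frame(struct frame *ring_mem, uint32_t ring_size,
			     struct frame *write_frame)
{
	struct frame *start = ring_mem;
	struct frame *end = start + ring_size / sizeof(struct frame) - 1;

	if (write_frame < start || end < write_frame) {
		fprintf(stderr, "write_frame outside [%p, %p]\n",
			(void *)start, (void *)end);
		return -1;	/* the driver returns -EINVAL */
	}
	return 0;
}

int main(void)
{
	struct frame ring[8];

	printf("in ring:  %d\n", check_write_frame(ring, sizeof(ring), &ring[3]));
	printf("past end: %d\n", check_write_frame(ring, sizeof(ring), &ring[8]));
	return 0;
}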
return amdgpu_uvd_suspend(adev); } static int uvd_v6_0_resume(void *handle) @@ -577,12 +573,10 @@ static int uvd_v6_0_resume(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - /* Skip this for APU for now */ - if (!(adev->flags & AMD_IS_APU)) { - r = amdgpu_uvd_resume(adev); - if (r) - return r; - } + r = amdgpu_uvd_resume(adev); + if (r) + return r; + return uvd_v6_0_hw_init(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c index b8ed8faf2003..6634545060fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c @@ -592,11 +592,7 @@ static int uvd_v7_0_suspend(void *handle) if (r) return r; - /* Skip this for APU for now */ - if (!(adev->flags & AMD_IS_APU)) - r = amdgpu_uvd_suspend(adev); - - return r; + return amdgpu_uvd_suspend(adev); } static int uvd_v7_0_resume(void *handle) @@ -604,12 +600,10 @@ static int uvd_v7_0_resume(void *handle) int r; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - /* Skip this for APU for now */ - if (!(adev->flags & AMD_IS_APU)) { - r = amdgpu_uvd_resume(adev); - if (r) - return r; - } + r = amdgpu_uvd_resume(adev); + if (r) + return r; + return uvd_v7_0_hw_init(adev); } diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index e13c67c8d2c0..bc5a2945bd2b 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -4,6 +4,6 @@ config HSA_AMD tristate "HSA kernel driver for AMD GPU devices" - depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64 + depends on DRM_AMDGPU && AMD_IOMMU_V2 && X86_64 help Enable this if you want to use HSA features on AMD GPU devices. diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 211fc48697fa..3d5ccb3755d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -36,6 +36,7 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ return (pasid != 0) && (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); } @@ -46,6 +47,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, unsigned int pasid; const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; + uint32_t context_id = ihre->data & 0xfffffff; pasid = (ihre->ring_id & 0xffff0000) >> 16; @@ -53,9 +55,11 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, 0, 0); + kfd_signal_event_interrupt(pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) + kfd_signal_event_interrupt(pasid, context_id, 28); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8); + kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) kfd_signal_hw_exception_event(pasid); } diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index 79a16d24c1b8..109298b9d507 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -32,9 +32,10 @@ struct cik_ih_ring_entry { uint32_t reserved; }; -#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 +#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 +#define CIK_INTSRC_SDMA_TRAP 0xE0 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0ef82b229754..505d39156acd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -450,8 +450,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, return -EINVAL; } - mutex_lock(kfd_get_dbgmgr_mutex()); mutex_lock(&p->mutex); + mutex_lock(kfd_get_dbgmgr_mutex()); /* * make sure that we have pdd, if this the first queue created for @@ -479,8 +479,8 @@ static int kfd_ioctl_dbg_register(struct file *filep, } out: - mutex_unlock(&p->mutex); mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_unlock(&p->mutex); return status; } @@ -835,15 +835,12 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; - enum kfd_event_wait_result wait_result; int err; err = kfd_wait_on_events(p, args->num_events, (void __user *)args->events_ptr, (args->wait_for_all != 0), - args->timeout, &wait_result); - - args->wait_result = wait_result; + args->timeout, &args->wait_result); return err; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 46049f005b02..621a3b53a038 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -403,7 +403,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry) && enqueue_ih_ring_entry(kfd, ih_ring_entry)) - schedule_work(&kfd->interrupt_work); + queue_work(kfd->ih_wq, &kfd->interrupt_work); spin_unlock(&kfd->interrupt_lock); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index da3b74315acf..e202921c150e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ 
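The kfd_ioctl_dbg_register() hunk above swaps the acquisition order so p->mutex is always taken before the dbgmgr mutex and released in reverse; two paths taking the same pair of locks in opposite orders is the classic ABBA deadlock. A kernel-style sketch of the discipline, with hypothetical lock names standing in for the real ones:

#include <linux/mutex.h>

static DEFINE_MUTEX(process_mutex);	/* plays the role of p->mutex */
static DEFINE_MUTEX(dbgmgr_mutex);	/* plays kfd_get_dbgmgr_mutex() */

static void dbg_register_example(void)
{
	/* every path must acquire in this order ... */
	mutex_lock(&process_mutex);
	mutex_lock(&dbgmgr_mutex);

	/* ... critical section ... */

	/* ... and release in strict reverse order */
	mutex_unlock(&dbgmgr_mutex);
	mutex_unlock(&process_mutex);
}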
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -389,12 +389,11 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) if (sched_policy != KFD_SCHED_POLICY_NO_HWS) { retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (retval != 0) { + if (retval) { pr_err("unmap queue failed\n"); goto out_unlock; } - } else if (sched_policy == KFD_SCHED_POLICY_NO_HWS && - prev_active && + } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA)) { retval = mqd->destroy_mqd(mqd, q->mqd, @@ -408,24 +407,25 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = mqd->update_mqd(mqd, q->mqd, &q->properties); - if (sched_policy != KFD_SCHED_POLICY_NO_HWS) - retval = map_queues_cpsch(dqm); - else if (sched_policy == KFD_SCHED_POLICY_NO_HWS && - q->properties.is_active && - (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) - retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, - &q->properties, q->process->mm); - /* - * check active state vs. the previous state - * and modify counter accordingly + * check active state vs. the previous state and modify + * counter accordingly. map_queues_cpsch uses the + * dqm->queue_count to determine whether a new runlist must be + * uploaded. */ if (q->properties.is_active && !prev_active) dqm->queue_count++; else if (!q->properties.is_active && prev_active) dqm->queue_count--; + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || + q->properties.type == KFD_QUEUE_TYPE_SDMA)) + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, + &q->properties, q->process->mm); + out_unlock: mutex_unlock(&dqm->lock); return retval; @@ -467,7 +467,7 @@ static int register_process(struct device_queue_manager *dqm, mutex_lock(&dqm->lock); list_add(&n->list, &dqm->queues); - retval = dqm->ops_asic_specific.register_process(dqm, qpd); + retval = dqm->asic_ops.update_qpd(dqm, qpd); dqm->processes_count++; @@ -629,7 +629,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); - dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -696,8 +696,6 @@ static int set_sched_resources(struct device_queue_manager *dqm) static int initialize_cpsch(struct device_queue_manager *dqm) { - int retval; - pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); mutex_init(&dqm->lock); @@ -706,11 +704,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->sdma_queue_count = 0; dqm->active_runlist = false; dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; - retval = dqm->ops_asic_specific.initialize(dqm); - if (retval) - mutex_destroy(&dqm->lock); - return retval; + return 0; } static int start_cpsch(struct device_queue_manager *dqm) @@ -835,7 +830,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval != 0) + if (retval) goto out; q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; @@ -850,7 +845,7 @@ static int create_queue_cpsch(struct 
device_queue_manager *dqm, struct queue *q, goto out; } - dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd); + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -1095,7 +1090,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, qpd->sh_mem_ape1_limit = limit >> 16; } - retval = dqm->ops_asic_specific.set_cache_memory_policy( + retval = dqm->asic_ops.set_cache_memory_policy( dqm, qpd, default_policy, @@ -1270,11 +1265,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) switch (dev->device_info->asic_family) { case CHIP_CARRIZO: - device_queue_manager_init_vi(&dqm->ops_asic_specific); + device_queue_manager_init_vi(&dqm->asic_ops); break; case CHIP_KAVERI: - device_queue_manager_init_cik(&dqm->ops_asic_specific); + device_queue_manager_init_cik(&dqm->asic_ops); break; default: WARN(1, "Unexpected ASIC family %u", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 31c2b1f9d320..5b77cb69f732 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -128,9 +128,8 @@ struct device_queue_manager_ops { }; struct device_queue_manager_asic_ops { - int (*register_process)(struct device_queue_manager *dqm, + int (*update_qpd)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); - int (*initialize)(struct device_queue_manager *dqm); bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, @@ -156,7 +155,7 @@ struct device_queue_manager_asic_ops { struct device_queue_manager { struct device_queue_manager_ops ops; - struct device_queue_manager_asic_ops ops_asic_specific; + struct device_queue_manager_asic_ops asic_ops; struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; struct packet_manager packets; @@ -179,8 +178,10 @@ struct device_queue_manager { bool active_runlist; }; -void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops); -void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops); +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index 72c3cbabc0a7..28e48c90c596 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -32,18 +32,17 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int register_process_cik(struct device_queue_manager *dqm, +static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd); -static int initialize_cpsch_cik(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops) +void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops 
*asic_ops) { - ops->set_cache_memory_policy = set_cache_memory_policy_cik; - ops->register_process = register_process_cik; - ops->initialize = initialize_cpsch_cik; - ops->init_sdma_vm = init_sdma_vm; + asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; + asic_ops->update_qpd = update_qpd_cik; + asic_ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -99,7 +98,7 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, return true; } -static int register_process_cik(struct device_queue_manager *dqm, +static int update_qpd_cik(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -148,8 +147,3 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } - -static int initialize_cpsch_cik(struct device_queue_manager *dqm) -{ - return 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index 40e9ddd096cd..2fbce57a2f21 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -33,18 +33,17 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int register_process_vi(struct device_queue_manager *dqm, +static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd); -static int initialize_cpsch_vi(struct device_queue_manager *dqm); static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops) +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) { - ops->set_cache_memory_policy = set_cache_memory_policy_vi; - ops->register_process = register_process_vi; - ops->initialize = initialize_cpsch_vi; - ops->init_sdma_vm = init_sdma_vm; + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; + asic_ops->update_qpd = update_qpd_vi; + asic_ops->init_sdma_vm = init_sdma_vm; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) @@ -104,7 +103,7 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, return true; } -static int register_process_vi(struct device_queue_manager *dqm, +static int update_qpd_vi(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct kfd_process_device *pdd; @@ -160,8 +159,3 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, q->properties.sdma_vm_addr = value; } - -static int initialize_cpsch_vi(struct device_queue_manager *dqm) -{ - return 0; -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 944abfad39c1..cb92d4b72400 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -24,8 +24,8 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> -#include <linux/mm.h> #include <linux/mman.h> #include <linux/memory.h> #include "kfd_priv.h" @@ -33,185 +33,89 @@ #include <linux/device.h> /* - * A task can only be on a single wait_queue at a time, but we need to support - * waiting on multiple events (any/all). 
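The device_queue_manager changes around this point reduce the per-ASIC interface to a filled-in ops table: device_queue_manager_init_cik()/_vi() populate update_qpd, init_sdma_vm and set_cache_memory_policy, the do-nothing initialize() hook is deleted outright, and device_queue_manager_init() picks the table by asic_family. A reduced standalone sketch of that dispatch shape (the types are stand-ins for the kfd ones):

#include <stdio.h>

struct dqm;	/* opaque stand-ins for the kfd structures */
struct qpd;

struct asic_ops {
	int (*update_qpd)(struct dqm *dqm, struct qpd *qpd);
	void (*init_sdma_vm)(struct dqm *dqm, struct qpd *qpd);
};

static int update_qpd_cik(struct dqm *dqm, struct qpd *qpd)
{
	(void)dqm; (void)qpd;
	printf("CIK qpd update\n");
	return 0;
}

static void init_sdma_vm_cik(struct dqm *dqm, struct qpd *qpd)
{
	(void)dqm; (void)qpd;
}

static void init_cik(struct asic_ops *ops)
{
	ops->update_qpd = update_qpd_cik;
	ops->init_sdma_vm = init_sdma_vm_cik;
}

int main(void)
{
	struct asic_ops ops;

	init_cik(&ops);		/* selected by asic_family in the driver */
	return ops.update_qpd(NULL, NULL);
}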
- * Instead of each event simply having a wait_queue with sleeping tasks, it - * has a singly-linked list of tasks. - * A thread that wants to sleep creates an array of these, one for each event - * and adds one to each event's waiter chain. + * Wrapper around wait_queue_entry_t */ struct kfd_event_waiter { - struct list_head waiters; - struct task_struct *sleeping_task; - - /* Transitions to true when the event this belongs to is signaled. */ - bool activated; - - /* Event */ - struct kfd_event *event; - uint32_t input_index; + wait_queue_entry_t wait; + struct kfd_event *event; /* Event to wait for */ + bool activated; /* Becomes true when event is signaled */ }; /* - * Over-complicated pooled allocator for event notification slots. - * * Each signal event needs a 64-bit signal slot where the signaler will write - * a 1 before sending an interrupt.l (This is needed because some interrupts + * a 1 before sending an interrupt. (This is needed because some interrupts * do not contain enough spare data bits to identify an event.) - * We get whole pages from vmalloc and map them to the process VA. - * Individual signal events are then allocated a slot in a page. + * We get whole pages and map them to the process VA. + * Individual signal events use their event_id as slot index. */ - -struct signal_page { - struct list_head event_pages; /* kfd_process.signal_event_pages */ +struct kfd_signal_page { uint64_t *kernel_address; uint64_t __user *user_address; - uint32_t page_index; /* Index into the mmap aperture. */ - unsigned int free_slots; - unsigned long used_slot_bitmap[0]; }; -#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT -#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE) -#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1) -#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \ - SLOT_BITMAP_SIZE * sizeof(long)) - -/* - * For signal events, the event ID is used as the interrupt user data. - * For SQ s_sendmsg interrupts, this is limited to 8 bits. 
- */ - -#define INTERRUPT_DATA_BITS 8 -#define SIGNAL_EVENT_ID_SLOT_SHIFT 0 -static uint64_t *page_slots(struct signal_page *page) +static uint64_t *page_slots(struct kfd_signal_page *page) { return page->kernel_address; } -static bool allocate_free_slot(struct kfd_process *process, - struct signal_page **out_page, - unsigned int *out_slot_index) -{ - struct signal_page *page; - - list_for_each_entry(page, &process->signal_event_pages, event_pages) { - if (page->free_slots > 0) { - unsigned int slot = - find_first_zero_bit(page->used_slot_bitmap, - SLOTS_PER_PAGE); - - __set_bit(slot, page->used_slot_bitmap); - page->free_slots--; - - page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT; - - *out_page = page; - *out_slot_index = slot; - - pr_debug("Allocated event signal slot in page %p, slot %d\n", - page, slot); - - return true; - } - } - - pr_debug("No free event signal slots were found for process %p\n", - process); - - return false; -} - -#define list_tail_entry(head, type, member) \ - list_entry((head)->prev, type, member) - -static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p) +static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) { void *backing_store; - struct signal_page *page; + struct kfd_signal_page *page; - page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL); + page = kzalloc(sizeof(*page), GFP_KERNEL); if (!page) - goto fail_alloc_signal_page; + return NULL; - page->free_slots = SLOTS_PER_PAGE; - - backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + backing_store = (void *) __get_free_pages(GFP_KERNEL, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); if (!backing_store) goto fail_alloc_signal_store; - /* prevent user-mode info leaks */ + /* Initialize all events to unsignaled */ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); + KFD_SIGNAL_EVENT_LIMIT * 8); page->kernel_address = backing_store; - - if (list_empty(&p->signal_event_pages)) - page->page_index = 0; - else - page->page_index = list_tail_entry(&p->signal_event_pages, - struct signal_page, - event_pages)->page_index + 1; - pr_debug("Allocated new event signal page at %p, for process %p\n", page, p); - pr_debug("Page index is %d\n", page->page_index); - list_add(&page->event_pages, &p->signal_event_pages); - - return true; + return page; fail_alloc_signal_store: kfree(page); -fail_alloc_signal_page: - return false; + return NULL; } -static bool allocate_event_notification_slot(struct file *devkfd, - struct kfd_process *p, - struct signal_page **page, - unsigned int *signal_slot_index) +static int allocate_event_notification_slot(struct kfd_process *p, + struct kfd_event *ev) { - bool ret; + int id; - ret = allocate_free_slot(p, page, signal_slot_index); - if (!ret) { - ret = allocate_signal_page(devkfd, p); - if (ret) - ret = allocate_free_slot(p, page, signal_slot_index); + if (!p->signal_page) { + p->signal_page = allocate_signal_page(p); + if (!p->signal_page) + return -ENOMEM; + /* Oldest user mode expects 256 event slots */ + p->signal_mapped_size = 256*8; } - return ret; -} - -/* Assumes that the process's event_mutex is locked. */ -static void release_event_notification_slot(struct signal_page *page, - size_t slot_index) -{ - __clear_bit(slot_index, page->used_slot_bitmap); - page->free_slots++; - - /* We don't free signal pages, they are retained by the process - * and reused until it exits. 
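The new allocator being introduced here ties event IDs directly to signal-slot indices: the first signal event allocates the single per-process page, and idr_alloc() is bounded by signal_mapped_size / 8, so an event can only receive an ID whose slot user mode has actually mapped. A kernel-style sketch of that bounded allocation, reusing the names from this patch:

#include <linux/idr.h>
#include <linux/slab.h>

/* One 64-bit signal slot per event; the IDR ID doubles as the slot
 * index, so IDs are capped at the number of slots user space mapped.
 */
static int alloc_event_slot(struct idr *event_idr, void *ev,
			    unsigned int mapped_bytes)
{
	/* negative errno on failure, otherwise ID == slot index */
	return idr_alloc(event_idr, ev, 0, mapped_bytes / 8, GFP_KERNEL);
}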
- */ -} - -static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, - unsigned int page_index) -{ - struct signal_page *page; - /* - * This is safe because we don't delete signal pages until the - * process exits. + * Compatibility with old user mode: Only use signal slots + * user mode has mapped, may be less than + * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase + * of the event limit without breaking user mode. */ - list_for_each_entry(page, &p->signal_event_pages, event_pages) - if (page->page_index == page_index) - return page; + id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, + GFP_KERNEL); + if (id < 0) + return id; - return NULL; + ev->event_id = id; + page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT; + + return 0; } /* @@ -220,99 +124,81 @@ static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p, */ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id) { - struct kfd_event *ev; - - hash_for_each_possible(p->events, ev, events, id) - if (ev->event_id == id) - return ev; - - return NULL; + return idr_find(&p->event_idr, id); } -static u32 make_signal_event_id(struct signal_page *page, - unsigned int signal_slot_index) -{ - return page->page_index | - (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT); -} - -/* - * Produce a kfd event id for a nonsignal event. - * These are arbitrary numbers, so we do a sequential search through - * the hash table for an unused number. +/** + * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID + * @p: Pointer to struct kfd_process + * @id: ID to look up + * @bits: Number of valid bits in @id + * + * Finds the first signaled event with a matching partial ID. If no + * matching signaled event is found, returns NULL. In that case the + * caller should assume that the partial ID is invalid and do an + * exhaustive search of all siglaned events. + * + * If multiple events with the same partial ID signal at the same + * time, they will be found one interrupt at a time, not necessarily + * in the same order the interrupts occurred. As long as the number of + * interrupts is correct, all signaled events will be seen by the + * driver. */ -static u32 make_nonsignal_event_id(struct kfd_process *p) +static struct kfd_event *lookup_signaled_event_by_partial_id( + struct kfd_process *p, uint32_t id, uint32_t bits) { - u32 id; - - for (id = p->next_nonsignal_event_id; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); - id++) - ; + struct kfd_event *ev; - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { + if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT) + return NULL; - /* - * What if id == LAST_NONSIGNAL_EVENT_ID - 1? - * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so - * the first loop fails immediately and we proceed with the - * wraparound loop below. - */ - p->next_nonsignal_event_id = id + 1; + /* Fast path for the common case that @id is not a partial ID + * and we only need a single lookup. + */ + if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + return NULL; - return id; + return idr_find(&p->event_idr, id); } - for (id = KFD_FIRST_NONSIGNAL_EVENT_ID; - id < KFD_LAST_NONSIGNAL_EVENT_ID && - lookup_event_by_id(p, id); - id++) - ; - + /* General case for partial IDs: Iterate over all matching IDs + * and find the first one that has signaled. 
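lookup_signaled_event_by_partial_id() above treats the interrupt payload as the low `bits` bits of an event ID, so every candidate ID shares those low bits and the candidates differ by multiples of 1 << bits. A standalone model of that candidate walk:

#include <stdint.h>
#include <stdio.h>

#define EVENT_LIMIT 4096u	/* illustrative, plays KFD_SIGNAL_EVENT_LIMIT */

/* visit every ID whose low `bits` bits equal `partial` */
static void walk_candidates(uint32_t partial, uint32_t bits)
{
	uint32_t id;

	for (id = partial; id < EVENT_LIMIT; id += 1u << bits)
		printf("candidate id %u\n", id);
}

int main(void)
{
	walk_candidates(5, 8);	/* prints 5, 261, 517, ... */
	return 0;
}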
+ */ + for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) { + if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT) + continue; - if (id < KFD_LAST_NONSIGNAL_EVENT_ID) { - p->next_nonsignal_event_id = id + 1; - return id; + ev = idr_find(&p->event_idr, id); } - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; - return 0; -} - -static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p, - struct signal_page *page, - unsigned int signal_slot) -{ - return lookup_event_by_id(p, make_signal_event_id(page, signal_slot)); + return ev; } static int create_signal_event(struct file *devkfd, struct kfd_process *p, struct kfd_event *ev) { - if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { + int ret; + + if (p->signal_mapped_size && + p->signal_event_count == p->signal_mapped_size / 8) { if (!p->signal_event_limit_reached) { pr_warn("Signal event wasn't created because limit was reached\n"); p->signal_event_limit_reached = true; } - return -ENOMEM; + return -ENOSPC; } - if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page, - &ev->signal_slot_index)) { + ret = allocate_event_notification_slot(p, ev); + if (ret) { pr_warn("Signal event wasn't created because out of kernel memory\n"); - return -ENOMEM; + return ret; } p->signal_event_count++; - ev->user_signal_address = - &ev->signal_page->user_address[ev->signal_slot_index]; - - ev->event_id = make_signal_event_id(ev->signal_page, - ev->signal_slot_index); - + ev->user_signal_address = &p->signal_page->user_address[ev->event_id]; pr_debug("Signal event number %zu created with id %d, address %p\n", p->signal_event_count, ev->event_id, ev->user_signal_address); @@ -320,16 +206,20 @@ static int create_signal_event(struct file *devkfd, return 0; } -/* - * No non-signal events are supported yet. - * We create them as events that never signal. - * Set event calls from user-mode are failed. - */ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) { - ev->event_id = make_nonsignal_event_id(p); - if (ev->event_id == 0) - return -ENOMEM; + /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an + * intentional integer overflow to -1 without a compiler + * warning. idr_alloc treats a negative value as "maximum + * signed integer". + */ + int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, + (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1, + GFP_KERNEL); + + if (id < 0) + return id; + ev->event_id = id; return 0; } @@ -337,50 +227,47 @@ static int create_other_event(struct kfd_process *p, struct kfd_event *ev) void kfd_event_init_process(struct kfd_process *p) { mutex_init(&p->event_mutex); - hash_init(p->events); - INIT_LIST_HEAD(&p->signal_event_pages); - p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_init(&p->event_idr); + p->signal_page = NULL; p->signal_event_count = 0; } static void destroy_event(struct kfd_process *p, struct kfd_event *ev) { - if (ev->signal_page) { - release_event_notification_slot(ev->signal_page, - ev->signal_slot_index); - p->signal_event_count--; - } + struct kfd_event_waiter *waiter; - /* - * Abandon the list of waiters. Individual waiting threads will - * clean up their own data. - */ - list_del(&ev->waiters); + /* Wake up pending waiters. 
They will return failure */ + list_for_each_entry(waiter, &ev->wq.head, wait.entry) + waiter->event = NULL; + wake_up_all(&ev->wq); + + if (ev->type == KFD_EVENT_TYPE_SIGNAL || + ev->type == KFD_EVENT_TYPE_DEBUG) + p->signal_event_count--; - hash_del(&ev->events); + idr_remove(&p->event_idr, ev->event_id); kfree(ev); } static void destroy_events(struct kfd_process *p) { struct kfd_event *ev; - struct hlist_node *tmp; - unsigned int hash_bkt; + uint32_t id; - hash_for_each_safe(p->events, hash_bkt, tmp, ev, events) + idr_for_each_entry(&p->event_idr, ev, id) destroy_event(p, ev); + idr_destroy(&p->event_idr); } /* * We assume that the process is being destroyed and there is no need to * unmap the pages or keep bookkeeping data in order. */ -static void shutdown_signal_pages(struct kfd_process *p) +static void shutdown_signal_page(struct kfd_process *p) { - struct signal_page *page, *tmp; + struct kfd_signal_page *page = p->signal_page; - list_for_each_entry_safe(page, tmp, &p->signal_event_pages, - event_pages) { + if (page) { free_pages((unsigned long)page->kernel_address, get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); kfree(page); @@ -390,7 +277,7 @@ static void shutdown_signal_pages(struct kfd_process *p) void kfd_event_free_process(struct kfd_process *p) { destroy_events(p); - shutdown_signal_pages(p); + shutdown_signal_page(p); } static bool event_can_be_gpu_signaled(const struct kfd_event *ev) @@ -419,7 +306,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, ev->auto_reset = auto_reset; ev->signaled = false; - INIT_LIST_HEAD(&ev->waiters); + init_waitqueue_head(&ev->wq); *event_page_offset = 0; @@ -430,10 +317,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, case KFD_EVENT_TYPE_DEBUG: ret = create_signal_event(devkfd, p, ev); if (!ret) { - *event_page_offset = (ev->signal_page->page_index | - KFD_MMAP_EVENTS_MASK); + *event_page_offset = KFD_MMAP_EVENTS_MASK; *event_page_offset <<= PAGE_SHIFT; - *event_slot_index = ev->signal_slot_index; + *event_slot_index = ev->event_id; } break; default: @@ -442,8 +328,6 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, } if (!ret) { - hash_add(p->events, &ev->events, ev->event_id); - *event_id = ev->event_id; *event_trigger_data = ev->event_id; } else { @@ -477,19 +361,18 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id) static void set_event(struct kfd_event *ev) { struct kfd_event_waiter *waiter; - struct kfd_event_waiter *next; - /* Auto reset if the list is non-empty and we're waking someone. */ - ev->signaled = !ev->auto_reset || list_empty(&ev->waiters); + /* Auto reset if the list is non-empty and we're waking + * someone. waitqueue_active is safe here because we're + * protected by the p->event_mutex, which is also held when + * updating the wait queues in kfd_wait_on_events. + */ + ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); - list_for_each_entry_safe(waiter, next, &ev->waiters, waiters) { + list_for_each_entry(waiter, &ev->wq.head, wait.entry) waiter->activated = true; - /* _init because free_waiters will call list_del */ - list_del_init(&waiter->waiters); - - wake_up_process(waiter->sleeping_task); - } + wake_up_all(&ev->wq); } /* Assumes that p is current. 
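The set_event() rework above derives the post-signal state from waitqueue_active(): an auto-reset event stays signaled only when nobody is waiting, because waking a waiter consumes the signal; as the added comment notes, waitqueue_active() is only safe here because both sides hold p->event_mutex. A compact kernel-style sketch of the rule:

#include <linux/types.h>
#include <linux/wait.h>

struct event {
	bool signaled;
	bool auto_reset;
	wait_queue_head_t wq;	/* waiters park here */
};

/* Caller is assumed to hold the mutex that also guards waiter setup,
 * which is what makes waitqueue_active() race-free in this sketch.
 */
static void event_set(struct event *ev)
{
	ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
	wake_up_all(&ev->wq);
}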
*/ @@ -538,13 +421,7 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id) static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev) { - page_slots(ev->signal_page)[ev->signal_slot_index] = - UNSIGNALED_EVENT_SLOT; -} - -static bool is_slot_signaled(struct signal_page *page, unsigned int index) -{ - return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT; + page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT; } static void set_event_from_interrupt(struct kfd_process *p, @@ -559,7 +436,7 @@ static void set_event_from_interrupt(struct kfd_process *p, void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits) { - struct kfd_event *ev; + struct kfd_event *ev = NULL; /* * Because we are called from arbitrary context (workqueue) as opposed @@ -573,26 +450,46 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, mutex_lock(&p->event_mutex); - if (valid_id_bits >= INTERRUPT_DATA_BITS) { - /* Partial ID is a full ID. */ - ev = lookup_event_by_id(p, partial_id); + if (valid_id_bits) + ev = lookup_signaled_event_by_partial_id(p, partial_id, + valid_id_bits); + if (ev) { set_event_from_interrupt(p, ev); - } else { + } else if (p->signal_page) { /* - * Partial ID is in fact partial. For now we completely - * ignore it, but we could use any bits we did receive to - * search faster. + * Partial ID lookup failed. Assume that the event ID + * in the interrupt payload was invalid and do an + * exhaustive search of signaled events. */ - struct signal_page *page; - unsigned int i; - - list_for_each_entry(page, &p->signal_event_pages, event_pages) - for (i = 0; i < SLOTS_PER_PAGE; i++) - if (is_slot_signaled(page, i)) { - ev = lookup_event_by_page_slot(p, - page, i); + uint64_t *slots = page_slots(p->signal_page); + uint32_t id; + + if (valid_id_bits) + pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", + partial_id, valid_id_bits); + + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { + /* With relatively few events, it's faster to + * iterate over the event IDR + */ + idr_for_each_entry(&p->event_idr, ev, id) { + if (id >= KFD_SIGNAL_EVENT_LIMIT) + break; + + if (slots[id] != UNSIGNALED_EVENT_SLOT) + set_event_from_interrupt(p, ev); + } + } else { + /* With relatively many events, it's faster to + * iterate over the signal slots and lookup + * only signaled events from the IDR. 
+ */ + for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++) + if (slots[id] != UNSIGNALED_EVENT_SLOT) { + ev = lookup_event_by_id(p, id); set_event_from_interrupt(p, ev); } + } } mutex_unlock(&p->event_mutex); @@ -609,18 +506,16 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) GFP_KERNEL); for (i = 0; (event_waiters) && (i < num_events) ; i++) { - INIT_LIST_HEAD(&event_waiters[i].waiters); - event_waiters[i].sleeping_task = current; + init_wait(&event_waiters[i].wait); event_waiters[i].activated = false; } return event_waiters; } -static int init_event_waiter(struct kfd_process *p, +static int init_event_waiter_get_status(struct kfd_process *p, struct kfd_event_waiter *waiter, - uint32_t event_id, - uint32_t input_index) + uint32_t event_id) { struct kfd_event *ev = lookup_event_by_id(p, event_id); @@ -628,38 +523,60 @@ static int init_event_waiter(struct kfd_process *p, return -EINVAL; waiter->event = ev; - waiter->input_index = input_index; waiter->activated = ev->signaled; ev->signaled = ev->signaled && !ev->auto_reset; - list_add(&waiter->waiters, &ev->waiters); - return 0; } -static bool test_event_condition(bool all, uint32_t num_events, +static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter) +{ + struct kfd_event *ev = waiter->event; + + /* Only add to the wait list if we actually need to + * wait on this event. + */ + if (!waiter->activated) + add_wait_queue(&ev->wq, &waiter->wait); +} + +/* test_event_condition - Test condition of events being waited for + * @all: Return completion only if all events have signaled + * @num_events: Number of events to wait for + * @event_waiters: Array of event waiters, one per event + * + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all) + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of + * the events have been destroyed. + */ +static uint32_t test_event_condition(bool all, uint32_t num_events, struct kfd_event_waiter *event_waiters) { uint32_t i; uint32_t activated_count = 0; for (i = 0; i < num_events; i++) { + if (!event_waiters[i].event) + return KFD_IOC_WAIT_RESULT_FAIL; + if (event_waiters[i].activated) { if (!all) - return true; + return KFD_IOC_WAIT_RESULT_COMPLETE; activated_count++; } } - return activated_count == num_events; + return activated_count == num_events ? + KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT; } /* * Copy event specific data, if defined. 
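copy_to_user() returns the number of bytes it could not copy, and the rewritten copy_signaled_event_data() below maps any nonzero remainder to -EFAULT instead of the old bool. A stand-alone illustration of that convention (struct foo is a placeholder payload):

struct foo {
	u32 a;
};

static int copy_out(struct foo __user *dst, const struct foo *src)
{
	/* copy_to_user() returns the number of bytes NOT copied */
	if (copy_to_user(dst, src, sizeof(*src)))
		return -EFAULT;

	return 0;
}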
* Currently only memory exception events have additional data to copy to user */ -static bool copy_signaled_event_data(uint32_t num_events, +static int copy_signaled_event_data(uint32_t num_events, struct kfd_event_waiter *event_waiters, struct kfd_event_data __user *data) { @@ -673,15 +590,15 @@ static bool copy_signaled_event_data(uint32_t num_events, waiter = &event_waiters[i]; event = waiter->event; if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) { - dst = &data[waiter->input_index].memory_exception_data; + dst = &data[i].memory_exception_data; src = &event->memory_exception_data; if (copy_to_user(dst, src, sizeof(struct kfd_hsa_memory_exception_data))) - return false; + return -EFAULT; } } - return true; + return 0; } @@ -710,7 +627,9 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) uint32_t i; for (i = 0; i < num_events; i++) - list_del(&waiters[i].waiters); + if (waiters[i].event) + remove_wait_queue(&waiters[i].event->wq, + &waiters[i].wait); kfree(waiters); } @@ -718,38 +637,56 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters) int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - enum kfd_event_wait_result *wait_result) + uint32_t *wait_result) { struct kfd_event_data __user *events = (struct kfd_event_data __user *) data; uint32_t i; int ret = 0; + struct kfd_event_waiter *event_waiters = NULL; long timeout = user_timeout_to_jiffies(user_timeout_ms); - mutex_lock(&p->event_mutex); - event_waiters = alloc_event_waiters(num_events); if (!event_waiters) { ret = -ENOMEM; - goto fail; + goto out; } + mutex_lock(&p->event_mutex); + for (i = 0; i < num_events; i++) { struct kfd_event_data event_data; if (copy_from_user(&event_data, &events[i], sizeof(struct kfd_event_data))) { ret = -EFAULT; - goto fail; + goto out_unlock; } - ret = init_event_waiter(p, &event_waiters[i], - event_data.event_id, i); + ret = init_event_waiter_get_status(p, &event_waiters[i], + event_data.event_id); if (ret) - goto fail; + goto out_unlock; } + /* Check condition once. */ + *wait_result = test_event_condition(all, num_events, event_waiters); + if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) { + ret = copy_signaled_event_data(num_events, + event_waiters, events); + goto out_unlock; + } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) { + /* This should not happen. Events shouldn't be + * destroyed while we're holding the event_mutex + */ + goto out_unlock; + } + + /* Add to wait lists if we need to wait. */ + for (i = 0; i < num_events; i++) + init_event_waiter_add_to_waitlist(&event_waiters[i]); + mutex_unlock(&p->event_mutex); while (true) { @@ -771,62 +708,66 @@ int kfd_wait_on_events(struct kfd_process *p, break; } - if (test_event_condition(all, num_events, event_waiters)) { - if (copy_signaled_event_data(num_events, - event_waiters, events)) - *wait_result = KFD_WAIT_COMPLETE; - else - *wait_result = KFD_WAIT_ERROR; + /* Set task state to interruptible sleep before + * checking wake-up conditions. A concurrent wake-up + * will put the task back into runnable state. In that + * case schedule_timeout will not put the task to + * sleep and we'll get a chance to re-check the + * updated conditions almost immediately. Otherwise, + * this race condition would lead to a soft hang or a + * very long sleep. 
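The comment above describes the canonical open-coded wait loop: the task state must be changed before the wake-up condition is re-checked, so a wake-up racing with the check merely turns schedule_timeout() into a near no-op instead of being lost. The bare pattern, with condition_met(), timeout_ms and ret as placeholders:

long timeout = msecs_to_jiffies(timeout_ms);
int ret = 0;

for (;;) {
	set_current_state(TASK_INTERRUPTIBLE);

	if (condition_met())		/* re-check *after* setting state */
		break;

	if (signal_pending(current)) {
		ret = -ERESTARTSYS;	/* let the signal be delivered */
		break;
	}

	if (timeout <= 0)
		break;			/* timed out */

	timeout = schedule_timeout(timeout);
}
__set_current_state(TASK_RUNNING);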
+ */ + set_current_state(TASK_INTERRUPTIBLE); + + *wait_result = test_event_condition(all, num_events, + event_waiters); + if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT) break; - } - if (timeout <= 0) { - *wait_result = KFD_WAIT_TIMEOUT; + if (timeout <= 0) break; - } - timeout = schedule_timeout_interruptible(timeout); + timeout = schedule_timeout(timeout); } __set_current_state(TASK_RUNNING); + /* copy_signaled_event_data may sleep. So this has to happen + * after the task state is set back to RUNNING. + */ + if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) + ret = copy_signaled_event_data(num_events, + event_waiters, events); + mutex_lock(&p->event_mutex); +out_unlock: free_waiters(num_events, event_waiters); mutex_unlock(&p->event_mutex); - - return ret; - -fail: - if (event_waiters) - free_waiters(num_events, event_waiters); - - mutex_unlock(&p->event_mutex); - - *wait_result = KFD_WAIT_ERROR; +out: + if (ret) + *wait_result = KFD_IOC_WAIT_RESULT_FAIL; + else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL) + ret = -EIO; return ret; } int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) { - - unsigned int page_index; unsigned long pfn; - struct signal_page *page; + struct kfd_signal_page *page; + int ret; - /* check required size is logical */ - if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != + /* check required size doesn't exceed the allocated size */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < get_order(vma->vm_end - vma->vm_start)) { pr_err("Event page mmap requested illegal size\n"); return -EINVAL; } - page_index = vma->vm_pgoff; - - page = lookup_signal_page_by_index(p, page_index); + page = p->signal_page; if (!page) { /* Probably KFD bug, but mmap is user-accessible. */ - pr_debug("Signal page could not be found for page_index %u\n", - page_index); + pr_debug("Signal page could not be found\n"); return -EINVAL; } @@ -847,8 +788,12 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) page->user_address = (uint64_t __user *)vma->vm_start; /* mapping the page to user process */ - return remap_pfn_range(vma, vma->vm_start, pfn, + ret = remap_pfn_range(vma, vma->vm_start, pfn, vma->vm_end - vma->vm_start, vma->vm_page_prot); + if (!ret) + p->signal_mapped_size = vma->vm_end - vma->vm_start; + + return ret; } /* @@ -860,12 +805,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, { struct kfd_hsa_memory_exception_data *ev_data; struct kfd_event *ev; - int bkt; + uint32_t id; bool send_signal = true; ev_data = (struct kfd_hsa_memory_exception_data *) event_data; - hash_for_each(p->events, bkt, ev, events) + id = KFD_FIRST_NONSIGNAL_EVENT_ID; + idr_for_each_entry_continue(&p->event_idr, ev, id) if (ev->type == type) { send_signal = false; dev_dbg(kfd_device, @@ -904,14 +850,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, * running so the lookup function returns a locked process. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct mm_struct *mm; if (!p) return; /* Presumably process exited. */ + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. 
+ */ + mm = get_task_mm(p->lead_thread); + if (!mm) { + mutex_unlock(&p->mutex); + return; /* Process is exiting */ + } + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - down_read(&p->mm->mmap_sem); - vma = find_vma(p->mm, address); + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); memory_exception_data.gpu_id = dev->id; memory_exception_data.va = address; @@ -937,7 +893,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, } } - up_read(&p->mm->mmap_sem); + up_read(&mm->mmap_sem); + mmput(mm); mutex_lock(&p->event_mutex); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index 28f6838b1f4c..abca5bfebbff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -27,12 +27,17 @@ #include <linux/hashtable.h> #include <linux/types.h> #include <linux/list.h> +#include <linux/wait.h> #include "kfd_priv.h" #include <uapi/linux/kfd_ioctl.h> -#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U -#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK -#define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX +/* + * IDR supports non-negative integer IDs. Small IDs are used for + * signal events to match their signal slot. Use the upper half of the + * ID space for non-signal events. + */ +#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1) +#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX /* * Written into kfd_signal_slot_t to indicate that the event is not signaled. @@ -46,9 +51,6 @@ struct kfd_event_waiter; struct signal_page; struct kfd_event { - /* All events in process, rooted at kfd_process.events. */ - struct hlist_node events; - u32 event_id; bool signaled; @@ -56,11 +58,9 @@ struct kfd_event { int type; - struct list_head waiters; /* List of kfd_event_waiter by waiters. */ + wait_queue_head_t wq; /* List of event waiters. */ /* Only for signal events. 
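With the ID space split this way, allocation reduces to handing idr_alloc() different [start, end) windows; for signal events the ID doubles as the index into the signal page. A sketch under that assumption (the driver's actual allocation sites are not part of this hunk):

int id;

if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
    ev->type == KFD_EVENT_TYPE_DEBUG)
	/* ID == signal slot index, so it must stay below the slot count */
	id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT,
		       GFP_KERNEL);
else
	/* upper half of the int range; end == 0 means "no upper limit" */
	id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID, 0,
		       GFP_KERNEL);

if (id < 0)
	return id;	/* -ENOMEM, or -ENOSPC when the window is full */

ev->event_id = id;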
*/ - struct signal_page *signal_page; - unsigned int signal_slot_index; uint64_t __user *user_signal_address; /* type specific data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index 70b3a99cffc2..035c351f47c5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -42,26 +42,26 @@ #include <linux/slab.h> #include <linux/device.h> +#include <linux/kfifo.h> #include "kfd_priv.h" -#define KFD_INTERRUPT_RING_SIZE 1024 +#define KFD_IH_NUM_ENTRIES 8192 static void interrupt_wq(struct work_struct *); int kfd_interrupt_init(struct kfd_dev *kfd) { - void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE, - kfd->device_info->ih_ring_entry_size, - GFP_KERNEL); - if (!interrupt_ring) - return -ENOMEM; - - kfd->interrupt_ring = interrupt_ring; - kfd->interrupt_ring_size = - KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size; - atomic_set(&kfd->interrupt_ring_wptr, 0); - atomic_set(&kfd->interrupt_ring_rptr, 0); + int r; + + r = kfifo_alloc(&kfd->ih_fifo, + KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size, + GFP_KERNEL); + if (r) { + dev_err(kfd_chardev(), "Failed to allocate IH fifo\n"); + return r; + } + kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); @@ -92,74 +92,47 @@ void kfd_interrupt_exit(struct kfd_dev *kfd) spin_unlock_irqrestore(&kfd->interrupt_lock, flags); /* - * Flush_scheduled_work ensures that there are no outstanding + * flush_workqueue ensures that there are no outstanding * work-queue items that will access interrupt_ring. New work items * can't be created because we stopped interrupt handling above. */ - flush_scheduled_work(); + flush_workqueue(kfd->ih_wq); - kfree(kfd->interrupt_ring); + kfifo_free(&kfd->ih_fifo); } /* - * This assumes that it can't be called concurrently with itself - * but only with dequeue_ih_ring_entry. + * Assumption: single reader/writer. This function is not re-entrant */ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry) { - unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); - unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); + int count; - if ((rptr - wptr) % kfd->interrupt_ring_size == - kfd->device_info->ih_ring_entry_size) { - /* This is very bad, the system is likely to hang. */ + count = kfifo_in(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); + if (count != kfd->device_info->ih_ring_entry_size) { dev_err_ratelimited(kfd_chardev(), - "Interrupt ring overflow, dropping interrupt.\n"); + "Interrupt ring overflow, dropping interrupt %d\n", + count); return false; } - memcpy(kfd->interrupt_ring + wptr, ih_ring_entry, - kfd->device_info->ih_ring_entry_size); - - wptr = (wptr + kfd->device_info->ih_ring_entry_size) % - kfd->interrupt_ring_size; - smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */ - atomic_set(&kfd->interrupt_ring_wptr, wptr); - return true; } /* - * This assumes that it can't be called concurrently with itself - * but only with enqueue_ih_ring_entry. + * Assumption: single reader/writer. This function is not re-entrant */ static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry) { - /* - * Assume that wait queues have an implicit barrier, i.e. anything that - * happened in the ISR before it queued work is visible.
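kfifo preserves exactly the guarantees the hand-rolled ring relied on: with one producer and one consumer it is lock-free and performs the needed memory barriers internally, which is why the atomics and the smp_wmb()/smp_mb() pairs above can simply be deleted. A self-contained sketch of the byte-oriented kfifo calls used here, with ENTRY_SIZE standing in for ih_ring_entry_size:

#include <linux/kfifo.h>

#define ENTRY_SIZE	16	/* placeholder for ih_ring_entry_size */

static struct kfifo fifo;

static int ring_init(void)
{
	/* kfifo_alloc() rounds the size up to a power of two */
	return kfifo_alloc(&fifo, 8192 * ENTRY_SIZE, GFP_KERNEL);
}

static bool ring_put(const void *entry)	/* single producer (ISR side) */
{
	return kfifo_in(&fifo, entry, ENTRY_SIZE) == ENTRY_SIZE;
}

static bool ring_get(void *entry)	/* single consumer (worker side) */
{
	return kfifo_out(&fifo, entry, ENTRY_SIZE) == ENTRY_SIZE;
}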
- */ - - unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr); - unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr); + int count; - if (rptr == wptr) - return false; - - memcpy(ih_ring_entry, kfd->interrupt_ring + rptr, - kfd->device_info->ih_ring_entry_size); - - rptr = (rptr + kfd->device_info->ih_ring_entry_size) % - kfd->interrupt_ring_size; + count = kfifo_out(&kfd->ih_fifo, ih_ring_entry, + kfd->device_info->ih_ring_entry_size); - /* - * Ensure the rptr write update is not visible until - * memcpy has finished reading. - */ - smp_mb(); - atomic_set(&kfd->interrupt_ring_rptr, rptr); + WARN_ON(count && count != kfd->device_info->ih_ring_entry_size); - return true; + return count == kfd->device_info->ih_ring_entry_size; } static void interrupt_wq(struct work_struct *work) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 44ffd23348fc..4859d263fa2a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -189,12 +189,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_pq_control |= NO_UPDATE_RPTR; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - q->is_active = true; - } + q->queue_percent > 0); return 0; } @@ -215,24 +212,17 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->sdma_rlc_doorbell = q->doorbell_off << - SDMA0_RLC0_DOORBELL__OFFSET__SHIFT | - 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT; + m->sdma_rlc_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; m->sdma_rlc_virtual_addr = q->sdma_vm_addr; m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - m->sdma_rlc_rb_cntl |= - 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT; - - q->is_active = true; - } + q->queue_percent > 0); return 0; } @@ -359,19 +349,13 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); - m->cp_hqd_pq_doorbell_control = DOORBELL_EN | - DOORBELL_OFFSET(q->doorbell_off); + m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off); m->cp_hqd_vmid = q->vmid; - m->cp_hqd_active = 0; - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - q->queue_percent > 0) { - m->cp_hqd_active = 1; - q->is_active = true; - } + q->queue_percent > 0); return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 73cbfe186dd2..4ea854f9007b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -163,12 +163,9 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } - q->is_active = false; - if (q->queue_size > 0 && + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && - 
q->queue_percent > 0) { - q->is_active = true; - } + q->queue_percent > 0); return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 7d86ec9790d3..9e4134c5b481 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -31,6 +31,8 @@ #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/kfd_ioctl.h> +#include <linux/idr.h> +#include <linux/kfifo.h> #include <kgd_kfd_interface.h> #include "amd_shared.h" @@ -181,10 +183,8 @@ struct kfd_dev { unsigned int gtt_sa_num_of_chunks; /* Interrupts */ - void *interrupt_ring; - size_t interrupt_ring_size; - atomic_t interrupt_ring_rptr; - atomic_t interrupt_ring_wptr; + struct kfifo ih_fifo; + struct workqueue_struct *ih_wq; struct work_struct interrupt_work; spinlock_t interrupt_lock; @@ -494,7 +494,12 @@ struct kfd_process { */ struct hlist_node kfd_processes; - struct mm_struct *mm; + /* + * Opaque pointer to mm_struct. We don't hold a reference to + * it so it should never be dereferenced from here. This is + * only used for looking up processes by their mm. + */ + void *mm; struct mutex mutex; @@ -502,6 +507,8 @@ struct kfd_process { * In any process, the thread that started main() is the lead * thread and outlives the rest. * It is here because amd_iommu_bind_pasid wants a task_struct. + * It can also be used for safely getting a reference to the + * mm_struct of the process. */ struct task_struct *lead_thread; @@ -522,22 +529,16 @@ struct kfd_process { struct process_queue_manager pqm; - /* The process's queues. */ - size_t queue_array_size; - - /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ - struct kfd_queue **queues; - /*Is the user space process 32 bit?*/ bool is_32bit_user_mode; /* Event-related data */ struct mutex event_mutex; - /* All events in process hashed by ID, linked on kfd_event.events. */ - DECLARE_HASHTABLE(events, 4); - /* struct slot_page_header.event_pages */ - struct list_head signal_event_pages; - u32 next_nonsignal_event_id; + /* Event ID allocator and lookup */ + struct idr event_idr; + /* Event page */ + struct kfd_signal_page *signal_page; + size_t signal_mapped_size; size_t signal_event_count; bool signal_event_limit_reached; }; @@ -721,19 +722,13 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd); extern const struct kfd_event_interrupt_class event_interrupt_class_cik; extern const struct kfd_device_global_init_class device_global_init_class_cik; -enum kfd_event_wait_result { - KFD_WAIT_COMPLETE, - KFD_WAIT_TIMEOUT, - KFD_WAIT_ERROR -}; - void kfd_event_init_process(struct kfd_process *p); void kfd_event_free_process(struct kfd_process *p); int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma); int kfd_wait_on_events(struct kfd_process *p, uint32_t num_events, void __user *data, bool all, uint32_t user_timeout_ms, - enum kfd_event_wait_result *wait_result); + uint32_t *wait_result); void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, uint32_t valid_id_bits); void kfd_signal_iommu_event(struct kfd_dev *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 3ccb3b53216e..1f5ccd28bd41 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -35,13 +35,6 @@ struct mm_struct; #include "kfd_dbgmgr.h" /* - * Initial size for the array of queues. - * The allocated size is doubled each time - * it is exceeded up to MAX_PROCESS_QUEUES. 
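The new comments on kfd_process.mm above encode a rule: the pointer is an opaque key that may be compared but never dereferenced, and anyone who needs the real mm must take a reference through the lead thread. A sketch of the lookup this still permits, modeled on the kfd_processes hashtable (indexed by mm_struct*, per the kfd_process.c header just below); the helper name is hypothetical:

/* caller holds rcu_read_lock() or the matching SRCU read lock */
static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
{
	struct kfd_process *p;

	hash_for_each_possible_rcu(kfd_processes_table, p,
				   kfd_processes, (uintptr_t)mm)
		if (p->mm == mm)	/* pointer comparison only */
			return p;

	return NULL;
}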
- */ -#define INITIAL_QUEUE_ARRAY_SIZE 16 - -/* * List of struct kfd_process (field kfd_process). * Unique/indexed by mm_struct* */ @@ -187,8 +180,6 @@ static void kfd_process_wq_release(struct work_struct *work) mutex_destroy(&p->mutex); - kfree(p->queues); - kfree(p); kfree(work); @@ -200,7 +191,6 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu) struct kfd_process *p; p = container_of(rcu, struct kfd_process, rcu); - WARN_ON(atomic_read(&p->mm->mm_count) <= 0); mmdrop(p->mm); @@ -234,17 +224,26 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_lock(&p->mutex); + /* Iterate over all process device data structures and if the + * pdd is in debug mode, we should first force unregistration, + * then we will be able to destroy the queues + */ + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + struct kfd_dev *dev = pdd->dev; + + mutex_lock(kfd_get_dbgmgr_mutex()); + if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + mutex_unlock(kfd_get_dbgmgr_mutex()); + } + kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); - /* Iterate over all process device data structure and check - * if we should delete debug managers - */ - list_for_each_entry(pdd, &p->per_device_data, per_device_list) - if ((pdd->dev->dbgmgr) && - (pdd->dev->dbgmgr->pasid == p->pasid)) - kfd_dbgmgr_destroy(pdd->dev->dbgmgr); - mutex_unlock(&p->mutex); /* @@ -271,11 +270,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (!process) goto err_alloc_process; - process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, - sizeof(process->queues[0]), GFP_KERNEL); - if (!process->queues) - goto err_alloc_queues; - process->pasid = kfd_pasid_alloc(); if (process->pasid == 0) goto err_alloc_pasid; @@ -298,8 +292,6 @@ static struct kfd_process *create_process(const struct task_struct *thread) process->lead_thread = thread->group_leader; - process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; - INIT_LIST_HEAD(&process->per_device_data); kfd_event_init_process(process); @@ -328,8 +320,6 @@ err_mmu_notifier: err_alloc_doorbells: kfd_pasid_free(process->pasid); err_alloc_pasid: - kfree(process->queues); -err_alloc_queues: kfree(process); err_alloc_process: return ERR_PTR(err); @@ -426,7 +416,7 @@ int kfd_bind_processes_to_device(struct kfd_dev *dev) err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); if (err < 0) { - pr_err("unexpected pasid %d binding failure\n", + pr_err("Unexpected pasid %d binding failure\n", p->pasid); mutex_unlock(&p->mutex); break; @@ -442,29 +432,25 @@ int kfd_bind_processes_to_device(struct kfd_dev *dev) } /* - * Temporarily unbind currently bound processes from the device and - * mark them as PDD_BOUND_SUSPENDED. These processes will be restored - * to PDD_BOUND state in kfd_bind_processes_to_device. + * Mark currently bound processes as PDD_BOUND_SUSPENDED. These + * processes will be restored to PDD_BOUND state in + * kfd_bind_processes_to_device. 
*/ void kfd_unbind_processes_from_device(struct kfd_dev *dev) { struct kfd_process_device *pdd; struct kfd_process *p; - unsigned int temp, temp_bound, temp_pasid; + unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { mutex_lock(&p->mutex); pdd = kfd_get_process_device_data(dev, p); - temp_bound = pdd->bound; - temp_pasid = p->pasid; + if (pdd->bound == PDD_BOUND) pdd->bound = PDD_BOUND_SUSPENDED; mutex_unlock(&p->mutex); - - if (temp_bound == PDD_BOUND) - amd_iommu_unbind_pasid(dev->pdev, temp_pasid); } srcu_read_unlock(&kfd_processes_srcu, idx); @@ -486,8 +472,16 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) pr_debug("Unbinding process %d from IOMMU\n", pasid); - if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid)) - kfd_dbgmgr_destroy(dev->dbgmgr); + mutex_lock(kfd_get_dbgmgr_mutex()); + + if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { + if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { + kfd_dbgmgr_destroy(dev->dbgmgr); + dev->dbgmgr = NULL; + } + } + + mutex_unlock(kfd_get_dbgmgr_mutex()); pdd = kfd_get_process_device_data(dev, p); if (pdd) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 5129dc139219..2bec902fc939 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -177,7 +177,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, if (retval != 0) return retval; - if (list_empty(&pqm->queues)) { + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) { pdd->qpd.pqm = pqm; dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); } @@ -248,7 +249,8 @@ err_create_queue: err_allocate_pqn: /* check if queues list is empty unregister process from device */ clear_bit(*qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); return retval; } @@ -302,7 +304,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn); clear_bit(qid, pqm->queue_slot_bitmap); - if (list_empty(&pqm->queues)) + if (list_empty(&pdd->qpd.queues_list) && + list_empty(&pdd->qpd.priv_queue_list)) dqm->ops.unregister_process(dqm, &pdd->qpd); return retval; diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index de6fc2731b98..b72f8a43d86b 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -23,36 +23,11 @@ #ifndef __AMD_SHARED_H__ #define __AMD_SHARED_H__ -#define AMD_MAX_USEC_TIMEOUT 200000 /* 200 ms */ +#include <drm/amd_asic_type.h> struct seq_file; -/* - * Supported ASIC types - */ -enum amd_asic_type { - CHIP_TAHITI = 0, - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_MULLINS, - CHIP_TOPAZ, - CHIP_TONGA, - CHIP_FIJI, - CHIP_CARRIZO, - CHIP_STONEY, - CHIP_POLARIS10, - CHIP_POLARIS11, - CHIP_POLARIS12, - CHIP_VEGA10, - CHIP_RAVEN, - CHIP_LAST, -}; +#define AMD_MAX_USEC_TIMEOUT 200000 /* 200 ms */ /* * Chip flags diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index 3c8ef4bfc205..c7e34128cbde 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ 
-78,6 +78,9 @@ static int amd_powerplay_destroy(void *handle) { struct pp_instance *instance = (struct pp_instance *)handle; + kfree(instance->hwmgr->hardcode_pp_table); + instance->hwmgr->hardcode_pp_table = NULL; + kfree(instance->hwmgr); instance->hwmgr = NULL; @@ -1184,7 +1187,7 @@ int amd_powerplay_reset(void *handle) int ret; ret = pp_check(instance); - if (!ret) + if (ret) return ret; ret = pp_hw_fini(instance); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/rv_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/rv_hwmgr.c index 938010842c7d..3e0b267c74a8 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/rv_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/rv_hwmgr.c @@ -672,36 +672,20 @@ static int rv_get_performance_level(struct pp_hwmgr *hwmgr, const struct pp_hw_p PHM_PerformanceLevelDesignation designation, uint32_t index, PHM_PerformanceLevel *level) { - const struct rv_power_state *ps; struct rv_hwmgr *data; - uint32_t level_index; - uint32_t i; - uint32_t vol_dep_record_index = 0; if (level == NULL || hwmgr == NULL || state == NULL) return -EINVAL; data = (struct rv_hwmgr *)(hwmgr->backend); - ps = cast_const_rv_ps(state); - - level_index = index > ps->level - 1 ? ps->level - 1 : index; - level->coreClock = 30000; - if (designation == PHM_PerformanceLevelDesignation_PowerContainment) { - for (i = 1; i < ps->level; i++) { - if (ps->levels[i].engine_clock > data->dce_slow_sclk_threshold) { - level->coreClock = 30000; - break; - } - } - } - - if (level_index == 0) { - vol_dep_record_index = data->clock_vol_info.vdd_dep_on_fclk->count - 1; - level->memory_clock = - data->clock_vol_info.vdd_dep_on_fclk->entries[vol_dep_record_index].clk; - } else { + if (index == 0) { level->memory_clock = data->clock_vol_info.vdd_dep_on_fclk->entries[0].clk; + level->coreClock = data->gfx_min_freq_limit; + } else { + level->memory_clock = data->clock_vol_info.vdd_dep_on_fclk->entries[ + data->clock_vol_info.vdd_dep_on_fclk->count - 1].clk; + level->coreClock = data->gfx_max_freq_limit; } level->nonLocalMemoryFreq = 0; diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c index e32f18a99074..4466469cf8ab 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu7_hwmgr.c @@ -815,7 +815,7 @@ uint32_t smu7_get_xclk(struct pp_hwmgr *hwmgr) { uint32_t reference_clock, tmp; struct cgs_display_info info = {0}; - struct cgs_mode_info mode_info; + struct cgs_mode_info mode_info = {0}; info.mode_info = &mode_info; @@ -3948,10 +3948,9 @@ static int smu7_program_display_gap(struct pp_hwmgr *hwmgr) uint32_t ref_clock; uint32_t refresh_rate = 0; struct cgs_display_info info = {0}; - struct cgs_mode_info mode_info; + struct cgs_mode_info mode_info = {0}; info.mode_info = &mode_info; - cgs_get_active_displays_info(hwmgr->device, &info); num_active_displays = info.display_count; @@ -3967,6 +3966,7 @@ static int smu7_program_display_gap(struct pp_hwmgr *hwmgr) frame_time_in_us = 1000000 / refresh_rate; pre_vbi_time_in_us = frame_time_in_us - 200 - mode_info.vblank_time_us; + data->frame_time_x2 = frame_time_in_us * 2 / 100; display_gap2 = pre_vbi_time_in_us * (ref_clock / 100); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c index 0519338e0e5e..4f79c21f27ed 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c @@ -1807,6 +1807,10 @@ static int 
vega10_populate_all_memory_levels(struct pp_hwmgr *hwmgr) mem_channels = (cgs_read_register(hwmgr->device, reg) & DF_CS_AON0_DramBaseAddress0__IntLvNumChan_MASK) >> DF_CS_AON0_DramBaseAddress0__IntLvNumChan__SHIFT; + PP_ASSERT_WITH_CODE(mem_channels < ARRAY_SIZE(channel_number), + "Mem Channel Index Exceeded maximum!", + return -1); + pp_table->NumMemoryChannels = cpu_to_le16(mem_channels); pp_table->MemoryChannelWidth = cpu_to_le16(HBM_MEMORY_CHANNEL_WIDTH * @@ -2879,6 +2883,15 @@ static int vega10_enable_dpm_tasks(struct pp_hwmgr *hwmgr) "DPM is already running right now, skipping re-enablement!", return 0); + if ((data->smu_version == 0x001c2c00) || + (data->smu_version == 0x001c2d00)) { + tmp_result = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_UpdatePkgPwrPidAlpha, 1); + PP_ASSERT_WITH_CODE(!tmp_result, + "Failed to set package power PID!", + return tmp_result); + } + tmp_result = vega10_construct_voltage_tables(hwmgr); PP_ASSERT_WITH_CODE(!tmp_result, "Failed to construct voltage tables!", @@ -3125,6 +3138,8 @@ static int vega10_apply_state_adjust_rules(struct pp_hwmgr *hwmgr, minimum_clocks.memoryClock = hwmgr->display_config.min_mem_set_clock; if (PP_CAP(PHM_PlatformCaps_StablePState)) { + stable_pstate_sclk_dpm_percentage = + data->registry_data.stable_pstate_sclk_dpm_percentage; PP_ASSERT_WITH_CODE( data->registry_data.stable_pstate_sclk_dpm_percentage >= 1 && data->registry_data.stable_pstate_sclk_dpm_percentage <= 100, @@ -4225,7 +4240,7 @@ static void vega10_set_fan_control_mode(struct pp_hwmgr *hwmgr, uint32_t mode) vega10_fan_ctrl_stop_smc_fan_control(hwmgr); break; case AMD_FAN_CTRL_AUTO: - if (!vega10_fan_ctrl_set_static_mode(hwmgr, mode)) + if (PP_CAP(PHM_PlatformCaps_MicrocodeFanControl)) vega10_fan_ctrl_start_smc_fan_control(hwmgr); break; default: diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_thermal.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_thermal.c index 1feefac49ea9..dc3761bcb9b6 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_thermal.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_thermal.c @@ -365,8 +365,8 @@ int vega10_thermal_get_temperature(struct pp_hwmgr *hwmgr) temp = cgs_read_register(hwmgr->device, reg); - temp = (temp & CG_MULT_THERMAL_STATUS__ASIC_MAX_TEMP_MASK) >> - CG_MULT_THERMAL_STATUS__ASIC_MAX_TEMP__SHIFT; + temp = (temp & CG_MULT_THERMAL_STATUS__CTF_TEMP_MASK) >> + CG_MULT_THERMAL_STATUS__CTF_TEMP__SHIFT; temp = temp & 0x1ff; diff --git a/drivers/gpu/drm/amd/powerplay/inc/vega10_ppsmc.h b/drivers/gpu/drm/amd/powerplay/inc/vega10_ppsmc.h index d06ece4ac47d..247c97397a27 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/vega10_ppsmc.h +++ b/drivers/gpu/drm/amd/powerplay/inc/vega10_ppsmc.h @@ -131,7 +131,8 @@ typedef uint16_t PPSMC_Result; #define PPSMC_MSG_RunAcgInOpenLoop 0x5E #define PPSMC_MSG_InitializeAcg 0x5F #define PPSMC_MSG_GetCurrPkgPwr 0x61 -#define PPSMC_Message_Count 0x62 +#define PPSMC_MSG_UpdatePkgPwrPidAlpha 0x68 +#define PPSMC_Message_Count 0x69 typedef int PPSMC_Msg; diff --git a/drivers/gpu/drm/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/drm_dp_dual_mode_helper.c index 0ef9011a1856..02a50929af67 100644 --- a/drivers/gpu/drm/drm_dp_dual_mode_helper.c +++ b/drivers/gpu/drm/drm_dp_dual_mode_helper.c @@ -410,6 +410,7 @@ int drm_lspcon_get_mode(struct i2c_adapter *adapter, { u8 data; int ret = 0; + int retry; if (!mode) { DRM_ERROR("NULL input\n"); @@ -417,10 +418,19 @@ } /* Read Status: i2c over aux */ - ret = drm_dp_dual_mode_read(adapter,
DP_DUAL_MODE_LSPCON_CURRENT_MODE, - &data, sizeof(data)); + for (retry = 0; retry < 6; retry++) { + if (retry) + usleep_range(500, 1000); + + ret = drm_dp_dual_mode_read(adapter, + DP_DUAL_MODE_LSPCON_CURRENT_MODE, + &data, sizeof(data)); + if (!ret) + break; + } + if (ret < 0) { - DRM_ERROR("LSPCON read(0x80, 0x41) failed\n"); + DRM_DEBUG_KMS("LSPCON read(0x80, 0x41) failed\n"); return -EFAULT; } diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c index 2affe53f3fda..279c1035c12d 100644 --- a/drivers/gpu/drm/drm_framebuffer.c +++ b/drivers/gpu/drm/drm_framebuffer.c @@ -681,6 +681,7 @@ EXPORT_SYMBOL(drm_framebuffer_init); /** * drm_framebuffer_lookup - look up a drm framebuffer and grab a reference * @dev: drm device + * @file_priv: drm file to check for lease against. * @id: id of the fb object * * If successful, this grabs an additional reference to the framebuffer - diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c index 7c8b2698c6a7..ce4d2fb32810 100644 --- a/drivers/gpu/drm/drm_mode_object.c +++ b/drivers/gpu/drm/drm_mode_object.c @@ -151,6 +151,7 @@ struct drm_mode_object *__drm_mode_object_find(struct drm_device *dev, /** * drm_mode_object_find - look up a drm object with static lifetime + * @dev: drm device * @file_priv: drm file * @id: id of the mode object * @type: type of the mode object diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index e123497da0ca..963e23db0fe7 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -93,7 +93,7 @@ void drm_modeset_lock_all(struct drm_device *dev) struct drm_modeset_acquire_ctx *ctx; int ret; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); if (WARN_ON(!ctx)) return; diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c index 57cc6e37c810..09c1c4ff93ca 100644 --- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -299,8 +299,8 @@ u32 drm_crtc_accurate_vblank_count(struct drm_crtc *crtc) u32 vblank; unsigned long flags; - WARN(!dev->driver->get_vblank_timestamp, - "This function requires support for accurate vblank timestamps."); + WARN_ONCE(drm_debug & DRM_UT_VBL && !dev->driver->get_vblank_timestamp, + "This function requires support for accurate vblank timestamps."); spin_lock_irqsave(&dev->vblank_time_lock, flags); diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index e651a58c18cf..82b72425a42f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -168,11 +168,13 @@ static struct drm_driver exynos_drm_driver = { static int exynos_drm_suspend(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct exynos_drm_private *private = drm_dev->dev_private; + struct exynos_drm_private *private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; + private = drm_dev->dev_private; + drm_kms_helper_poll_disable(drm_dev); exynos_drm_fbdev_suspend(drm_dev); private->suspend_state = drm_atomic_helper_suspend(drm_dev); @@ -188,11 +190,12 @@ static int exynos_drm_suspend(struct device *dev) static int exynos_drm_resume(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct exynos_drm_private *private = drm_dev->dev_private; + struct exynos_drm_private *private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; + private = drm_dev->dev_private; drm_atomic_helper_resume(drm_dev, 
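The loop added to drm_lspcon_get_mode() above tolerates LSPCON adapters that NAK the first i2c-over-aux reads shortly after power-on. The same bounded-retry shape as a free-standing sketch; the helper name is hypothetical, while the six attempts and the 500-1000 us delay mirror the hunk:

static int lspcon_read_retry(struct i2c_adapter *adapter, u8 offset,
			     void *buf, size_t len)
{
	int ret = -EIO;
	int retry;

	for (retry = 0; retry < 6; retry++) {
		if (retry)
			usleep_range(500, 1000);	/* let the adapter settle */

		ret = drm_dp_dual_mode_read(adapter, offset, buf, len);
		if (!ret)
			break;		/* 0 means the read succeeded */
	}

	return ret;
}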
private->suspend_state); exynos_drm_fbdev_resume(drm_dev); drm_kms_helper_poll_enable(drm_dev); @@ -427,6 +430,7 @@ static void exynos_drm_unbind(struct device *dev) kfree(drm->dev_private); drm->dev_private = NULL; + dev_set_drvdata(dev, NULL); drm_dev_unref(drm); } diff --git a/drivers/gpu/drm/hisilicon/kirin/kirin_drm_ade.c b/drivers/gpu/drm/hisilicon/kirin/kirin_drm_ade.c index 9823477b1855..2269be91f3e1 100644 --- a/drivers/gpu/drm/hisilicon/kirin/kirin_drm_ade.c +++ b/drivers/gpu/drm/hisilicon/kirin/kirin_drm_ade.c @@ -534,9 +534,12 @@ static void ade_crtc_atomic_begin(struct drm_crtc *crtc, { struct ade_crtc *acrtc = to_ade_crtc(crtc); struct ade_hw_ctx *ctx = acrtc->ctx; + struct drm_display_mode *mode = &crtc->state->mode; + struct drm_display_mode *adj_mode = &crtc->state->adjusted_mode; if (!ctx->power_on) (void)ade_power_up(ctx); + ade_ldi_set_mode(acrtc, mode, adj_mode); } static void ade_crtc_atomic_flush(struct drm_crtc *crtc, diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 66d23b619db1..6c3b0481ef82 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -64,7 +64,7 @@ i915-y += intel_uc.o \ intel_guc.o \ intel_guc_ct.o \ intel_guc_log.o \ - intel_guc_loader.o \ + intel_guc_fw.o \ intel_huc.o \ i915_guc_submission.o diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 2c0ccbb817dc..701a3c6f1669 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -2734,6 +2734,9 @@ static int combine_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) uint32_t per_ctx_start[CACHELINE_DWORDS] = {0}; unsigned char *bb_start_sva; + if (!wa_ctx->per_ctx.valid) + return 0; + per_ctx_start[0] = 0x18800001; per_ctx_start[1] = wa_ctx->per_ctx.guest_gma; diff --git a/drivers/gpu/drm/i915/gvt/execlist.c b/drivers/gpu/drm/i915/gvt/execlist.c index 5ec07ecf33ad..4427be18e4a9 100644 --- a/drivers/gpu/drm/i915/gvt/execlist.c +++ b/drivers/gpu/drm/i915/gvt/execlist.c @@ -734,8 +734,7 @@ static int submit_context(struct intel_vgpu *vgpu, int ring_id, CACHELINE_BYTES; workload->wa_ctx.per_ctx.guest_gma = per_ctx & PER_CTX_ADDR_MASK; - - WARN_ON(workload->wa_ctx.indirect_ctx.size && !(per_ctx & 0x1)); + workload->wa_ctx.per_ctx.valid = per_ctx & 1; } if (emulate_schedule_in) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 2294466dd415..a5bed2e71b92 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1429,18 +1429,7 @@ static int skl_lcpll_write(struct intel_vgpu *vgpu, unsigned int offset, return 0; } -static int ring_timestamp_mmio_read(struct intel_vgpu *vgpu, - unsigned int offset, void *p_data, unsigned int bytes) -{ - struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; - - mmio_hw_access_pre(dev_priv); - vgpu_vreg(vgpu, offset) = I915_READ(_MMIO(offset)); - mmio_hw_access_post(dev_priv); - return intel_vgpu_default_mmio_read(vgpu, offset, p_data, bytes); -} - -static int instdone_mmio_read(struct intel_vgpu *vgpu, +static int mmio_read_from_hw(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; @@ -1589,6 +1578,8 @@ static int ring_reset_ctl_write(struct intel_vgpu *vgpu, MMIO_F(prefix(BLT_RING_BASE), s, f, am, rm, d, r, w); \ MMIO_F(prefix(GEN6_BSD_RING_BASE), s, f, am, rm, d, r, w); \ MMIO_F(prefix(VEBOX_RING_BASE), s, f, am, rm, d, r, w); \ + if (HAS_BSD2(dev_priv)) \ + 
MMIO_F(prefix(GEN8_BSD2_RING_BASE), s, f, am, rm, d, r, w); \ } while (0) #define MMIO_RING_D(prefix, d) \ @@ -1635,10 +1626,9 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) #undef RING_REG #define RING_REG(base) (base + 0x6c) - MMIO_RING_DFH(RING_REG, D_ALL, 0, instdone_mmio_read, NULL); - MMIO_DH(RING_REG(GEN8_BSD2_RING_BASE), D_ALL, instdone_mmio_read, NULL); + MMIO_RING_DFH(RING_REG, D_ALL, 0, mmio_read_from_hw, NULL); #undef RING_REG - MMIO_DH(GEN7_SC_INSTDONE, D_BDW_PLUS, instdone_mmio_read, NULL); + MMIO_DH(GEN7_SC_INSTDONE, D_BDW_PLUS, mmio_read_from_hw, NULL); MMIO_GM_RDR(0x2148, D_ALL, NULL, NULL); MMIO_GM_RDR(CCID, D_ALL, NULL, NULL); @@ -1648,7 +1638,7 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_RING_DFH(RING_TAIL, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_RING_DFH(RING_HEAD, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_RING_DFH(RING_CTL, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_RING_DFH(RING_ACTHD, D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_RING_DFH(RING_ACTHD, D_ALL, F_CMD_ACCESS, mmio_read_from_hw, NULL); MMIO_RING_GM_RDR(RING_START, D_ALL, NULL, NULL); /* RING MODE */ @@ -1662,9 +1652,9 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_RING_DFH(RING_INSTPM, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_RING_DFH(RING_TIMESTAMP, D_ALL, F_CMD_ACCESS, - ring_timestamp_mmio_read, NULL); + mmio_read_from_hw, NULL); MMIO_RING_DFH(RING_TIMESTAMP_UDW, D_ALL, F_CMD_ACCESS, - ring_timestamp_mmio_read, NULL); + mmio_read_from_hw, NULL); MMIO_DFH(GEN7_GT_MODE, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(CACHE_MODE_0_GEN7, D_ALL, F_MODE_MASK | F_CMD_ACCESS, @@ -2411,9 +2401,6 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) struct drm_i915_private *dev_priv = gvt->dev_priv; int ret; - MMIO_DFH(RING_IMR(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_CMD_ACCESS, NULL, - intel_vgpu_reg_imr_handler); - MMIO_DH(GEN8_GT_IMR(0), D_BDW_PLUS, NULL, intel_vgpu_reg_imr_handler); MMIO_DH(GEN8_GT_IER(0), D_BDW_PLUS, NULL, intel_vgpu_reg_ier_handler); MMIO_DH(GEN8_GT_IIR(0), D_BDW_PLUS, NULL, intel_vgpu_reg_iir_handler); @@ -2476,68 +2463,34 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_DH(GEN8_MASTER_IRQ, D_BDW_PLUS, NULL, intel_vgpu_reg_master_irq_handler); - MMIO_DFH(RING_HWSTAM(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1c134, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - - MMIO_DFH(RING_TAIL(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_CMD_ACCESS, - NULL, NULL); - MMIO_DFH(RING_HEAD(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_CMD_ACCESS, NULL, NULL); - MMIO_GM_RDR(RING_START(GEN8_BSD2_RING_BASE), D_BDW_PLUS, NULL, NULL); - MMIO_DFH(RING_CTL(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_CMD_ACCESS, - NULL, NULL); - MMIO_DFH(RING_ACTHD(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(RING_ACTHD_UDW(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1c29c, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, - ring_mode_mmio_write); - MMIO_DFH(RING_MI_MODE(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(RING_INSTPM(GEN8_BSD2_RING_BASE), D_BDW_PLUS, - F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(RING_TIMESTAMP(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_CMD_ACCESS, - ring_timestamp_mmio_read, NULL); - - MMIO_RING_DFH(RING_ACTHD_UDW, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_RING_DFH(RING_ACTHD_UDW, D_BDW_PLUS, F_CMD_ACCESS, + mmio_read_from_hw, NULL); #define RING_REG(base) (base + 0xd0) MMIO_RING_F(RING_REG, 4, F_RO, 0, 
~_MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET), D_BDW_PLUS, NULL, ring_reset_ctl_write); - MMIO_F(RING_REG(GEN8_BSD2_RING_BASE), 4, F_RO, 0, - ~_MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET), D_BDW_PLUS, NULL, - ring_reset_ctl_write); #undef RING_REG #define RING_REG(base) (base + 0x230) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, 0, NULL, elsp_mmio_write); - MMIO_DH(RING_REG(GEN8_BSD2_RING_BASE), D_BDW_PLUS, NULL, elsp_mmio_write); #undef RING_REG #define RING_REG(base) (base + 0x234) MMIO_RING_F(RING_REG, 8, F_RO | F_CMD_ACCESS, 0, ~0, D_BDW_PLUS, NULL, NULL); - MMIO_F(RING_REG(GEN8_BSD2_RING_BASE), 4, F_RO | F_CMD_ACCESS, 0, - ~0LL, D_BDW_PLUS, NULL, NULL); #undef RING_REG #define RING_REG(base) (base + 0x244) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(RING_REG(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_CMD_ACCESS, - NULL, NULL); #undef RING_REG #define RING_REG(base) (base + 0x370) MMIO_RING_F(RING_REG, 48, F_RO, 0, ~0, D_BDW_PLUS, NULL, NULL); - MMIO_F(RING_REG(GEN8_BSD2_RING_BASE), 48, F_RO, 0, ~0, D_BDW_PLUS, - NULL, NULL); #undef RING_REG #define RING_REG(base) (base + 0x3a0) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, F_MODE_MASK, NULL, NULL); - MMIO_DFH(RING_REG(GEN8_BSD2_RING_BASE), D_BDW_PLUS, F_MODE_MASK, NULL, NULL); #undef RING_REG MMIO_D(PIPEMISC(PIPE_A), D_BDW_PLUS); @@ -2557,11 +2510,9 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) #define RING_REG(base) (base + 0x270) MMIO_RING_F(RING_REG, 32, 0, 0, 0, D_BDW_PLUS, NULL, NULL); - MMIO_F(RING_REG(GEN8_BSD2_RING_BASE), 32, 0, 0, 0, D_BDW_PLUS, NULL, NULL); #undef RING_REG MMIO_RING_GM_RDR(RING_HWS_PGA, D_BDW_PLUS, NULL, NULL); - MMIO_GM_RDR(RING_HWS_PGA(GEN8_BSD2_RING_BASE), D_BDW_PLUS, NULL, NULL); MMIO_DFH(HDC_CHICKEN0, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); @@ -2849,7 +2800,6 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_D(0x65f08, D_SKL | D_KBL); MMIO_D(0x320f0, D_SKL | D_KBL); - MMIO_DFH(_REG_VCS2_EXCC, D_SKL_PLUS, F_CMD_ACCESS, NULL, NULL); MMIO_D(0x70034, D_SKL_PLUS); MMIO_D(0x71034, D_SKL_PLUS); MMIO_D(0x72034, D_SKL_PLUS); diff --git a/drivers/gpu/drm/i915/gvt/reg.h b/drivers/gpu/drm/i915/gvt/reg.h index fbd023a16f18..7d01c77a0f7a 100644 --- a/drivers/gpu/drm/i915/gvt/reg.h +++ b/drivers/gpu/drm/i915/gvt/reg.h @@ -54,9 +54,6 @@ #define VGT_SPRSTRIDE(pipe) _PIPE(pipe, _SPRA_STRIDE, _PLANE_STRIDE_2_B) -#define _REG_VECS_EXCC 0x1A028 -#define _REG_VCS2_EXCC 0x1c028 - #define _REG_701C0(pipe, plane) (0x701c0 + pipe * 0x1000 + (plane - 1) * 0x100) #define _REG_701C4(pipe, plane) (0x701c4 + pipe * 0x1000 + (plane - 1) * 0x100) diff --git a/drivers/gpu/drm/i915/gvt/sched_policy.c b/drivers/gpu/drm/i915/gvt/sched_policy.c index 436377da41ba..03532dfc0cd5 100644 --- a/drivers/gpu/drm/i915/gvt/sched_policy.c +++ b/drivers/gpu/drm/i915/gvt/sched_policy.c @@ -308,20 +308,8 @@ static int tbs_sched_init_vgpu(struct intel_vgpu *vgpu) static void tbs_sched_clean_vgpu(struct intel_vgpu *vgpu) { - struct intel_gvt_workload_scheduler *scheduler = &vgpu->gvt->scheduler; - int ring_id; - kfree(vgpu->sched_data); vgpu->sched_data = NULL; - - spin_lock_bh(&scheduler->mmio_context_lock); - for (ring_id = 0; ring_id < I915_NUM_ENGINES; ring_id++) { - if (scheduler->engine_owner[ring_id] == vgpu) { - intel_gvt_switch_mmio(vgpu, NULL, ring_id); - scheduler->engine_owner[ring_id] = NULL; - } - } - spin_unlock_bh(&scheduler->mmio_context_lock); } static void tbs_sched_start_schedule(struct intel_vgpu *vgpu) @@ -388,6 +376,7 @@ void intel_vgpu_stop_schedule(struct intel_vgpu *vgpu) { struct 
intel_gvt_workload_scheduler *scheduler = &vgpu->gvt->scheduler; + int ring_id; gvt_dbg_core("vgpu%d: stop schedule\n", vgpu->id); @@ -401,4 +390,13 @@ void intel_vgpu_stop_schedule(struct intel_vgpu *vgpu) scheduler->need_reschedule = true; scheduler->current_vgpu = NULL; } + + spin_lock_bh(&scheduler->mmio_context_lock); + for (ring_id = 0; ring_id < I915_NUM_ENGINES; ring_id++) { + if (scheduler->engine_owner[ring_id] == vgpu) { + intel_gvt_switch_mmio(vgpu, NULL, ring_id); + scheduler->engine_owner[ring_id] = NULL; + } + } + spin_unlock_bh(&scheduler->mmio_context_lock); } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.h b/drivers/gpu/drm/i915/gvt/scheduler.h index f36b85fd6d01..2d694f6c0907 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.h +++ b/drivers/gpu/drm/i915/gvt/scheduler.h @@ -68,6 +68,7 @@ struct shadow_indirect_ctx { struct shadow_per_ctx { unsigned long guest_gma; unsigned long shadow_gma; + unsigned valid; }; struct intel_shadow_wa_ctx { diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 0bb6e01121fc..c65e381b85f3 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -83,7 +83,7 @@ static char get_active_flag(struct drm_i915_gem_object *obj) static char get_pin_flag(struct drm_i915_gem_object *obj) { - return obj->pin_display ? 'p' : ' '; + return obj->pin_global ? 'p' : ' '; } static char get_tiling_flag(struct drm_i915_gem_object *obj) @@ -180,8 +180,8 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) pin_count++; } seq_printf(m, " (pinned x %d)", pin_count); - if (obj->pin_display) - seq_printf(m, " (display)"); + if (obj->pin_global) + seq_printf(m, " (global)"); list_for_each_entry(vma, &obj->vma_list, obj_link) { if (!drm_mm_node_allocated(&vma->node)) continue; @@ -271,7 +271,9 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data) goto out; total_obj_size = total_gtt_size = count = 0; - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) { + + spin_lock(&dev_priv->mm.obj_lock); + list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) { if (count == total) break; @@ -283,7 +285,7 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data) total_gtt_size += i915_gem_obj_total_ggtt_size(obj); } - list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) { + list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) { if (count == total) break; @@ -293,6 +295,7 @@ static int i915_gem_stolen_list_info(struct seq_file *m, void *data) objects[count++] = obj; total_obj_size += obj->base.size; } + spin_unlock(&dev_priv->mm.obj_lock); sort(objects, count, sizeof(*objects), obj_rank_by_stolen, NULL); @@ -454,7 +457,9 @@ static int i915_gem_object_info(struct seq_file *m, void *data) mapped_size = mapped_count = 0; purgeable_size = purgeable_count = 0; huge_size = huge_count = 0; - list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) { + + spin_lock(&dev_priv->mm.obj_lock); + list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) { size += obj->base.size; ++count; @@ -477,11 +482,11 @@ static int i915_gem_object_info(struct seq_file *m, void *data) seq_printf(m, "%u unbound objects, %llu bytes\n", count, size); size = count = dpy_size = dpy_count = 0; - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) { + list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) { size += obj->base.size; ++count; - if (obj->pin_display) { + if (obj->pin_global) { dpy_size += obj->base.size; 
++dpy_count; } @@ -502,6 +507,8 @@ static int i915_gem_object_info(struct seq_file *m, void *data) page_sizes |= obj->mm.page_sizes.sg; } } + spin_unlock(&dev_priv->mm.obj_lock); + seq_printf(m, "%u bound objects, %llu bytes\n", count, size); seq_printf(m, "%u purgeable objects, %llu bytes\n", @@ -512,7 +519,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data) huge_count, stringify_page_sizes(page_sizes, buf, sizeof(buf)), huge_size); - seq_printf(m, "%u display objects (pinned), %llu bytes\n", + seq_printf(m, "%u display objects (globally pinned), %llu bytes\n", dpy_count, dpy_size); seq_printf(m, "%llu [%llu] gtt total\n", @@ -568,32 +575,46 @@ static int i915_gem_gtt_info(struct seq_file *m, void *data) struct drm_info_node *node = m->private; struct drm_i915_private *dev_priv = node_to_i915(node); struct drm_device *dev = &dev_priv->drm; - bool show_pin_display_only = !!node->info_ent->data; + struct drm_i915_gem_object **objects; struct drm_i915_gem_object *obj; u64 total_obj_size, total_gtt_size; + unsigned long nobject, n; int count, ret; + nobject = READ_ONCE(dev_priv->mm.object_count); + objects = kvmalloc_array(nobject, sizeof(*objects), GFP_KERNEL); + if (!objects) + return -ENOMEM; + ret = mutex_lock_interruptible(&dev->struct_mutex); if (ret) return ret; - total_obj_size = total_gtt_size = count = 0; - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) { - if (show_pin_display_only && !obj->pin_display) - continue; + count = 0; + spin_lock(&dev_priv->mm.obj_lock); + list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) { + objects[count++] = obj; + if (count == nobject) + break; + } + spin_unlock(&dev_priv->mm.obj_lock); + + total_obj_size = total_gtt_size = 0; + for (n = 0; n < count; n++) { + obj = objects[n]; seq_puts(m, " "); describe_obj(m, obj); seq_putc(m, '\n'); total_obj_size += obj->base.size; total_gtt_size += i915_gem_obj_total_ggtt_size(obj); - count++; } mutex_unlock(&dev->struct_mutex); seq_printf(m, "Total %d objects, %llu bytes, %llu GTT size\n", count, total_obj_size, total_gtt_size); + kvfree(objects); return 0; } @@ -643,54 +664,6 @@ static int i915_gem_batch_pool_info(struct seq_file *m, void *data) return 0; } -static void print_request(struct seq_file *m, - struct drm_i915_gem_request *rq, - const char *prefix) -{ - seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix, - rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno, - rq->priotree.priority, - jiffies_to_msecs(jiffies - rq->emitted_jiffies), - rq->timeline->common->name); -} - -static int i915_gem_request_info(struct seq_file *m, void *data) -{ - struct drm_i915_private *dev_priv = node_to_i915(m->private); - struct drm_device *dev = &dev_priv->drm; - struct drm_i915_gem_request *req; - struct intel_engine_cs *engine; - enum intel_engine_id id; - int ret, any; - - ret = mutex_lock_interruptible(&dev->struct_mutex); - if (ret) - return ret; - - any = 0; - for_each_engine(engine, dev_priv, id) { - int count; - - count = 0; - list_for_each_entry(req, &engine->timeline->requests, link) - count++; - if (count == 0) - continue; - - seq_printf(m, "%s requests: %d\n", engine->name, count); - list_for_each_entry(req, &engine->timeline->requests, link) - print_request(m, req, " "); - - any++; - } - mutex_unlock(&dev->struct_mutex); - - if (any == 0) - seq_puts(m, "No requests\n"); - - return 0; -} - static void i915_ring_seqno_info(struct seq_file *m, struct intel_engine_cs *engine) { @@ -2386,27 +2359,13 @@ static int i915_llc(struct seq_file *m, void *data) static int 
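The i915_gem_gtt_info() rewrite above illustrates the new locking split: the bound/unbound object lists may now only be walked under the mm.obj_lock spinlock, so the function snapshots object pointers first and does the sleepable seq_file output afterwards. The pattern in isolation (names as in the hunk; struct_mutex, taken by the caller, is what keeps the snapshotted objects alive):

/* 1. size the snapshot locklessly; the copy loop re-checks the bound */
nobject = READ_ONCE(dev_priv->mm.object_count);
objects = kvmalloc_array(nobject, sizeof(*objects), GFP_KERNEL);
if (!objects)
	return -ENOMEM;

/* 2. copy pointers under the spinlock; no sleeping allowed here */
count = 0;
spin_lock(&dev_priv->mm.obj_lock);
list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) {
	objects[count++] = obj;
	if (count == nobject)
		break;
}
spin_unlock(&dev_priv->mm.obj_lock);

/* 3. sleepable work (seq_printf() and friends) outside the lock */
for (n = 0; n < count; n++)
	describe_obj(m, objects[n]);

kvfree(objects);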
i915_huc_load_status_info(struct seq_file *m, void *data) { struct drm_i915_private *dev_priv = node_to_i915(m->private); - struct intel_uc_fw *huc_fw = &dev_priv->huc.fw; + struct drm_printer p; if (!HAS_HUC_UCODE(dev_priv)) return 0; - seq_puts(m, "HuC firmware status:\n"); - seq_printf(m, "\tpath: %s\n", huc_fw->path); - seq_printf(m, "\tfetch: %s\n", - intel_uc_fw_status_repr(huc_fw->fetch_status)); - seq_printf(m, "\tload: %s\n", - intel_uc_fw_status_repr(huc_fw->load_status)); - seq_printf(m, "\tversion wanted: %d.%d\n", - huc_fw->major_ver_wanted, huc_fw->minor_ver_wanted); - seq_printf(m, "\tversion found: %d.%d\n", - huc_fw->major_ver_found, huc_fw->minor_ver_found); - seq_printf(m, "\theader: offset is %d; size = %d\n", - huc_fw->header_offset, huc_fw->header_size); - seq_printf(m, "\tuCode: offset is %d; size = %d\n", - huc_fw->ucode_offset, huc_fw->ucode_size); - seq_printf(m, "\tRSA: offset is %d; size = %d\n", - huc_fw->rsa_offset, huc_fw->rsa_size); + p = drm_seq_file_printer(m); + intel_uc_fw_dump(&dev_priv->huc.fw, &p); intel_runtime_pm_get(dev_priv); seq_printf(m, "\nHuC status 0x%08x:\n", I915_READ(HUC_STATUS2)); @@ -2418,29 +2377,14 @@ static int i915_huc_load_status_info(struct seq_file *m, void *data) static int i915_guc_load_status_info(struct seq_file *m, void *data) { struct drm_i915_private *dev_priv = node_to_i915(m->private); - struct intel_uc_fw *guc_fw = &dev_priv->guc.fw; + struct drm_printer p; u32 tmp, i; if (!HAS_GUC_UCODE(dev_priv)) return 0; - seq_printf(m, "GuC firmware status:\n"); - seq_printf(m, "\tpath: %s\n", - guc_fw->path); - seq_printf(m, "\tfetch: %s\n", - intel_uc_fw_status_repr(guc_fw->fetch_status)); - seq_printf(m, "\tload: %s\n", - intel_uc_fw_status_repr(guc_fw->load_status)); - seq_printf(m, "\tversion wanted: %d.%d\n", - guc_fw->major_ver_wanted, guc_fw->minor_ver_wanted); - seq_printf(m, "\tversion found: %d.%d\n", - guc_fw->major_ver_found, guc_fw->minor_ver_found); - seq_printf(m, "\theader: offset is %d; size = %d\n", - guc_fw->header_offset, guc_fw->header_size); - seq_printf(m, "\tuCode: offset is %d; size = %d\n", - guc_fw->ucode_offset, guc_fw->ucode_size); - seq_printf(m, "\tRSA: offset is %d; size = %d\n", - guc_fw->rsa_offset, guc_fw->rsa_size); + p = drm_seq_file_printer(m); + intel_uc_fw_dump(&dev_priv->guc.fw, &p); intel_runtime_pm_get(dev_priv); @@ -3310,6 +3254,16 @@ static int i915_engine_info(struct seq_file *m, void *unused) return 0; } +static int i915_shrinker_info(struct seq_file *m, void *unused) +{ + struct drm_i915_private *i915 = node_to_i915(m->private); + + seq_printf(m, "seeks = %d\n", i915->mm.shrinker.seeks); + seq_printf(m, "batch = %lu\n", i915->mm.shrinker.batch); + + return 0; +} + static int i915_semaphore_status(struct seq_file *m, void *unused) { struct drm_i915_private *dev_priv = node_to_i915(m->private); @@ -4225,18 +4179,20 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, i915_ring_test_irq_get, i915_ring_test_irq_set, "0x%08llx\n"); -#define DROP_UNBOUND 0x1 -#define DROP_BOUND 0x2 -#define DROP_RETIRE 0x4 -#define DROP_ACTIVE 0x8 -#define DROP_FREED 0x10 -#define DROP_SHRINK_ALL 0x20 +#define DROP_UNBOUND BIT(0) +#define DROP_BOUND BIT(1) +#define DROP_RETIRE BIT(2) +#define DROP_ACTIVE BIT(3) +#define DROP_FREED BIT(4) +#define DROP_SHRINK_ALL BIT(5) +#define DROP_IDLE BIT(6) #define DROP_ALL (DROP_UNBOUND | \ DROP_BOUND | \ DROP_RETIRE | \ DROP_ACTIVE | \ DROP_FREED | \ - DROP_SHRINK_ALL) + DROP_SHRINK_ALL |\ + DROP_IDLE) static int i915_drop_caches_get(void *data, u64 *val) { @@ -4252,7 
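Both firmware status dumps above collapse a dozen seq_printf() calls into intel_uc_fw_dump() plus a drm_printer, which abstracts the output sink so one dump routine can target debugfs, dmesg, or any other drm_printf() back end. A minimal usage sketch:

#include <drm/drm_print.h>

static int some_status_info(struct seq_file *m, void *data)
{
	struct drm_printer p = drm_seq_file_printer(m);

	/* the same dump helper could be handed drm_info_printer(dev) */
	drm_printf(&p, "fetch: %s\n", "SUCCESS");

	return 0;
}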
+4208,8 @@ i915_drop_caches_set(void *data, u64 val) struct drm_device *dev = &dev_priv->drm; int ret = 0; - DRM_DEBUG("Dropping caches: 0x%08llx\n", val); + DRM_DEBUG("Dropping caches: 0x%08llx [0x%08llx]\n", + val, val & DROP_ALL); /* No need to check and wait for gpu resets, only libdrm auto-restarts * on ioctls on -EAGAIN. */ @@ -4283,6 +4240,9 @@ i915_drop_caches_set(void *data, u64 val) i915_gem_shrink_all(dev_priv); fs_reclaim_release(GFP_KERNEL); + if (val & DROP_IDLE) + drain_delayed_work(&dev_priv->gt.idle_work); + if (val & DROP_FREED) { synchronize_rcu(); i915_gem_drain_freed_objects(dev_priv); @@ -4751,9 +4711,7 @@ static const struct drm_info_list i915_debugfs_list[] = { {"i915_capabilities", i915_capabilities, 0}, {"i915_gem_objects", i915_gem_object_info, 0}, {"i915_gem_gtt", i915_gem_gtt_info, 0}, - {"i915_gem_pin_display", i915_gem_gtt_info, 0, (void *)1}, {"i915_gem_stolen", i915_gem_stolen_list_info }, - {"i915_gem_request", i915_gem_request_info, 0}, {"i915_gem_seqno", i915_gem_seqno_info, 0}, {"i915_gem_fence_regs", i915_gem_fence_regs_info, 0}, {"i915_gem_interrupt", i915_interrupt_info, 0}, @@ -4791,6 +4749,7 @@ static const struct drm_info_list i915_debugfs_list[] = { {"i915_dmc_info", i915_dmc_info, 0}, {"i915_display_info", i915_display_info, 0}, {"i915_engine_info", i915_engine_info, 0}, + {"i915_shrinker_info", i915_shrinker_info, 0}, {"i915_semaphore_status", i915_semaphore_status, 0}, {"i915_shared_dplls_info", i915_shared_dplls_info, 0}, {"i915_dp_mst_info", i915_dp_mst_info, 0}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c7b2ca6aff05..54b5d4c582b6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -80,8 +80,8 @@ #define DRIVER_NAME "i915" #define DRIVER_DESC "Intel Graphics" -#define DRIVER_DATE "20171012" -#define DRIVER_TIMESTAMP 1507831511 +#define DRIVER_DATE "20171023" +#define DRIVER_TIMESTAMP 1508748913 /* Use I915_STATE_WARN(x) and I915_STATE_WARN_ON() (rather than WARN() and * WARN_ON()) for hw state sanity checks to check for unexpected conditions @@ -785,7 +785,6 @@ struct intel_csr { func(has_logical_ring_contexts); \ func(has_logical_ring_preemption); \ func(has_overlay); \ - func(has_pipe_cxsr); \ func(has_pooled_eu); \ func(has_psr); \ func(has_rc6); \ @@ -1108,6 +1107,16 @@ struct intel_fbc { int src_w; int src_h; bool visible; + /* + * Display surface base address adjustment for + * pageflips. Note that on gen4+ this only adjusts up + * to a tile; offsets within a tile are handled in + * the hw itself (with the TILEOFF register). + */ + int adjusted_x; + int adjusted_y; + + int y; } plane; struct { @@ -1490,6 +1499,9 @@ struct i915_gem_mm { * always the inner lock when overlapping with struct_mutex. */ struct mutex stolen_lock; + /* Protects bound_list/unbound_list and #drm_i915_gem_object.mm.link */ + spinlock_t obj_lock; + /** List of all objects in gtt_space.
Used to restore gtt * mappings on resume */ struct list_head bound_list; @@ -1510,6 +1522,7 @@ struct i915_gem_mm { */ struct llist_head free_list; struct work_struct free_work; + spinlock_t free_lock; /** * Small stash of WC pages @@ -1765,6 +1778,8 @@ struct intel_vbt_data { u16 panel_id; struct mipi_config *config; struct mipi_pps_data *pps; + u16 bl_ports; + u16 cabc_ports; u8 seq_version; u32 size; u8 *data; @@ -1960,13 +1975,7 @@ struct i915_wa_reg { u32 mask; }; -/* - * RING_MAX_NONPRIV_SLOTS is per-engine but at this point we are only - * allowing it for RCS as we don't foresee any requirement of having - * a whitelist for other engines. When it is really required for - * other engines then the limit need to be increased. - */ -#define I915_MAX_WA_REGS (16 + RING_MAX_NONPRIV_SLOTS) +#define I915_MAX_WA_REGS 16 struct i915_workarounds { struct i915_wa_reg reg[I915_MAX_WA_REGS]; @@ -3077,6 +3086,7 @@ intel_info(const struct drm_i915_private *dev_priv) #define CNL_REVID_A0 0x0 #define CNL_REVID_B0 0x1 +#define CNL_REVID_C0 0x2 #define IS_CNL_REVID(p, since, until) \ (IS_CANNONLAKE(p) && IS_REVID(p, since, until)) @@ -3168,7 +3178,6 @@ intel_info(const struct drm_i915_private *dev_priv) #define I915_HAS_HOTPLUG(dev_priv) ((dev_priv)->info.has_hotplug) #define HAS_FW_BLC(dev_priv) (INTEL_GEN(dev_priv) > 2) -#define HAS_PIPE_CXSR(dev_priv) ((dev_priv)->info.has_pipe_cxsr) #define HAS_FBC(dev_priv) ((dev_priv)->info.has_fbc) #define HAS_CUR_FBC(dev_priv) (!HAS_GMCH_DISPLAY(dev_priv) && INTEL_INFO(dev_priv)->gen >= 7) @@ -3565,10 +3574,16 @@ i915_gem_object_pin_pages(struct drm_i915_gem_object *obj) return __i915_gem_object_get_pages(obj); } +static inline bool +i915_gem_object_has_pages(struct drm_i915_gem_object *obj) +{ + return !IS_ERR_OR_NULL(READ_ONCE(obj->mm.pages)); +} + static inline void __i915_gem_object_pin_pages(struct drm_i915_gem_object *obj) { - GEM_BUG_ON(!obj->mm.pages); + GEM_BUG_ON(!i915_gem_object_has_pages(obj)); atomic_inc(&obj->mm.pages_pin_count); } @@ -3582,8 +3597,8 @@ i915_gem_object_has_pinned_pages(struct drm_i915_gem_object *obj) static inline void __i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj) { + GEM_BUG_ON(!i915_gem_object_has_pages(obj)); GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); - GEM_BUG_ON(!obj->mm.pages); atomic_dec(&obj->mm.pages_pin_count); } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 20fcac37c85a..94b23fcbc989 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -56,7 +56,7 @@ static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj) if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)) return true; - return obj->pin_display; + return obj->pin_global; /* currently in use by HW, keep flushed */ } static int @@ -1240,7 +1240,23 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj, if (ret) return ret; - intel_runtime_pm_get(i915); + if (i915_gem_object_has_struct_page(obj)) { + /* + * Avoid waking the device up if we can fallback, as + * waking/resuming is very slow (worst-case 10-100 ms + * depending on PCI sleeps and our own resume time). + * This easily dwarfs any performance advantage from + * using the cache bypass of indirect GGTT access. 
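A minimal sketch of the wakeref policy the pwrite fast path adopts above, using only the helpers named in the hunk (the wrapper function itself is illustrative, not the real call site): take the runtime-PM wakeref opportunistically when a CPU fallback exists, and pay the full wakeup cost only when GGTT access is the sole option.

/* Illustrative sketch of the wakeref policy in i915_gem_gtt_pwrite_fast() */
static int ggtt_write_get_wakeref(struct drm_i915_private *i915,
				  struct drm_i915_gem_object *obj)
{
	if (i915_gem_object_has_struct_page(obj)) {
		/* A shmem fallback exists: don't wake the device for it */
		if (!intel_runtime_pm_get_if_in_use(i915))
			return -EFAULT;	/* caller falls back to the CPU path */
	} else {
		/* No backing struct pages: the GGTT is the only way in */
		intel_runtime_pm_get(i915);
	}
	return 0;	/* balanced by intel_runtime_pm_put() on completion */
}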
+ */ + if (!intel_runtime_pm_get_if_in_use(i915)) { + ret = -EFAULT; + goto out_unlock; + } + } else { + /* No backing pages, no fallback, we must force GGTT access */ + intel_runtime_pm_get(i915); + } + vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE | PIN_NONFAULT | @@ -1257,7 +1273,7 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj, if (IS_ERR(vma)) { ret = insert_mappable_node(ggtt, &node, PAGE_SIZE); if (ret) - goto out_unlock; + goto out_rpm; GEM_BUG_ON(!node.allocated); } @@ -1320,8 +1336,9 @@ out_unpin: } else { i915_vma_unpin(vma); } -out_unlock: +out_rpm: intel_runtime_pm_put(i915); +out_unlock: mutex_unlock(&i915->drm.struct_mutex); return ret; } @@ -1537,6 +1554,8 @@ static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj) struct list_head *list; struct i915_vma *vma; + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + list_for_each_entry(vma, &obj->vma_list, obj_link) { if (!i915_vma_is_ggtt(vma)) break; @@ -1551,8 +1570,10 @@ static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj) } i915 = to_i915(obj->base.dev); + spin_lock(&i915->mm.obj_lock); list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list; - list_move_tail(&obj->global_link, list); + list_move_tail(&obj->mm.link, list); + spin_unlock(&i915->mm.obj_lock); } /** @@ -2196,7 +2217,7 @@ void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj) struct address_space *mapping; lockdep_assert_held(&obj->mm.lock); - GEM_BUG_ON(obj->mm.pages); + GEM_BUG_ON(i915_gem_object_has_pages(obj)); switch (obj->mm.madv) { case I915_MADV_DONTNEED: @@ -2253,13 +2274,14 @@ static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj) void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj, enum i915_mm_subclass subclass) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); struct sg_table *pages; if (i915_gem_object_has_pinned_pages(obj)) return; GEM_BUG_ON(obj->bind_count); - if (!READ_ONCE(obj->mm.pages)) + if (!i915_gem_object_has_pages(obj)) return; /* May be called by shrinker from within get_pages() (on another bo) */ @@ -2273,6 +2295,10 @@ void __i915_gem_object_put_pages(struct drm_i915_gem_object *obj, pages = fetch_and_zero(&obj->mm.pages); GEM_BUG_ON(!pages); + spin_lock(&i915->mm.obj_lock); + list_del(&obj->mm.link); + spin_unlock(&i915->mm.obj_lock); + if (obj->mm.mapping) { void *ptr; @@ -2507,7 +2533,7 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, obj->mm.pages = pages; if (i915_gem_object_is_tiled(obj) && - to_i915(obj->base.dev)->quirks & QUIRK_PIN_SWIZZLED_PAGES) { + i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) { GEM_BUG_ON(obj->mm.quirked); __i915_gem_object_pin_pages(obj); obj->mm.quirked = true; @@ -2529,8 +2555,11 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, if (obj->mm.page_sizes.phys & ~0u << i) obj->mm.page_sizes.sg |= BIT(i); } - GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg)); + + spin_lock(&i915->mm.obj_lock); + list_add(&obj->mm.link, &i915->mm.unbound_list); + spin_unlock(&i915->mm.obj_lock); } static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj) @@ -2563,7 +2592,7 @@ int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj) if (err) return err; - if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) { + if (unlikely(!i915_gem_object_has_pages(obj))) { GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj)); err = ____i915_gem_object_get_pages(obj); @@ -2648,7 +2677,7 @@ void *i915_gem_object_pin_map(struct 
drm_i915_gem_object *obj, type &= ~I915_MAP_OVERRIDE; if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) { - if (unlikely(IS_ERR_OR_NULL(obj->mm.pages))) { + if (unlikely(!i915_gem_object_has_pages(obj))) { GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj)); ret = ____i915_gem_object_get_pages(obj); @@ -2660,7 +2689,7 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, atomic_inc(&obj->mm.pages_pin_count); pinned = false; } - GEM_BUG_ON(!obj->mm.pages); + GEM_BUG_ON(!i915_gem_object_has_pages(obj)); ptr = page_unpack_bits(obj->mm.mapping, &has_type); if (ptr && has_type != type) { @@ -2715,9 +2744,12 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj, * allows it to avoid the cost of retrieving a page (either swapin * or clearing-before-use) before it is overwritten. */ - if (READ_ONCE(obj->mm.pages)) + if (i915_gem_object_has_pages(obj)) return -ENODEV; + if (obj->mm.madv != I915_MADV_WILLNEED) + return -EFAULT; + /* Before the pages are instantiated the object is treated as being * in the CPU domain. The pages will be clflushed as required before * use, and we can freely write into the pages directly. If userspace @@ -3087,7 +3119,6 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv) static void nop_submit_request(struct drm_i915_gem_request *request) { - GEM_BUG_ON(!i915_terminally_wedged(&request->i915->gpu_error)); dma_fence_set_error(&request->fence, -EIO); i915_gem_request_submit(request); @@ -3097,7 +3128,6 @@ static void nop_complete_submit_request(struct drm_i915_gem_request *request) { unsigned long flags; - GEM_BUG_ON(!i915_terminally_wedged(&request->i915->gpu_error)); dma_fence_set_error(&request->fence, -EIO); spin_lock_irqsave(&request->engine->timeline->lock, flags); @@ -3495,7 +3525,7 @@ static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj) void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj) { - if (!READ_ONCE(obj->pin_display)) + if (!READ_ONCE(obj->pin_global)) return; mutex_lock(&obj->base.dev->struct_mutex); @@ -3862,10 +3892,10 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, lockdep_assert_held(&obj->base.dev->struct_mutex); - /* Mark the pin_display early so that we account for the + /* Mark the global pin early so that we account for the * display coherency whilst setting up the cache domains. */ - obj->pin_display++; + obj->pin_global++; /* The display engine is not coherent with the LLC cache on gen6. 
As * a result, we make sure that the pinning that is about to occur is @@ -3881,7 +3911,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, I915_CACHE_WT : I915_CACHE_NONE); if (ret) { vma = ERR_PTR(ret); - goto err_unpin_display; + goto err_unpin_global; } /* As the user may map the buffer once pinned in the display plane @@ -3912,7 +3942,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags); } if (IS_ERR(vma)) - goto err_unpin_display; + goto err_unpin_global; vma->display_alignment = max_t(u64, vma->display_alignment, alignment); @@ -3927,8 +3957,8 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, return vma; -err_unpin_display: - obj->pin_display--; +err_unpin_global: + obj->pin_global--; return vma; } @@ -3937,10 +3967,10 @@ i915_gem_object_unpin_from_display_plane(struct i915_vma *vma) { lockdep_assert_held(&vma->vm->i915->drm.struct_mutex); - if (WARN_ON(vma->obj->pin_display == 0)) + if (WARN_ON(vma->obj->pin_global == 0)) return; - if (--vma->obj->pin_display == 0) + if (--vma->obj->pin_global == 0) vma->display_alignment = I915_GTT_MIN_ALIGNMENT; /* Bump the LRU to try and avoid premature eviction whilst flipping */ @@ -4280,7 +4310,7 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data, if (err) goto out; - if (obj->mm.pages && + if (i915_gem_object_has_pages(obj) && i915_gem_object_is_tiled(obj) && dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) { if (obj->mm.madv == I915_MADV_WILLNEED) { @@ -4299,7 +4329,8 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data, obj->mm.madv = args->madv; /* if the object is no longer attached, discard its backing storage */ - if (obj->mm.madv == I915_MADV_DONTNEED && !obj->mm.pages) + if (obj->mm.madv == I915_MADV_DONTNEED && + !i915_gem_object_has_pages(obj)) i915_gem_object_truncate(obj); args->retained = obj->mm.madv != __I915_MADV_PURGED; @@ -4325,7 +4356,6 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, { mutex_init(&obj->mm.lock); - INIT_LIST_HEAD(&obj->global_link); INIT_LIST_HEAD(&obj->vma_list); INIT_LIST_HEAD(&obj->lut_list); INIT_LIST_HEAD(&obj->batch_pool_link); @@ -4480,13 +4510,14 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915, { struct drm_i915_gem_object *obj, *on; - mutex_lock(&i915->drm.struct_mutex); intel_runtime_pm_get(i915); - llist_for_each_entry(obj, freed, freed) { + llist_for_each_entry_safe(obj, on, freed, freed) { struct i915_vma *vma, *vn; trace_i915_gem_object_destroy(obj); + mutex_lock(&i915->drm.struct_mutex); + GEM_BUG_ON(i915_gem_object_is_active(obj)); list_for_each_entry_safe(vma, vn, &obj->vma_list, obj_link) { @@ -4497,14 +4528,20 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915, GEM_BUG_ON(!list_empty(&obj->vma_list)); GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree)); - list_del(&obj->global_link); - } - intel_runtime_pm_put(i915); - mutex_unlock(&i915->drm.struct_mutex); + /* This serializes freeing with the shrinker. Since the free + * is delayed, first by RCU then by the workqueue, we want the + * shrinker to be able to free pages of unreferenced objects, + * or else we may oom whilst there are plenty of deferred + * freed objects. 
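The comment just above is the crux of the new locking scheme: membership on i915->mm.{bound,unbound}_list is now governed by the obj_lock spinlock rather than struct_mutex, so the shrinker can keep reaping pages from objects whose freeing is still deferred behind RCU and the workqueue. Reduced to its essentials (identifiers as used in the hunks; the wrapper name is hypothetical, in the patch these lines live inline in __i915_gem_free_objects()):

/* An object stays visible to the shrinker until its pages are gone */
static void drop_object_link(struct drm_i915_private *i915,
			     struct drm_i915_gem_object *obj)
{
	if (i915_gem_object_has_pages(obj)) {
		spin_lock(&i915->mm.obj_lock);
		list_del_init(&obj->mm.link);	/* invisible to the shrinker */
		spin_unlock(&i915->mm.obj_lock);
	}
}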
+ */ + if (i915_gem_object_has_pages(obj)) { + spin_lock(&i915->mm.obj_lock); + list_del_init(&obj->mm.link); + spin_unlock(&i915->mm.obj_lock); + } - cond_resched(); + mutex_unlock(&i915->drm.struct_mutex); - llist_for_each_entry_safe(obj, on, freed, freed) { GEM_BUG_ON(obj->bind_count); GEM_BUG_ON(obj->userfault_count); GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits)); @@ -4516,7 +4553,7 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915, if (WARN_ON(i915_gem_object_has_pinned_pages(obj))) atomic_set(&obj->mm.pages_pin_count, 0); __i915_gem_object_put_pages(obj, I915_MM_NORMAL); - GEM_BUG_ON(obj->mm.pages); + GEM_BUG_ON(i915_gem_object_has_pages(obj)); if (obj->base.import_attach) drm_prime_gem_destroy(&obj->base, NULL); @@ -4527,16 +4564,29 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915, kfree(obj->bit_17); i915_gem_object_free(obj); + + if (on) + cond_resched(); } + intel_runtime_pm_put(i915); } static void i915_gem_flush_free_objects(struct drm_i915_private *i915) { struct llist_node *freed; - freed = llist_del_all(&i915->mm.free_list); - if (unlikely(freed)) + /* Free the oldest, most stale object to keep the free_list short */ + freed = NULL; + if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */ + /* Only one consumer of llist_del_first() allowed */ + spin_lock(&i915->mm.free_lock); + freed = llist_del_first(&i915->mm.free_list); + spin_unlock(&i915->mm.free_lock); + } + if (unlikely(freed)) { + freed->next = NULL; __i915_gem_free_objects(i915, freed); + } } static void __i915_gem_free_work(struct work_struct *work) @@ -4553,11 +4603,17 @@ static void __i915_gem_free_work(struct work_struct *work) * unbound now. */ + spin_lock(&i915->mm.free_lock); while ((freed = llist_del_all(&i915->mm.free_list))) { + spin_unlock(&i915->mm.free_lock); + __i915_gem_free_objects(i915, freed); if (need_resched()) - break; + return; + + spin_lock(&i915->mm.free_lock); } + spin_unlock(&i915->mm.free_lock); } static void __i915_gem_free_object_rcu(struct rcu_head *head) @@ -4837,6 +4893,10 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv) init_unused_rings(dev_priv); BUG_ON(!dev_priv->kernel_context); + if (i915_terminally_wedged(&dev_priv->gpu_error)) { + ret = -EIO; + goto out; + } ret = i915_ppgtt_init_hw(dev_priv); if (ret) { @@ -4935,8 +4995,10 @@ int i915_gem_init(struct drm_i915_private *dev_priv) * wedged. But we only want to do this where the GPU is angry, * for any other failure, such as an allocation failure, bail.
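Two llist consumer patterns appear in the free-path hunks above, condensed here as a sketch. The flush path pops a single stale entry; since llist_del_first() permits only one concurrent consumer, it must run under free_lock. The worker drains whole batches, but drops the lock around the actual freeing so that producers (llist_add() callers) are never blocked:

/* Single-entry pop: one consumer at a time for llist_del_first() */
spin_lock(&i915->mm.free_lock);
freed = llist_del_first(&i915->mm.free_list);
spin_unlock(&i915->mm.free_lock);
if (freed) {
	freed->next = NULL;	/* detach from the remainder of the list */
	__i915_gem_free_objects(i915, freed);
}

/* Batch drain: free outside the lock, reacquire to grab the next batch */
spin_lock(&i915->mm.free_lock);
while ((freed = llist_del_all(&i915->mm.free_list))) {
	spin_unlock(&i915->mm.free_lock);
	__i915_gem_free_objects(i915, freed);
	spin_lock(&i915->mm.free_lock);
}
spin_unlock(&i915->mm.free_lock);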
*/ - DRM_ERROR("Failed to initialize GPU, declaring it wedged\n"); - i915_gem_set_wedged(dev_priv); + if (!i915_terminally_wedged(&dev_priv->gpu_error)) { + DRM_ERROR("Failed to initialize GPU, declaring it wedged\n"); + i915_gem_set_wedged(dev_priv); + } ret = 0; } @@ -5036,11 +5098,15 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) goto err_priorities; INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work); + + spin_lock_init(&dev_priv->mm.obj_lock); + spin_lock_init(&dev_priv->mm.free_lock); init_llist_head(&dev_priv->mm.free_list); INIT_LIST_HEAD(&dev_priv->mm.unbound_list); INIT_LIST_HEAD(&dev_priv->mm.bound_list); INIT_LIST_HEAD(&dev_priv->mm.fence_list); INIT_LIST_HEAD(&dev_priv->mm.userfault_list); + INIT_DELAYED_WORK(&dev_priv->gt.retire_work, i915_gem_retire_work_handler); INIT_DELAYED_WORK(&dev_priv->gt.idle_work, @@ -5134,12 +5200,12 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv) i915_gem_shrink(dev_priv, -1UL, NULL, I915_SHRINK_UNBOUND); i915_gem_drain_freed_objects(dev_priv); - mutex_lock(&dev_priv->drm.struct_mutex); + spin_lock(&dev_priv->mm.obj_lock); for (p = phases; *p; p++) { - list_for_each_entry(obj, *p, global_link) + list_for_each_entry(obj, *p, mm.link) __start_cpu_write(obj); } - mutex_unlock(&dev_priv->drm.struct_mutex); + spin_unlock(&dev_priv->mm.obj_lock); return 0; } @@ -5458,7 +5524,17 @@ int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align) goto err_unlock; } - pages = obj->mm.pages; + pages = fetch_and_zero(&obj->mm.pages); + if (pages) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); + + __i915_gem_object_reset_page_iter(obj); + + spin_lock(&i915->mm.obj_lock); + list_del(&obj->mm.link); + spin_unlock(&i915->mm.obj_lock); + } + obj->ops = &i915_gem_phys_ops; err = ____i915_gem_object_get_pages(obj); diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c index 8a04d33055be..f663cd919795 100644 --- a/drivers/gpu/drm/i915/i915_gem_clflush.c +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c @@ -70,6 +70,7 @@ static const struct dma_fence_ops i915_clflush_ops = { static void __i915_do_clflush(struct drm_i915_gem_object *obj) { + GEM_BUG_ON(!i915_gem_object_has_pages(obj)); drm_clflush_sg(obj->mm.pages); intel_fb_obj_flush(obj, ORIGIN_CPU); } diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 5bf96a258509..e304dcbc6042 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -106,14 +106,9 @@ static void lut_close(struct i915_gem_context *ctx) radix_tree_for_each_slot(slot, &ctx->handles_vma, &iter, 0) { struct i915_vma *vma = rcu_dereference_raw(*slot); - struct drm_i915_gem_object *obj = vma->obj; radix_tree_iter_delete(&ctx->handles_vma, &iter, slot); - - if (!i915_vma_is_ggtt(vma)) - i915_vma_close(vma); - - __i915_gem_object_release_unless_active(obj); + __i915_gem_object_release_unless_active(vma->obj); } } @@ -198,6 +193,11 @@ static void context_close(struct i915_gem_context *ctx) { i915_gem_context_set_closed(ctx); + /* + * The LUT uses the VMA as a backpointer to unref the object, + * so we need to clear the LUT before we close all the VMA (inside + * the ppgtt). 
+ */ lut_close(ctx); if (ctx->ppgtt) i915_ppgtt_close(&ctx->ppgtt->base); diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c index a5a5b7e6daae..8daa8a78cdc0 100644 --- a/drivers/gpu/drm/i915/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/i915_gem_evict.c @@ -33,21 +33,24 @@ #include "intel_drv.h" #include "i915_trace.h" -static bool ggtt_is_idle(struct drm_i915_private *dev_priv) +I915_SELFTEST_DECLARE(static struct igt_evict_ctl { + bool fail_if_busy:1; +} igt_evict_ctl;) + +static bool ggtt_is_idle(struct drm_i915_private *i915) { - struct i915_ggtt *ggtt = &dev_priv->ggtt; - struct intel_engine_cs *engine; - enum intel_engine_id id; + struct intel_engine_cs *engine; + enum intel_engine_id id; - for_each_engine(engine, dev_priv, id) { - struct intel_timeline *tl; + if (i915->gt.active_requests) + return false; - tl = &ggtt->base.timeline.engine[engine->id]; - if (i915_gem_active_isset(&tl->last_request)) - return false; - } + for_each_engine(engine, i915, id) { + if (engine->last_retired_context != i915->kernel_context) + return false; + } - return true; + return true; } static int ggtt_flush(struct drm_i915_private *i915) @@ -157,7 +160,8 @@ i915_gem_evict_something(struct i915_address_space *vm, min_size, alignment, cache_level, start, end, mode); - /* Retire before we search the active list. Although we have + /* + * Retire before we search the active list. Although we have * reasonable accuracy in our retirement lists, we may have * a stray pin (preventing eviction) that can only be resolved by * retiring. @@ -182,7 +186,8 @@ search_again: BUG_ON(ret); } - /* Can we unpin some objects such as idle hw contents, + /* + * Can we unpin some objects such as idle hw contents, * or pending flips? But since only the GGTT has global entries * such as scanouts, ringbuffers and contexts, we can skip the * purge when inspecting per-process local address spaces. @@ -190,19 +195,36 @@ search_again: if (!i915_is_ggtt(vm) || flags & PIN_NONBLOCK) return -ENOSPC; - if (ggtt_is_idle(dev_priv)) { - /* If we still have pending pageflip completions, drop - * back to userspace to give our workqueues time to - * acquire our locks and unpin the old scanouts. - */ - return intel_has_pending_fb_unpin(dev_priv) ? -EAGAIN : -ENOSPC; - } + /* + * Not everything in the GGTT is tracked via VMA using + * i915_vma_move_to_active(), otherwise we could evict as required + * with minimal stalling. Instead we are forced to idle the GPU and + * explicitly retire outstanding requests which will then remove + * the pinning for active objects such as contexts and rings, + * enabling us to evict them on the next iteration. + * + * To ensure that all user contexts are evictable, we perform + * a switch to the perma-pinned kernel context. This also gives + * us a termination condition: when the last retired context is + * the kernel's, there is nothing more we can evict. + */ + if (!ggtt_is_idle(dev_priv)) { + if (I915_SELFTEST_ONLY(igt_evict_ctl.fail_if_busy)) + return -EBUSY; + + ret = ggtt_flush(dev_priv); + if (ret) + return ret; - ret = ggtt_flush(dev_priv); - if (ret) - return ret; + goto search_again; + } - goto search_again; + /* + * If we still have pending pageflip completions, drop + * back to userspace to give our workqueues time to + * acquire our locks and unpin the old scanouts. + */ + return intel_has_pending_fb_unpin(dev_priv) ?
-EAGAIN : -ENOSPC; found: /* drm_mm doesn't allow any other operations while diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index daba55a4fce2..5eaa6893daaa 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1341,7 +1341,7 @@ static int gen8_ppgtt_alloc_pd(struct i915_address_space *vm, if (IS_ERR(pt)) goto unwind; - if (count < GEN8_PTES) + if (count < GEN8_PTES || intel_vgpu_active(vm->i915)) gen8_initialize_pt(vm, pt); gen8_ppgtt_set_pde(vm, pd, pt, pde); @@ -3594,8 +3594,7 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv) ggtt->base.closed = true; /* skip rewriting PTE on VMA unbind */ /* clflush objects bound into the GGTT and rebind them. */ - list_for_each_entry_safe(obj, on, - &dev_priv->mm.bound_list, global_link) { + list_for_each_entry_safe(obj, on, &dev_priv->mm.bound_list, mm.link) { bool ggtt_bound = false; struct i915_vma *vma; diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index d67f1cbe842d..63ce38c1cce9 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -114,7 +114,6 @@ struct drm_i915_gem_object { /** Stolen memory for this object, instead of being backed by shmem. */ struct drm_mm_node *stolen; - struct list_head global_link; union { struct rcu_head rcu; struct llist_node freed; @@ -161,7 +160,8 @@ struct drm_i915_gem_object { /** Count of VMA actually bound by this object */ unsigned int bind_count; unsigned int active_count; - unsigned int pin_display; + /** Count of how many global VMA are currently pinned for use by HW */ + unsigned int pin_global; struct { struct mutex lock; /* protects the pages and their use */ @@ -208,6 +208,12 @@ struct drm_i915_gem_object { } get_page; /** + * Element within i915->mm.unbound_list or i915->mm.bound_list, + * locked by i915->mm.obj_lock. + */ + struct list_head link; + + /** * Advice: are the backing pages purgeable? */ unsigned int madv:2; diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c index 4dd4c2159a92..3703dc91eeda 100644 --- a/drivers/gpu/drm/i915/i915_gem_render_state.c +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c @@ -229,7 +229,7 @@ int i915_gem_render_state_emit(struct drm_i915_gem_request *req) return 0; /* Recreate the page after shrinking */ - if (!so->vma->obj->mm.pages) + if (!i915_gem_object_has_pages(so->vma->obj)) so->batch_offset = -1; ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH); diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c index 74002b2d1b6f..3770e3323fc8 100644 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c @@ -71,25 +71,6 @@ static void shrinker_unlock(struct drm_i915_private *dev_priv, bool unlock) mutex_unlock(&dev_priv->drm.struct_mutex); } -static bool any_vma_pinned(struct drm_i915_gem_object *obj) -{ - struct i915_vma *vma; - - list_for_each_entry(vma, &obj->vma_list, obj_link) { - /* Only GGTT vma may be permanently pinned, and are always - * at the start of the list. We can stop hunting as soon - * as we see a ppGTT vma.
- */ - if (!i915_vma_is_ggtt(vma)) - break; - - if (i915_vma_is_pinned(vma)) - return true; - } - - return false; -} - static bool swap_available(void) { return get_nr_swap_pages() > 0; @@ -97,9 +78,6 @@ static bool swap_available(void) static bool can_release_pages(struct drm_i915_gem_object *obj) { - if (!obj->mm.pages) - return false; - /* Consider only shrinkable objects. */ if (!i915_gem_object_is_shrinkable(obj)) return false; @@ -115,7 +93,13 @@ static bool can_release_pages(struct drm_i915_gem_object *obj) if (atomic_read(&obj->mm.pages_pin_count) > obj->bind_count) return false; - if (any_vma_pinned(obj)) + /* If any vma are "permanently" pinned, it will prevent us from + * reclaiming the obj->mm.pages. We only allow scanout objects to claim + * a permanent pin, along with a few others like the context objects. + * To simplify the scan, and to avoid walking the list of vma under the + * object, we just check the count of its permanent pins. + */ + if (READ_ONCE(obj->pin_global)) return false; /* We can only return physical pages to the system if we can either @@ -129,7 +113,7 @@ static bool unsafe_drop_pages(struct drm_i915_gem_object *obj) { if (i915_gem_object_unbind(obj) == 0) __i915_gem_object_put_pages(obj, I915_MM_SHRINKER); - return !READ_ONCE(obj->mm.pages); + return !i915_gem_object_has_pages(obj); } /** @@ -178,6 +162,18 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, if (!shrinker_lock(dev_priv, &unlock)) return 0; + /* + * When shrinking the active list, also consider active contexts. + * Active contexts are pinned until they are retired, and so cannot + * be simply unbound to retire and unpin their pages. To shrink + * the contexts, we must wait until the gpu is idle. + * + * We don't care about errors here; if we cannot wait upon the GPU, + * we will free as much as we can and hope to get a second chance. + */ + if (flags & I915_SHRINK_ACTIVE) + i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED); + trace_i915_gem_shrink(dev_priv, target, flags); i915_gem_retire_requests(dev_priv); @@ -217,15 +213,20 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, continue; INIT_LIST_HEAD(&still_in_list); + + /* + * We serialize our access to unreferenced objects through + * the use of the struct_mutex. While the objects are not + * yet freed (due to RCU then a workqueue) we still want + * to be able to shrink their pages, so they remain on + * the unbound/bound list until actually freed.
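The rewritten scan loop that follows is an instance of a common kernel idiom: you cannot sleep under a spinlock, so each candidate is first parked on a private still_in_list (guaranteeing forward progress), the lock is dropped around the blocking work, then reacquired to fetch the next candidate. A skeleton of the shape, with the count-versus-target termination condition elided:

INIT_LIST_HEAD(&still_in_list);

spin_lock(&i915->mm.obj_lock);
while ((obj = list_first_entry_or_null(phase->list,
				       typeof(*obj), mm.link))) {
	/* Park the object so this pass never revisits it */
	list_move_tail(&obj->mm.link, &still_in_list);

	if (!can_release_pages(obj))
		continue;

	spin_unlock(&i915->mm.obj_lock); /* unsafe_drop_pages() may sleep */
	unsafe_drop_pages(obj);
	spin_lock(&i915->mm.obj_lock);
}
list_splice_tail(&still_in_list, phase->list); /* restore survivors */
spin_unlock(&i915->mm.obj_lock);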
+ */ + spin_lock(&dev_priv->mm.obj_lock); while (count < target && (obj = list_first_entry_or_null(phase->list, typeof(*obj), - global_link))) { - list_move_tail(&obj->global_link, &still_in_list); - if (!obj->mm.pages) { - list_del_init(&obj->global_link); - continue; - } + mm.link))) { + list_move_tail(&obj->mm.link, &still_in_list); if (flags & I915_SHRINK_PURGEABLE && obj->mm.madv != I915_MADV_DONTNEED) @@ -243,20 +244,24 @@ i915_gem_shrink(struct drm_i915_private *dev_priv, if (!can_release_pages(obj)) continue; + spin_unlock(&dev_priv->mm.obj_lock); + if (unsafe_drop_pages(obj)) { /* May arrive from get_pages on another bo */ mutex_lock_nested(&obj->mm.lock, I915_MM_SHRINKER); - if (!obj->mm.pages) { + if (!i915_gem_object_has_pages(obj)) { __i915_gem_object_invalidate(obj); - list_del_init(&obj->global_link); count += obj->base.size >> PAGE_SHIFT; } mutex_unlock(&obj->mm.lock); - scanned += obj->base.size >> PAGE_SHIFT; } + scanned += obj->base.size >> PAGE_SHIFT; + + spin_lock(&dev_priv->mm.obj_lock); } list_splice_tail(&still_in_list, phase->list); + spin_unlock(&dev_priv->mm.obj_lock); } if (flags & I915_SHRINK_BOUND) @@ -302,28 +307,39 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv) static unsigned long i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct drm_i915_private *dev_priv = + struct drm_i915_private *i915 = container_of(shrinker, struct drm_i915_private, mm.shrinker); struct drm_i915_gem_object *obj; - unsigned long count; - bool unlock; - - if (!shrinker_lock(dev_priv, &unlock)) - return 0; - - i915_gem_retire_requests(dev_priv); + unsigned long num_objects = 0; + unsigned long count = 0; - count = 0; - list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) - if (can_release_pages(obj)) + spin_lock(&i915->mm.obj_lock); + list_for_each_entry(obj, &i915->mm.unbound_list, mm.link) + if (can_release_pages(obj)) { count += obj->base.size >> PAGE_SHIFT; + num_objects++; + } - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) { - if (!i915_gem_object_is_active(obj) && can_release_pages(obj)) + list_for_each_entry(obj, &i915->mm.bound_list, mm.link) + if (!i915_gem_object_is_active(obj) && can_release_pages(obj)) { count += obj->base.size >> PAGE_SHIFT; - } + num_objects++; + } + spin_unlock(&i915->mm.obj_lock); - shrinker_unlock(dev_priv, unlock); + /* Update our preferred vmscan batch size for the next pass. + * Our rough guess for an effective batch size is roughly 2 + * available GEM objects' worth of pages. That is, we don't want + * the shrinker to fire until it is worth the cost of freeing an + * entire GEM object. + */ + if (num_objects) { + unsigned long avg = 2 * count / num_objects; + + i915->mm.shrinker.batch = + max((i915->mm.shrinker.batch + avg) >> 1, + 128ul /* default SHRINK_BATCH */); + } return count; } @@ -400,10 +416,6 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr) container_of(nb, struct drm_i915_private, mm.oom_notifier); struct drm_i915_gem_object *obj; unsigned long unevictable, bound, unbound, freed_pages; - bool unlock; - - if (!shrinker_lock_uninterruptible(dev_priv, &unlock, 5000)) - return NOTIFY_DONE; freed_pages = i915_gem_shrink_all(dev_priv); @@ -412,26 +424,20 @@ i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr) * being pointed to by hardware.
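A worked example of the batch auto-tuning just added to i915_gem_shrinker_count(): the target batch is two average objects' worth of pages, smoothed 50/50 with the previous value and floored at the default SHRINK_BATCH. With 1000 shrinkable objects totalling 2,560,000 reclaimable pages and a previous batch of 4096 (the new initialisation value), the arithmetic runs:

unsigned long count = 2560000, num_objects = 1000, batch = 4096;
unsigned long avg = 2 * count / num_objects;	/* 2 * 2560 = 5120 pages */

batch = max((batch + avg) >> 1,		/* (4096 + 5120) / 2 = 4608 */
	    128ul /* default SHRINK_BATCH */);

So vmscan will subsequently ask this shrinker for work in ~4608-page chunks, i.e. the shrinker does not fire until roughly two whole objects can be freed.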
*/ unbound = bound = unevictable = 0; - list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_link) { - if (!obj->mm.pages) - continue; - + spin_lock(&dev_priv->mm.obj_lock); + list_for_each_entry(obj, &dev_priv->mm.unbound_list, mm.link) { if (!can_release_pages(obj)) unevictable += obj->base.size >> PAGE_SHIFT; else unbound += obj->base.size >> PAGE_SHIFT; } - list_for_each_entry(obj, &dev_priv->mm.bound_list, global_link) { - if (!obj->mm.pages) - continue; - + list_for_each_entry(obj, &dev_priv->mm.bound_list, mm.link) { if (!can_release_pages(obj)) unevictable += obj->base.size >> PAGE_SHIFT; else bound += obj->base.size >> PAGE_SHIFT; } - - shrinker_unlock(dev_priv, unlock); + spin_unlock(&dev_priv->mm.obj_lock); if (freed_pages || unbound || bound) pr_info("Purging GPU memory, %lu pages freed, " @@ -498,6 +504,7 @@ void i915_gem_shrinker_init(struct drm_i915_private *dev_priv) dev_priv->mm.shrinker.scan_objects = i915_gem_shrinker_scan; dev_priv->mm.shrinker.count_objects = i915_gem_shrinker_count; dev_priv->mm.shrinker.seeks = DEFAULT_SEEKS; + dev_priv->mm.shrinker.batch = 4096; WARN_ON(register_shrinker(&dev_priv->mm.shrinker)); dev_priv->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom; diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c index 54fd4cfa9d07..03e7abc7e043 100644 --- a/drivers/gpu/drm/i915/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c @@ -724,8 +724,11 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv vma->flags |= I915_VMA_GLOBAL_BIND; __i915_vma_set_map_and_fenceable(vma); list_move_tail(&vma->vm_link, &ggtt->base.inactive_list); - list_move_tail(&obj->global_link, &dev_priv->mm.bound_list); + + spin_lock(&dev_priv->mm.obj_lock); + list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list); obj->bind_count++; + spin_unlock(&dev_priv->mm.obj_lock); return obj; diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index fb5231f98c0d..1294cf695df0 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -269,7 +269,7 @@ i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, * due to the change in swizzling. 
*/ mutex_lock(&obj->mm.lock); - if (obj->mm.pages && + if (i915_gem_object_has_pages(obj) && obj->mm.madv == I915_MADV_WILLNEED && i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) { if (tiling == I915_TILING_NONE) { diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 4d712a4db63b..e26b23171b56 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -82,11 +82,11 @@ static void cancel_userptr(struct work_struct *work) /* We are inside a kthread context and can't be interrupted */ if (i915_gem_object_unbind(obj) == 0) __i915_gem_object_put_pages(obj, I915_MM_NORMAL); - WARN_ONCE(obj->mm.pages, - "Failed to release pages: bind_count=%d, pages_pin_count=%d, pin_display=%d\n", + WARN_ONCE(i915_gem_object_has_pages(obj), + "Failed to release pages: bind_count=%d, pages_pin_count=%d, pin_global=%d\n", obj->bind_count, atomic_read(&obj->mm.pages_pin_count), - obj->pin_display); + obj->pin_global); mutex_unlock(&obj->base.dev->struct_mutex); @@ -221,15 +221,17 @@ i915_mmu_notifier_find(struct i915_mm_struct *mm) /* Protected by mm_lock */ mm->mn = fetch_and_zero(&mn); } - } else { - /* someone else raced and successfully installed the mmu - * notifier, we can cancel our own errors */ + } else if (mm->mn) { + /* + * Someone else raced and successfully installed the mmu + * notifier, we can cancel our own errors. + */ err = 0; } mutex_unlock(&mm->i915->mm_lock); up_write(&mm->mm->mmap_sem); - if (mn) { + if (mn && !IS_ERR(mn)) { destroy_workqueue(mn->wq); kfree(mn); } diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index a2e8114b739d..f84c267728fd 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -610,6 +610,7 @@ done: execlists->first = rb; if (submit) { port_assign(port, last); + execlists_set_active(execlists, EXECLISTS_ACTIVE_USER); i915_guc_submit(engine); } spin_unlock_irq(&engine->timeline->lock); @@ -633,6 +634,8 @@ static void i915_guc_irq_handler(unsigned long data) rq = port_request(&port[0]); } + if (!rq) + execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER); if (!port_isset(last_port)) i915_guc_dequeue(engine); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index b1296a55c1e4..f8205841868b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1388,8 +1388,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift) bool tasklet = false; if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) { - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); - tasklet = true; + if (READ_ONCE(engine->execlists.active)) { + __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + tasklet = true; + } } if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) { diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index bf467f30c99b..6458c309c039 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -193,7 +193,6 @@ static const struct intel_device_info intel_i965gm_info __initconst = { static const struct intel_device_info intel_g45_info __initconst = { GEN4_FEATURES, .platform = INTEL_G45, - .has_pipe_cxsr = 1, .ring_mask = RENDER_RING | BSD_RING, }; @@ -201,7 +200,6 @@ static const struct intel_device_info intel_gm45_info __initconst = { GEN4_FEATURES, .platform = INTEL_GM45, .is_mobile = 1, .has_fbc = 1, - .has_pipe_cxsr = 1, .supports_tv = 1, .ring_mask = 
RENDER_RING | BSD_RING, }; @@ -645,7 +643,7 @@ static void i915_pci_remove(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); i915_driver_unload(dev); - drm_dev_unref(dev); + drm_dev_put(dev); } static int i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 1383a2995a69..59ee808f8fd9 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -2537,6 +2537,10 @@ static const struct file_operations fops = { .poll = i915_perf_poll, .read = i915_perf_read, .unlocked_ioctl = i915_perf_ioctl, + /* Our ioctls have no arguments, so it's safe to use the same function + * to handle 32-bit compatibility. + */ + .compat_ioctl = i915_perf_ioctl, }; diff --git a/drivers/gpu/drm/i915/i915_pvinfo.h b/drivers/gpu/drm/i915/i915_pvinfo.h index 0679a58cdbae..195203f298df 100644 --- a/drivers/gpu/drm/i915/i915_pvinfo.h +++ b/drivers/gpu/drm/i915/i915_pvinfo.h @@ -53,6 +53,7 @@ enum vgt_g2v_type { * VGT capabilities type */ #define VGT_CAPS_FULL_48BIT_PPGTT BIT(2) +#define VGT_CAPS_HWSP_EMULATION BIT(3) struct vgt_if { u64 magic; /* VGT_MAGIC */ diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index d2d0a83c09b6..68a58cce6ab1 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -5242,7 +5242,7 @@ enum { #define DP_AUX_CH_CTL_TIME_OUT_400us (0 << 26) #define DP_AUX_CH_CTL_TIME_OUT_600us (1 << 26) #define DP_AUX_CH_CTL_TIME_OUT_800us (2 << 26) -#define DP_AUX_CH_CTL_TIME_OUT_1600us (3 << 26) +#define DP_AUX_CH_CTL_TIME_OUT_MAX (3 << 26) /* Varies per platform */ #define DP_AUX_CH_CTL_TIME_OUT_MASK (3 << 26) #define DP_AUX_CH_CTL_RECEIVE_ERROR (1 << 25) #define DP_AUX_CH_CTL_MESSAGE_SIZE_MASK (0x1f << 20) @@ -7041,6 +7041,7 @@ enum { */ #define L3_GENERAL_PRIO_CREDITS(x) (((x) >> 1) << 19) #define L3_HIGH_PRIO_CREDITS(x) (((x) >> 1) << 14) +#define L3_PRIO_CREDITS_MASK ((0x1f << 19) | (0x1f << 14)) #define GEN7_L3CNTLREG1 _MMIO(0xB01C) #define GEN7_WA_FOR_GEN7_L3_CONTROL 0x3C47FF8C diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index 808ea4d5b962..e8ca67a129d2 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -41,6 +41,11 @@ static inline void debug_fence_init(struct i915_sw_fence *fence) debug_object_init(fence, &i915_sw_fence_debug_descr); } +static inline void debug_fence_init_onstack(struct i915_sw_fence *fence) +{ + debug_object_init_on_stack(fence, &i915_sw_fence_debug_descr); +} + static inline void debug_fence_activate(struct i915_sw_fence *fence) { debug_object_activate(fence, &i915_sw_fence_debug_descr); @@ -79,6 +84,10 @@ static inline void debug_fence_init(struct i915_sw_fence *fence) { } +static inline void debug_fence_init_onstack(struct i915_sw_fence *fence) +{ +} + static inline void debug_fence_activate(struct i915_sw_fence *fence) { } @@ -360,9 +369,9 @@ struct i915_sw_dma_fence_cb { struct irq_work work; }; -static void timer_i915_sw_fence_wake(unsigned long data) +static void timer_i915_sw_fence_wake(struct timer_list *t) { - struct i915_sw_dma_fence_cb *cb = (struct i915_sw_dma_fence_cb *)data; + struct i915_sw_dma_fence_cb *cb = from_timer(cb, t, timer); struct i915_sw_fence *fence; fence = xchg(&cb->fence, NULL); @@ -425,9 +434,7 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence, i915_sw_fence_await(fence); cb->dma = NULL; - __setup_timer(&cb->timer, - timer_i915_sw_fence_wake,
(unsigned long)cb, - TIMER_IRQSAFE); + timer_setup(&cb->timer, timer_i915_sw_fence_wake, TIMER_IRQSAFE); init_irq_work(&cb->work, irq_i915_sw_fence_work); if (timeout) { cb->dma = dma_fence_get(dma); @@ -507,5 +514,6 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence, } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftests/lib_sw_fence.c" #include "selftests/i915_sw_fence.c" #endif diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h index b72bd2956b70..bb8338450dc1 100644 --- a/drivers/gpu/drm/i915/i915_vgpu.h +++ b/drivers/gpu/drm/i915/i915_vgpu.h @@ -30,6 +30,12 @@ void i915_check_vgpu(struct drm_i915_private *dev_priv); bool intel_vgpu_has_full_48bit_ppgtt(struct drm_i915_private *dev_priv); +static inline bool +intel_vgpu_has_hwsp_emulation(struct drm_i915_private *dev_priv) +{ + return dev_priv->vgpu.caps & VGT_CAPS_HWSP_EMULATION; +} + int intel_vgt_balloon(struct drm_i915_private *dev_priv); void intel_vgt_deballoon(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 4dce2e0197d9..fbfab2f33023 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -54,12 +54,21 @@ i915_vma_retire(struct i915_gem_active *active, if (--obj->active_count) return; + /* Prune the shared fence arrays iff completely idle (inc. external) */ + if (reservation_object_trylock(obj->resv)) { + if (reservation_object_test_signaled_rcu(obj->resv, true)) + reservation_object_add_excl_fence(obj->resv, NULL); + reservation_object_unlock(obj->resv); + } + /* Bump our place on the bound list to keep it roughly in LRU order * so that we don't steal from recently used but inactive objects * (unless we are forced to ofc!) */ + spin_lock(&rq->i915->mm.obj_lock); if (obj->bind_count) - list_move_tail(&obj->global_link, &rq->i915->mm.bound_list); + list_move_tail(&obj->mm.link, &rq->i915->mm.bound_list); + spin_unlock(&rq->i915->mm.obj_lock); obj->mm.dirty = true; /* be paranoid */ @@ -563,9 +572,13 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags) GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, obj->cache_level)); - list_move_tail(&obj->global_link, &dev_priv->mm.bound_list); list_move_tail(&vma->vm_link, &vma->vm->inactive_list); + + spin_lock(&dev_priv->mm.obj_lock); + list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list); obj->bind_count++; + spin_unlock(&dev_priv->mm.obj_lock); + GEM_BUG_ON(atomic_read(&obj->mm.pages_pin_count) < obj->bind_count); return 0; @@ -580,6 +593,7 @@ err_unpin: static void i915_vma_remove(struct i915_vma *vma) { + struct drm_i915_private *i915 = vma->vm->i915; struct drm_i915_gem_object *obj = vma->obj; GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); @@ -593,9 +607,10 @@ i915_vma_remove(struct i915_vma *vma) /* Since the unbound list is global, only move to that list if * no more VMAs exist. 
*/ + spin_lock(&i915->mm.obj_lock); if (--obj->bind_count == 0) - list_move_tail(&obj->global_link, - &to_i915(obj->base.dev)->mm.unbound_list); + list_move_tail(&obj->mm.link, &i915->mm.unbound_list); + spin_unlock(&i915->mm.obj_lock); /* And finally now the object is completely decoupled from this vma, * we can drop its hold on the backing storage and allow it to be diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 9d5b42953436..fd23023df7c1 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -691,6 +691,48 @@ parse_psr(struct drm_i915_private *dev_priv, const struct bdb_header *bdb) dev_priv->vbt.psr.tp2_tp3_wakeup_time = psr_table->tp2_tp3_wakeup_time; } +static void parse_dsi_backlight_ports(struct drm_i915_private *dev_priv, + u16 version, enum port port) +{ + if (!dev_priv->vbt.dsi.config->dual_link || version < 197) { + dev_priv->vbt.dsi.bl_ports = BIT(port); + if (dev_priv->vbt.dsi.config->cabc_supported) + dev_priv->vbt.dsi.cabc_ports = BIT(port); + + return; + } + + switch (dev_priv->vbt.dsi.config->dl_dcs_backlight_ports) { + case DL_DCS_PORT_A: + dev_priv->vbt.dsi.bl_ports = BIT(PORT_A); + break; + case DL_DCS_PORT_C: + dev_priv->vbt.dsi.bl_ports = BIT(PORT_C); + break; + default: + case DL_DCS_PORT_A_AND_C: + dev_priv->vbt.dsi.bl_ports = BIT(PORT_A) | BIT(PORT_C); + break; + } + + if (!dev_priv->vbt.dsi.config->cabc_supported) + return; + + switch (dev_priv->vbt.dsi.config->dl_dcs_cabc_ports) { + case DL_DCS_PORT_A: + dev_priv->vbt.dsi.cabc_ports = BIT(PORT_A); + break; + case DL_DCS_PORT_C: + dev_priv->vbt.dsi.cabc_ports = BIT(PORT_C); + break; + default: + case DL_DCS_PORT_A_AND_C: + dev_priv->vbt.dsi.cabc_ports = + BIT(PORT_A) | BIT(PORT_C); + break; + } +} + static void parse_mipi_config(struct drm_i915_private *dev_priv, const struct bdb_header *bdb) @@ -699,9 +741,10 @@ parse_mipi_config(struct drm_i915_private *dev_priv, const struct mipi_config *config; const struct mipi_pps_data *pps; int panel_type = dev_priv->vbt.panel_type; + enum port port; /* parse MIPI blocks only if LFP type is MIPI */ - if (!intel_bios_is_dsi_present(dev_priv, NULL)) + if (!intel_bios_is_dsi_present(dev_priv, &port)) return; /* Initialize this to undefined indicating no generic MIPI support */ @@ -742,15 +785,7 @@ parse_mipi_config(struct drm_i915_private *dev_priv, return; } - /* - * These fields are introduced from the VBT version 197 onwards, - * so making sure that these bits are set zero in the previous - * versions. - */ - if (dev_priv->vbt.dsi.config->dual_link && bdb->version < 197) { - dev_priv->vbt.dsi.config->dl_dcs_cabc_ports = 0; - dev_priv->vbt.dsi.config->dl_dcs_backlight_ports = 0; - } + parse_dsi_backlight_ports(dev_priv, bdb->version, port); /* We have mandatory mipi config blocks. 
Initialize as generic panel */ dev_priv->vbt.dsi.panel_id = MIPI_DSI_GENERIC_PANEL_ID; @@ -1071,6 +1106,22 @@ static void sanitize_aux_ch(struct drm_i915_private *dev_priv, } } +static const u8 cnp_ddc_pin_map[] = { + [DDC_BUS_DDI_B] = GMBUS_PIN_1_BXT, + [DDC_BUS_DDI_C] = GMBUS_PIN_2_BXT, + [DDC_BUS_DDI_D] = GMBUS_PIN_4_CNP, /* sic */ + [DDC_BUS_DDI_F] = GMBUS_PIN_3_BXT, /* sic */ +}; + +static u8 map_ddc_pin(struct drm_i915_private *dev_priv, u8 vbt_pin) +{ + if (HAS_PCH_CNP(dev_priv) && + vbt_pin > 0 && vbt_pin < ARRAY_SIZE(cnp_ddc_pin_map)) + return cnp_ddc_pin_map[vbt_pin]; + + return vbt_pin; +} + static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, u8 bdb_version) { @@ -1131,6 +1182,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, is_hdmi = false; } + if (port == PORT_A && is_dvi) { + DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n", + is_hdmi ? "/HDMI" : ""); + is_dvi = false; + is_hdmi = false; + } + info->supports_dvi = is_dvi; info->supports_hdmi = is_hdmi; info->supports_dp = is_dp; @@ -1156,16 +1214,7 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, DRM_DEBUG_KMS("Port %c is internal DP\n", port_name(port)); if (is_dvi) { - info->alternate_ddc_pin = ddc_pin; - - /* - * All VBTs that we got so far for B Stepping has this - * information wrong for Port D. So, let's just ignore for now. - */ - if (IS_CNL_REVID(dev_priv, CNL_REVID_B0, CNL_REVID_B0) && - port == PORT_D) { - info->alternate_ddc_pin = 0; - } + info->alternate_ddc_pin = map_ddc_pin(dev_priv, ddc_pin); sanitize_ddc_pin(dev_priv, port); } diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index 29c62d481cef..48e1ba01ccf8 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -74,9 +74,10 @@ static noinline void missed_breadcrumb(struct intel_engine_cs *engine) set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings); } -static void intel_breadcrumbs_hangcheck(unsigned long data) +static void intel_breadcrumbs_hangcheck(struct timer_list *t) { - struct intel_engine_cs *engine = (struct intel_engine_cs *)data; + struct intel_engine_cs *engine = from_timer(engine, t, + breadcrumbs.hangcheck); struct intel_breadcrumbs *b = &engine->breadcrumbs; if (!b->irq_armed) @@ -108,9 +109,10 @@ static void intel_breadcrumbs_hangcheck(unsigned long data) } } -static void intel_breadcrumbs_fake_irq(unsigned long data) +static void intel_breadcrumbs_fake_irq(struct timer_list *t) { - struct intel_engine_cs *engine = (struct intel_engine_cs *)data; + struct intel_engine_cs *engine = from_timer(engine, t, + breadcrumbs.fake_irq); struct intel_breadcrumbs *b = &engine->breadcrumbs; /* The timer persists in case we cannot enable interrupts, @@ -787,12 +789,8 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) spin_lock_init(&b->rb_lock); spin_lock_init(&b->irq_lock); - setup_timer(&b->fake_irq, - intel_breadcrumbs_fake_irq, - (unsigned long)engine); - setup_timer(&b->hangcheck, - intel_breadcrumbs_hangcheck, - (unsigned long)engine); + timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0); + timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0); /* Spawn a thread to provide a common bottom-half for all signals. 
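The timer conversions above (i915_sw_fence and intel_breadcrumbs) follow the then-new kernel timer API: the callback receives the struct timer_list pointer itself and recovers its container with from_timer(), which is just container_of() keyed on the member name, eliminating the (unsigned long) data cast. A sketch of what the macro buys for the breadcrumbs case, assuming the standard kernel definitions:

/* from_timer(var, t, field) ~= container_of(t, typeof(*var), field) */
struct intel_engine_cs *engine =
	from_timer(engine, t, breadcrumbs.fake_irq);

/* ...is equivalent to writing... */
struct intel_engine_cs *engine =
	container_of(t, struct intel_engine_cs, breadcrumbs.fake_irq);

The timer no longer carries opaque data, so timer_setup() takes only flags; any state the callback needs must be reachable from the structure embedding the timer.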
* As this is an asynchronous interface we cannot steal the current diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 668e8c3e791d..437339f5d098 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -344,10 +344,25 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config, struct drm_connector_state *conn_state) { + return true; +} + +static bool pch_crt_compute_config(struct intel_encoder *encoder, + struct intel_crtc_state *pipe_config, + struct drm_connector_state *conn_state) +{ + pipe_config->has_pch_encoder = true; + + return true; +} + +static bool hsw_crt_compute_config(struct intel_encoder *encoder, + struct intel_crtc_state *pipe_config, + struct drm_connector_state *conn_state) +{ struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); - if (HAS_PCH_SPLIT(dev_priv)) - pipe_config->has_pch_encoder = true; + pipe_config->has_pch_encoder = true; /* LPT FDI RX only supports 8bpc. */ if (HAS_PCH_LPT(dev_priv)) { @@ -360,8 +375,7 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, } /* FDI must always be 2.7 GHz */ - if (HAS_DDI(dev_priv)) - pipe_config->port_clock = 135000 * 2; + pipe_config->port_clock = 135000 * 2; return true; } @@ -959,11 +973,11 @@ void intel_crt_init(struct drm_i915_private *dev_priv) !dmi_check_system(intel_spurious_crt_detect)) crt->base.hpd_pin = HPD_CRT; - crt->base.compute_config = intel_crt_compute_config; if (HAS_DDI(dev_priv)) { crt->base.port = PORT_E; crt->base.get_config = hsw_crt_get_config; crt->base.get_hw_state = intel_ddi_get_hw_state; + crt->base.compute_config = hsw_crt_compute_config; crt->base.pre_pll_enable = hsw_pre_pll_enable_crt; crt->base.pre_enable = hsw_pre_enable_crt; crt->base.enable = hsw_enable_crt; @@ -971,9 +985,11 @@ void intel_crt_init(struct drm_i915_private *dev_priv) crt->base.post_disable = hsw_post_disable_crt; } else { if (HAS_PCH_SPLIT(dev_priv)) { + crt->base.compute_config = pch_crt_compute_config; crt->base.disable = pch_disable_crt; crt->base.post_disable = pch_post_disable_crt; } else { + crt->base.compute_config = intel_crt_compute_config; crt->base.disable = intel_disable_crt; } crt->base.port = PORT_NONE; diff --git a/drivers/gpu/drm/i915/intel_csr.c b/drivers/gpu/drm/i915/intel_csr.c index ea5d5c9645a4..da9de47562b8 100644 --- a/drivers/gpu/drm/i915/intel_csr.c +++ b/drivers/gpu/drm/i915/intel_csr.c @@ -52,10 +52,6 @@ MODULE_FIRMWARE(I915_CSR_SKL); MODULE_FIRMWARE(I915_CSR_BXT); #define BXT_CSR_VERSION_REQUIRED CSR_VERSION(1, 7) -#define FIRMWARE_URL "https://01.org/linuxgraphics/downloads/firmware" - - - #define CSR_MAX_FW_SIZE 0x2FFF #define CSR_DEFAULT_FW_OFFSET 0xFFFFFFFF @@ -291,7 +287,8 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, css_header = (struct intel_css_header *)fw->data; if (sizeof(struct intel_css_header) != (css_header->header_len * 4)) { - DRM_ERROR("Firmware has wrong CSS header length %u bytes\n", + DRM_ERROR("DMC firmware has wrong CSS header length " + "(%u bytes)\n", (css_header->header_len * 4)); return NULL; } @@ -315,7 +312,7 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, if (csr->version != required_version) { DRM_INFO("Refusing to load DMC firmware v%u.%u," - " please use v%u.%u [" FIRMWARE_URL "].\n", + " please use v%u.%u\n", CSR_VERSION_MAJOR(csr->version), CSR_VERSION_MINOR(csr->version), CSR_VERSION_MAJOR(required_version), @@ -330,7 +327,8 @@ static uint32_t *parse_csr_fw(struct 
drm_i915_private *dev_priv, &fw->data[readcount]; if (sizeof(struct intel_package_header) != (package_header->header_len * 4)) { - DRM_ERROR("Firmware has wrong package header length %u bytes\n", + DRM_ERROR("DMC firmware has wrong package header length " + "(%u bytes)\n", (package_header->header_len * 4)); return NULL; } @@ -351,7 +349,7 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, dmc_offset = package_header->fw_info[i].offset; } if (dmc_offset == CSR_DEFAULT_FW_OFFSET) { - DRM_ERROR("Firmware not supported for %c stepping\n", + DRM_ERROR("DMC firmware not supported for %c stepping\n", si->stepping); return NULL; } @@ -360,7 +358,8 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, /* Extract dmc_header information. */ dmc_header = (struct intel_dmc_header *)&fw->data[readcount]; if (sizeof(struct intel_dmc_header) != (dmc_header->header_len)) { - DRM_ERROR("Firmware has wrong dmc header length %u bytes\n", + DRM_ERROR("DMC firmware has wrong dmc header length " + "(%u bytes)\n", (dmc_header->header_len)); return NULL; } @@ -368,7 +367,7 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, /* Cache the dmc header info. */ if (dmc_header->mmio_count > ARRAY_SIZE(csr->mmioaddr)) { - DRM_ERROR("Firmware has wrong mmio count %u\n", + DRM_ERROR("DMC firmware has wrong mmio count %u\n", dmc_header->mmio_count); return NULL; } @@ -376,7 +375,7 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, for (i = 0; i < dmc_header->mmio_count; i++) { if (dmc_header->mmioaddr[i] < CSR_MMIO_START_RANGE || dmc_header->mmioaddr[i] > CSR_MMIO_END_RANGE) { - DRM_ERROR(" Firmware has wrong mmio address 0x%x\n", + DRM_ERROR("DMC firmware has wrong mmio address 0x%x\n", dmc_header->mmioaddr[i]); return NULL; } @@ -387,7 +386,7 @@ static uint32_t *parse_csr_fw(struct drm_i915_private *dev_priv, /* fw_size is in dwords, so multiplied by 4 to convert into bytes. */ nbytes = dmc_header->fw_size * 4; if (nbytes > CSR_MAX_FW_SIZE) { - DRM_ERROR("CSR firmware too big (%u) bytes\n", nbytes); + DRM_ERROR("DMC firmware too big (%u bytes)\n", nbytes); return NULL; } csr->dmc_fw_size = dmc_header->fw_size; @@ -425,9 +424,11 @@ static void csr_load_work_fn(struct work_struct *work) CSR_VERSION_MINOR(csr->version)); } else { dev_notice(dev_priv->drm.dev, - "Failed to load DMC firmware" - " [" FIRMWARE_URL "]," - " disabling runtime power management.\n"); + "Failed to load DMC firmware %s." 
+ " Disabling runtime power management.\n", + csr->fw_path); + dev_notice(dev_priv->drm.dev, "DMC firmware homepage: %s", + INTEL_UC_FIRMWARE_URL); } release_firmware(fw); diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index b307b6fe1ce3..933c18fd4258 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -305,35 +305,34 @@ struct bxt_ddi_buf_trans { u8 scale; /* scale value */ u8 enable; /* scale enable */ u8 deemphasis; - bool default_index; /* true if the entry represents default value */ }; static const struct bxt_ddi_buf_trans bxt_ddi_translations_dp[] = { /* Idx NT mV diff db */ - { 52, 0x9A, 0, 128, true }, /* 0: 400 0 */ - { 78, 0x9A, 0, 85, false }, /* 1: 400 3.5 */ - { 104, 0x9A, 0, 64, false }, /* 2: 400 6 */ - { 154, 0x9A, 0, 43, false }, /* 3: 400 9.5 */ - { 77, 0x9A, 0, 128, false }, /* 4: 600 0 */ - { 116, 0x9A, 0, 85, false }, /* 5: 600 3.5 */ - { 154, 0x9A, 0, 64, false }, /* 6: 600 6 */ - { 102, 0x9A, 0, 128, false }, /* 7: 800 0 */ - { 154, 0x9A, 0, 85, false }, /* 8: 800 3.5 */ - { 154, 0x9A, 1, 128, false }, /* 9: 1200 0 */ + { 52, 0x9A, 0, 128, }, /* 0: 400 0 */ + { 78, 0x9A, 0, 85, }, /* 1: 400 3.5 */ + { 104, 0x9A, 0, 64, }, /* 2: 400 6 */ + { 154, 0x9A, 0, 43, }, /* 3: 400 9.5 */ + { 77, 0x9A, 0, 128, }, /* 4: 600 0 */ + { 116, 0x9A, 0, 85, }, /* 5: 600 3.5 */ + { 154, 0x9A, 0, 64, }, /* 6: 600 6 */ + { 102, 0x9A, 0, 128, }, /* 7: 800 0 */ + { 154, 0x9A, 0, 85, }, /* 8: 800 3.5 */ + { 154, 0x9A, 1, 128, }, /* 9: 1200 0 */ }; static const struct bxt_ddi_buf_trans bxt_ddi_translations_edp[] = { /* Idx NT mV diff db */ - { 26, 0, 0, 128, false }, /* 0: 200 0 */ - { 38, 0, 0, 112, false }, /* 1: 200 1.5 */ - { 48, 0, 0, 96, false }, /* 2: 200 4 */ - { 54, 0, 0, 69, false }, /* 3: 200 6 */ - { 32, 0, 0, 128, false }, /* 4: 250 0 */ - { 48, 0, 0, 104, false }, /* 5: 250 1.5 */ - { 54, 0, 0, 85, false }, /* 6: 250 4 */ - { 43, 0, 0, 128, false }, /* 7: 300 0 */ - { 54, 0, 0, 101, false }, /* 8: 300 1.5 */ - { 48, 0, 0, 128, false }, /* 9: 300 0 */ + { 26, 0, 0, 128, }, /* 0: 200 0 */ + { 38, 0, 0, 112, }, /* 1: 200 1.5 */ + { 48, 0, 0, 96, }, /* 2: 200 4 */ + { 54, 0, 0, 69, }, /* 3: 200 6 */ + { 32, 0, 0, 128, }, /* 4: 250 0 */ + { 48, 0, 0, 104, }, /* 5: 250 1.5 */ + { 54, 0, 0, 85, }, /* 6: 250 4 */ + { 43, 0, 0, 128, }, /* 7: 300 0 */ + { 54, 0, 0, 101, }, /* 8: 300 1.5 */ + { 48, 0, 0, 128, }, /* 9: 300 0 */ }; /* BSpec has 2 recommended values - entries 0 and 8. 
@@ -341,16 +340,16 @@ static const struct bxt_ddi_buf_trans bxt_ddi_translations_edp[] = { */ static const struct bxt_ddi_buf_trans bxt_ddi_translations_hdmi[] = { /* Idx NT mV diff db */ - { 52, 0x9A, 0, 128, false }, /* 0: 400 0 */ - { 52, 0x9A, 0, 85, false }, /* 1: 400 3.5 */ - { 52, 0x9A, 0, 64, false }, /* 2: 400 6 */ - { 42, 0x9A, 0, 43, false }, /* 3: 400 9.5 */ - { 77, 0x9A, 0, 128, false }, /* 4: 600 0 */ - { 77, 0x9A, 0, 85, false }, /* 5: 600 3.5 */ - { 77, 0x9A, 0, 64, false }, /* 6: 600 6 */ - { 102, 0x9A, 0, 128, false }, /* 7: 800 0 */ - { 102, 0x9A, 0, 85, false }, /* 8: 800 3.5 */ - { 154, 0x9A, 1, 128, true }, /* 9: 1200 0 */ + { 52, 0x9A, 0, 128, }, /* 0: 400 0 */ + { 52, 0x9A, 0, 85, }, /* 1: 400 3.5 */ + { 52, 0x9A, 0, 64, }, /* 2: 400 6 */ + { 42, 0x9A, 0, 43, }, /* 3: 400 9.5 */ + { 77, 0x9A, 0, 128, }, /* 4: 600 0 */ + { 77, 0x9A, 0, 85, }, /* 5: 600 3.5 */ + { 77, 0x9A, 0, 64, }, /* 6: 600 6 */ + { 102, 0x9A, 0, 128, }, /* 7: 800 0 */ + { 102, 0x9A, 0, 85, }, /* 8: 800 3.5 */ + { 154, 0x9A, 1, 128, }, /* 9: 1200 0 */ }; struct cnl_ddi_buf_trans { @@ -588,6 +587,120 @@ skl_get_buf_trans_hdmi(struct drm_i915_private *dev_priv, int *n_entries) } } +static int skl_buf_trans_num_entries(enum port port, int n_entries) +{ + /* Only DDIA and DDIE can select the 10th register with DP */ + if (port == PORT_A || port == PORT_E) + return min(n_entries, 10); + else + return min(n_entries, 9); +} + +static const struct ddi_buf_trans * +intel_ddi_get_buf_trans_dp(struct drm_i915_private *dev_priv, + enum port port, int *n_entries) +{ + if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { + const struct ddi_buf_trans *ddi_translations = + kbl_get_buf_trans_dp(dev_priv, n_entries); + *n_entries = skl_buf_trans_num_entries(port, *n_entries); + return ddi_translations; + } else if (IS_SKYLAKE(dev_priv)) { + const struct ddi_buf_trans *ddi_translations = + skl_get_buf_trans_dp(dev_priv, n_entries); + *n_entries = skl_buf_trans_num_entries(port, *n_entries); + return ddi_translations; + } else if (IS_BROADWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(bdw_ddi_translations_dp); + return bdw_ddi_translations_dp; + } else if (IS_HASWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(hsw_ddi_translations_dp); + return hsw_ddi_translations_dp; + } + + *n_entries = 0; + return NULL; +} + +static const struct ddi_buf_trans * +intel_ddi_get_buf_trans_edp(struct drm_i915_private *dev_priv, + enum port port, int *n_entries) +{ + if (IS_GEN9_BC(dev_priv)) { + const struct ddi_buf_trans *ddi_translations = + skl_get_buf_trans_edp(dev_priv, n_entries); + *n_entries = skl_buf_trans_num_entries(port, *n_entries); + return ddi_translations; + } else if (IS_BROADWELL(dev_priv)) { + return bdw_get_buf_trans_edp(dev_priv, n_entries); + } else if (IS_HASWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(hsw_ddi_translations_dp); + return hsw_ddi_translations_dp; + } + + *n_entries = 0; + return NULL; +} + +static const struct ddi_buf_trans * +intel_ddi_get_buf_trans_fdi(struct drm_i915_private *dev_priv, + int *n_entries) +{ + if (IS_BROADWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(bdw_ddi_translations_fdi); + return bdw_ddi_translations_fdi; + } else if (IS_HASWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(hsw_ddi_translations_fdi); + return hsw_ddi_translations_fdi; + } + + *n_entries = 0; + return NULL; +} + +static const struct ddi_buf_trans * +intel_ddi_get_buf_trans_hdmi(struct drm_i915_private *dev_priv, + int *n_entries) +{ + if (IS_GEN9_BC(dev_priv)) { + return skl_get_buf_trans_hdmi(dev_priv, n_entries); + } 
else if (IS_BROADWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi); + return bdw_ddi_translations_hdmi; + } else if (IS_HASWELL(dev_priv)) { + *n_entries = ARRAY_SIZE(hsw_ddi_translations_hdmi); + return hsw_ddi_translations_hdmi; + } + + *n_entries = 0; + return NULL; +} + +static const struct bxt_ddi_buf_trans * +bxt_get_buf_trans_dp(struct drm_i915_private *dev_priv, int *n_entries) +{ + *n_entries = ARRAY_SIZE(bxt_ddi_translations_dp); + return bxt_ddi_translations_dp; +} + +static const struct bxt_ddi_buf_trans * +bxt_get_buf_trans_edp(struct drm_i915_private *dev_priv, int *n_entries) +{ + if (dev_priv->vbt.edp.low_vswing) { + *n_entries = ARRAY_SIZE(bxt_ddi_translations_edp); + return bxt_ddi_translations_edp; + } + + return bxt_get_buf_trans_dp(dev_priv, n_entries); +} + +static const struct bxt_ddi_buf_trans * +bxt_get_buf_trans_hdmi(struct drm_i915_private *dev_priv, int *n_entries) +{ + *n_entries = ARRAY_SIZE(bxt_ddi_translations_hdmi); + return bxt_ddi_translations_hdmi; +} + static const struct cnl_ddi_buf_trans * cnl_get_buf_trans_hdmi(struct drm_i915_private *dev_priv, int *n_entries) { @@ -657,92 +770,40 @@ cnl_get_buf_trans_edp(struct drm_i915_private *dev_priv, int *n_entries) static int intel_ddi_hdmi_level(struct drm_i915_private *dev_priv, enum port port) { - int n_hdmi_entries; - int hdmi_level; - int hdmi_default_entry; - - hdmi_level = dev_priv->vbt.ddi_port_info[port].hdmi_level_shift; + int n_entries, level, default_entry; - if (IS_GEN9_LP(dev_priv)) - return hdmi_level; + level = dev_priv->vbt.ddi_port_info[port].hdmi_level_shift; if (IS_CANNONLAKE(dev_priv)) { - cnl_get_buf_trans_hdmi(dev_priv, &n_hdmi_entries); - hdmi_default_entry = n_hdmi_entries - 1; + cnl_get_buf_trans_hdmi(dev_priv, &n_entries); + default_entry = n_entries - 1; + } else if (IS_GEN9_LP(dev_priv)) { + bxt_get_buf_trans_hdmi(dev_priv, &n_entries); + default_entry = n_entries - 1; } else if (IS_GEN9_BC(dev_priv)) { - skl_get_buf_trans_hdmi(dev_priv, &n_hdmi_entries); - hdmi_default_entry = 8; + intel_ddi_get_buf_trans_hdmi(dev_priv, &n_entries); + default_entry = 8; } else if (IS_BROADWELL(dev_priv)) { - n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi); - hdmi_default_entry = 7; + intel_ddi_get_buf_trans_hdmi(dev_priv, &n_entries); + default_entry = 7; } else if (IS_HASWELL(dev_priv)) { - n_hdmi_entries = ARRAY_SIZE(hsw_ddi_translations_hdmi); - hdmi_default_entry = 6; + intel_ddi_get_buf_trans_hdmi(dev_priv, &n_entries); + default_entry = 6; } else { WARN(1, "ddi translation table missing\n"); - n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi); - hdmi_default_entry = 7; + return 0; } /* Choose a good default if VBT is badly populated */ - if (hdmi_level == HDMI_LEVEL_SHIFT_UNKNOWN || - hdmi_level >= n_hdmi_entries) - hdmi_level = hdmi_default_entry; + if (level == HDMI_LEVEL_SHIFT_UNKNOWN || level >= n_entries) + level = default_entry; - return hdmi_level; -} - -static const struct ddi_buf_trans * -intel_ddi_get_buf_trans_dp(struct drm_i915_private *dev_priv, - int *n_entries) -{ - if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { - return kbl_get_buf_trans_dp(dev_priv, n_entries); - } else if (IS_SKYLAKE(dev_priv)) { - return skl_get_buf_trans_dp(dev_priv, n_entries); - } else if (IS_BROADWELL(dev_priv)) { - *n_entries = ARRAY_SIZE(bdw_ddi_translations_dp); - return bdw_ddi_translations_dp; - } else if (IS_HASWELL(dev_priv)) { - *n_entries = ARRAY_SIZE(hsw_ddi_translations_dp); - return hsw_ddi_translations_dp; - } - - *n_entries = 0; - return NULL; -} 
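
The helpers being introduced here all share one shape: a single lookup function returns the translation table and reports its length through *n_entries, and every consumer clamps its index (under WARN_ON_ONCE) instead of indexing blindly. This is also where the port-dependent cap now lives: skl_buf_trans_num_entries() encodes that only DDI A and E can select the 10th DP entry. Below is a minimal, self-contained C sketch of that shape, not i915 code; buf_trans, get_buf_trans and program_level are invented names for illustration.

#include <stdio.h>
#include <stddef.h>

struct buf_trans { int vswing_mv; int deemph; };

static const struct buf_trans table_dp[] = {
    { 400, 0 }, { 400, 35 }, { 600, 0 }, { 800, 0 },
};

/* Single source of truth: hand back the table and report its size. */
static const struct buf_trans *get_buf_trans(int *n_entries)
{
    *n_entries = (int)(sizeof(table_dp) / sizeof(table_dp[0]));
    return table_dp;
}

static void program_level(int level)
{
    int n_entries;
    const struct buf_trans *t = get_buf_trans(&n_entries);

    if (!t || n_entries == 0)
        return;                 /* nothing to program */
    if (level >= n_entries)
        level = n_entries - 1;  /* clamp, as the WARN_ON_ONCE sites do */

    printf("vswing=%dmV deemph=%d\n", t[level].vswing_mv, t[level].deemph);
}

int main(void)
{
    program_level(2);   /* in range */
    program_level(9);   /* out of range: clamped to the last entry */
    return 0;
}

Clamping at the consumer is also what lets the bxt tables above drop their default_index flags: an out-of-range level now falls back to the last entry in one place instead of per-table default hunting.
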
- -static const struct ddi_buf_trans * -intel_ddi_get_buf_trans_edp(struct drm_i915_private *dev_priv, - int *n_entries) -{ - if (IS_GEN9_BC(dev_priv)) { - return skl_get_buf_trans_edp(dev_priv, n_entries); - } else if (IS_BROADWELL(dev_priv)) { - return bdw_get_buf_trans_edp(dev_priv, n_entries); - } else if (IS_HASWELL(dev_priv)) { - *n_entries = ARRAY_SIZE(hsw_ddi_translations_dp); - return hsw_ddi_translations_dp; - } - - *n_entries = 0; - return NULL; -} - -static const struct ddi_buf_trans * -intel_ddi_get_buf_trans_fdi(struct drm_i915_private *dev_priv, - int *n_entries) -{ - if (IS_BROADWELL(dev_priv)) { - *n_entries = ARRAY_SIZE(hsw_ddi_translations_fdi); - return hsw_ddi_translations_fdi; - } else if (IS_HASWELL(dev_priv)) { - *n_entries = ARRAY_SIZE(hsw_ddi_translations_fdi); - return hsw_ddi_translations_fdi; - } + if (WARN_ON_ONCE(n_entries == 0)) + return 0; + if (WARN_ON_ONCE(level >= n_entries)) + level = n_entries - 1; - *n_entries = 0; - return NULL; + return level; } /* @@ -760,11 +821,11 @@ static void intel_prepare_dp_ddi_buffers(struct intel_encoder *encoder) switch (encoder->type) { case INTEL_OUTPUT_EDP: - ddi_translations = intel_ddi_get_buf_trans_edp(dev_priv, + ddi_translations = intel_ddi_get_buf_trans_edp(dev_priv, port, &n_entries); break; case INTEL_OUTPUT_DP: - ddi_translations = intel_ddi_get_buf_trans_dp(dev_priv, + ddi_translations = intel_ddi_get_buf_trans_dp(dev_priv, port, &n_entries); break; case INTEL_OUTPUT_ANALOG: @@ -776,16 +837,10 @@ static void intel_prepare_dp_ddi_buffers(struct intel_encoder *encoder) return; } - if (IS_GEN9_BC(dev_priv)) { - /* If we're boosting the current, set bit 31 of trans1 */ - if (dev_priv->vbt.ddi_port_info[port].dp_boost_level) - iboost_bit = DDI_BUF_BALANCE_LEG_ENABLE; - - if (WARN_ON(encoder->type == INTEL_OUTPUT_EDP && - port != PORT_A && port != PORT_E && - n_entries > 9)) - n_entries = 9; - } + /* If we're boosting the current, set bit 31 of trans1 */ + if (IS_GEN9_BC(dev_priv) && + dev_priv->vbt.ddi_port_info[port].dp_boost_level) + iboost_bit = DDI_BUF_BALANCE_LEG_ENABLE; for (i = 0; i < n_entries; i++) { I915_WRITE(DDI_BUF_TRANS_LO(port, i), @@ -800,39 +855,32 @@ static void intel_prepare_dp_ddi_buffers(struct intel_encoder *encoder) * values in advance. This function programs the correct values for * HDMI/DVI use cases. 
*/ -static void intel_prepare_hdmi_ddi_buffers(struct intel_encoder *encoder) +static void intel_prepare_hdmi_ddi_buffers(struct intel_encoder *encoder, + int level) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); u32 iboost_bit = 0; - int n_hdmi_entries, hdmi_level; + int n_entries; enum port port = intel_ddi_get_encoder_port(encoder); - const struct ddi_buf_trans *ddi_translations_hdmi; + const struct ddi_buf_trans *ddi_translations; - hdmi_level = intel_ddi_hdmi_level(dev_priv, port); + ddi_translations = intel_ddi_get_buf_trans_hdmi(dev_priv, &n_entries); - if (IS_GEN9_BC(dev_priv)) { - ddi_translations_hdmi = skl_get_buf_trans_hdmi(dev_priv, &n_hdmi_entries); + if (WARN_ON_ONCE(!ddi_translations)) + return; + if (WARN_ON_ONCE(level >= n_entries)) + level = n_entries - 1; - /* If we're boosting the current, set bit 31 of trans1 */ - if (dev_priv->vbt.ddi_port_info[port].hdmi_boost_level) - iboost_bit = DDI_BUF_BALANCE_LEG_ENABLE; - } else if (IS_BROADWELL(dev_priv)) { - ddi_translations_hdmi = bdw_ddi_translations_hdmi; - n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi); - } else if (IS_HASWELL(dev_priv)) { - ddi_translations_hdmi = hsw_ddi_translations_hdmi; - n_hdmi_entries = ARRAY_SIZE(hsw_ddi_translations_hdmi); - } else { - WARN(1, "ddi translation table missing\n"); - ddi_translations_hdmi = bdw_ddi_translations_hdmi; - n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi); - } + /* If we're boosting the current, set bit 31 of trans1 */ + if (IS_GEN9_BC(dev_priv) && + dev_priv->vbt.ddi_port_info[port].hdmi_boost_level) + iboost_bit = DDI_BUF_BALANCE_LEG_ENABLE; /* Entry 9 is for HDMI: */ I915_WRITE(DDI_BUF_TRANS_LO(port, 9), - ddi_translations_hdmi[hdmi_level].trans1 | iboost_bit); + ddi_translations[level].trans1 | iboost_bit); I915_WRITE(DDI_BUF_TRANS_HI(port, 9), - ddi_translations_hdmi[hdmi_level].trans2); + ddi_translations[level].trans2); } static void intel_wait_ddi_buf_idle(struct drm_i915_private *dev_priv, @@ -1108,14 +1156,14 @@ static int hsw_ddi_calc_wrpll_link(struct drm_i915_private *dev_priv, } static int skl_calc_wrpll_link(struct drm_i915_private *dev_priv, - uint32_t dpll) + enum intel_dpll_id pll_id) { i915_reg_t cfgcr1_reg, cfgcr2_reg; uint32_t cfgcr1_val, cfgcr2_val; uint32_t p0, p1, p2, dco_freq; - cfgcr1_reg = DPLL_CFGCR1(dpll); - cfgcr2_reg = DPLL_CFGCR2(dpll); + cfgcr1_reg = DPLL_CFGCR1(pll_id); + cfgcr2_reg = DPLL_CFGCR2(pll_id); cfgcr1_val = I915_READ(cfgcr1_reg); cfgcr2_val = I915_READ(cfgcr2_reg); @@ -1168,7 +1216,7 @@ static int skl_calc_wrpll_link(struct drm_i915_private *dev_priv, } static int cnl_calc_wrpll_link(struct drm_i915_private *dev_priv, - uint32_t pll_id) + enum intel_dpll_id pll_id) { uint32_t cfgcr0, cfgcr1; uint32_t p0, p1, p2, dco_freq, ref_clock; @@ -1255,7 +1303,8 @@ static void cnl_ddi_clock_get(struct intel_encoder *encoder, { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); int link_clock = 0; - uint32_t cfgcr0, pll_id; + uint32_t cfgcr0; + enum intel_dpll_id pll_id; pll_id = intel_get_shared_dpll_id(dev_priv, pipe_config->shared_dpll); @@ -1308,17 +1357,18 @@ static void skl_ddi_clock_get(struct intel_encoder *encoder, { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); int link_clock = 0; - uint32_t dpll_ctl1, dpll; + uint32_t dpll_ctl1; + enum intel_dpll_id pll_id; - dpll = intel_get_shared_dpll_id(dev_priv, pipe_config->shared_dpll); + pll_id = intel_get_shared_dpll_id(dev_priv, pipe_config->shared_dpll); dpll_ctl1 = I915_READ(DPLL_CTRL1); - if (dpll_ctl1 & 
DPLL_CTRL1_HDMI_MODE(dpll)) { - link_clock = skl_calc_wrpll_link(dev_priv, dpll); + if (dpll_ctl1 & DPLL_CTRL1_HDMI_MODE(pll_id)) { + link_clock = skl_calc_wrpll_link(dev_priv, pll_id); } else { - link_clock = dpll_ctl1 & DPLL_CTRL1_LINK_RATE_MASK(dpll); - link_clock >>= DPLL_CTRL1_LINK_RATE_SHIFT(dpll); + link_clock = dpll_ctl1 & DPLL_CTRL1_LINK_RATE_MASK(pll_id); + link_clock >>= DPLL_CTRL1_LINK_RATE_SHIFT(pll_id); switch (link_clock) { case DPLL_CTRL1_LINK_RATE_810: @@ -1399,17 +1449,17 @@ static void hsw_ddi_clock_get(struct intel_encoder *encoder, } static int bxt_calc_pll_link(struct drm_i915_private *dev_priv, - enum intel_dpll_id dpll) + enum intel_dpll_id pll_id) { struct intel_shared_dpll *pll; struct intel_dpll_hw_state *state; struct dpll clock; /* For DDI ports we always use a shared PLL. */ - if (WARN_ON(dpll == DPLL_ID_PRIVATE)) + if (WARN_ON(pll_id == DPLL_ID_PRIVATE)) return 0; - pll = &dev_priv->shared_dplls[dpll]; + pll = &dev_priv->shared_dplls[pll_id]; state = &pll->state.hw_state; clock.m1 = 2; @@ -1428,9 +1478,9 @@ static void bxt_ddi_clock_get(struct intel_encoder *encoder, { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); enum port port = intel_ddi_get_encoder_port(encoder); - uint32_t dpll = port; + enum intel_dpll_id pll_id = port; - pipe_config->port_clock = bxt_calc_pll_link(dev_priv, dpll); + pipe_config->port_clock = bxt_calc_pll_link(dev_priv, pll_id); ddi_dotclock_get(pipe_config); } @@ -1782,54 +1832,36 @@ static void _skl_ddi_set_iboost(struct drm_i915_private *dev_priv, I915_WRITE(DISPIO_CR_TX_BMU_CR0, tmp); } -static void skl_ddi_set_iboost(struct intel_encoder *encoder, u32 level) +static void skl_ddi_set_iboost(struct intel_encoder *encoder, + int level, enum intel_output_type type) { struct intel_digital_port *intel_dig_port = enc_to_dig_port(&encoder->base); struct drm_i915_private *dev_priv = to_i915(intel_dig_port->base.base.dev); enum port port = intel_dig_port->port; - int type = encoder->type; - const struct ddi_buf_trans *ddi_translations; uint8_t iboost; - uint8_t dp_iboost, hdmi_iboost; - int n_entries; - /* VBT may override standard boost values */ - dp_iboost = dev_priv->vbt.ddi_port_info[port].dp_boost_level; - hdmi_iboost = dev_priv->vbt.ddi_port_info[port].hdmi_boost_level; + if (type == INTEL_OUTPUT_HDMI) + iboost = dev_priv->vbt.ddi_port_info[port].hdmi_boost_level; + else + iboost = dev_priv->vbt.ddi_port_info[port].dp_boost_level; - if (type == INTEL_OUTPUT_DP) { - if (dp_iboost) { - iboost = dp_iboost; - } else { - if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) - ddi_translations = kbl_get_buf_trans_dp(dev_priv, - &n_entries); - else - ddi_translations = skl_get_buf_trans_dp(dev_priv, - &n_entries); - iboost = ddi_translations[level].i_boost; - } - } else if (type == INTEL_OUTPUT_EDP) { - if (dp_iboost) { - iboost = dp_iboost; - } else { - ddi_translations = skl_get_buf_trans_edp(dev_priv, &n_entries); + if (iboost == 0) { + const struct ddi_buf_trans *ddi_translations; + int n_entries; - if (WARN_ON(port != PORT_A && - port != PORT_E && n_entries > 9)) - n_entries = 9; + if (type == INTEL_OUTPUT_HDMI) + ddi_translations = intel_ddi_get_buf_trans_hdmi(dev_priv, &n_entries); + else if (type == INTEL_OUTPUT_EDP) + ddi_translations = intel_ddi_get_buf_trans_edp(dev_priv, port, &n_entries); + else + ddi_translations = intel_ddi_get_buf_trans_dp(dev_priv, port, &n_entries); - iboost = ddi_translations[level].i_boost; - } - } else if (type == INTEL_OUTPUT_HDMI) { - if (hdmi_iboost) { - iboost = hdmi_iboost; - } 
else { - ddi_translations = skl_get_buf_trans_hdmi(dev_priv, &n_entries); - iboost = ddi_translations[level].i_boost; - } - } else { - return; + if (WARN_ON_ONCE(!ddi_translations)) + return; + if (WARN_ON_ONCE(level >= n_entries)) + level = n_entries - 1; + + iboost = ddi_translations[level].i_boost; } /* Make sure that the requested I_boost is valid */ @@ -1844,38 +1876,25 @@ static void skl_ddi_set_iboost(struct intel_encoder *encoder, u32 level) _skl_ddi_set_iboost(dev_priv, PORT_E, iboost); } -static void bxt_ddi_vswing_sequence(struct drm_i915_private *dev_priv, - u32 level, enum port port, int type) +static void bxt_ddi_vswing_sequence(struct intel_encoder *encoder, + int level, enum intel_output_type type) { + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); const struct bxt_ddi_buf_trans *ddi_translations; - u32 n_entries, i; - - if (type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp.low_vswing) { - n_entries = ARRAY_SIZE(bxt_ddi_translations_edp); - ddi_translations = bxt_ddi_translations_edp; - } else if (type == INTEL_OUTPUT_DP - || type == INTEL_OUTPUT_EDP) { - n_entries = ARRAY_SIZE(bxt_ddi_translations_dp); - ddi_translations = bxt_ddi_translations_dp; - } else if (type == INTEL_OUTPUT_HDMI) { - n_entries = ARRAY_SIZE(bxt_ddi_translations_hdmi); - ddi_translations = bxt_ddi_translations_hdmi; - } else { - DRM_DEBUG_KMS("Vswing programming not done for encoder %d\n", - type); - return; - } + enum port port = encoder->port; + int n_entries; - /* Check if default value has to be used */ - if (level >= n_entries || - (type == INTEL_OUTPUT_HDMI && level == HDMI_LEVEL_SHIFT_UNKNOWN)) { - for (i = 0; i < n_entries; i++) { - if (ddi_translations[i].default_index) { - level = i; - break; - } - } - } + if (type == INTEL_OUTPUT_HDMI) + ddi_translations = bxt_get_buf_trans_hdmi(dev_priv, &n_entries); + else if (type == INTEL_OUTPUT_EDP) + ddi_translations = bxt_get_buf_trans_edp(dev_priv, &n_entries); + else + ddi_translations = bxt_get_buf_trans_dp(dev_priv, &n_entries); + + if (WARN_ON_ONCE(!ddi_translations)) + return; + if (WARN_ON_ONCE(level >= n_entries)) + level = n_entries - 1; bxt_ddi_phy_set_signal_level(dev_priv, port, ddi_translations[level].margin, @@ -1887,6 +1906,7 @@ static void bxt_ddi_vswing_sequence(struct drm_i915_private *dev_priv, u8 intel_ddi_dp_voltage_max(struct intel_encoder *encoder) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + enum port port = encoder->port; int n_entries; if (IS_CANNONLAKE(dev_priv)) { @@ -1894,11 +1914,16 @@ u8 intel_ddi_dp_voltage_max(struct intel_encoder *encoder) cnl_get_buf_trans_edp(dev_priv, &n_entries); else cnl_get_buf_trans_dp(dev_priv, &n_entries); + } else if (IS_GEN9_LP(dev_priv)) { + if (encoder->type == INTEL_OUTPUT_EDP) + bxt_get_buf_trans_edp(dev_priv, &n_entries); + else + bxt_get_buf_trans_dp(dev_priv, &n_entries); } else { if (encoder->type == INTEL_OUTPUT_EDP) - intel_ddi_get_buf_trans_edp(dev_priv, &n_entries); + intel_ddi_get_buf_trans_edp(dev_priv, port, &n_entries); else - intel_ddi_get_buf_trans_dp(dev_priv, &n_entries); + intel_ddi_get_buf_trans_dp(dev_priv, port, &n_entries); } if (WARN_ON(n_entries < 1)) @@ -1910,28 +1935,26 @@ u8 intel_ddi_dp_voltage_max(struct intel_encoder *encoder) DP_TRAIN_VOLTAGE_SWING_MASK; } -static void cnl_ddi_vswing_program(struct drm_i915_private *dev_priv, - u32 level, enum port port, int type) +static void cnl_ddi_vswing_program(struct intel_encoder *encoder, + int level, enum intel_output_type type) { - const struct cnl_ddi_buf_trans 
*ddi_translations = NULL; - u32 n_entries, val; - int ln; + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + enum port port = intel_ddi_get_encoder_port(encoder); + const struct cnl_ddi_buf_trans *ddi_translations; + int n_entries, ln; + u32 val; - if (type == INTEL_OUTPUT_HDMI) { + if (type == INTEL_OUTPUT_HDMI) ddi_translations = cnl_get_buf_trans_hdmi(dev_priv, &n_entries); - } else if (type == INTEL_OUTPUT_DP) { - ddi_translations = cnl_get_buf_trans_dp(dev_priv, &n_entries); - } else if (type == INTEL_OUTPUT_EDP) { + else if (type == INTEL_OUTPUT_EDP) ddi_translations = cnl_get_buf_trans_edp(dev_priv, &n_entries); - } + else + ddi_translations = cnl_get_buf_trans_dp(dev_priv, &n_entries); - if (WARN_ON(ddi_translations == NULL)) + if (WARN_ON_ONCE(!ddi_translations)) return; - - if (level >= n_entries) { - DRM_DEBUG_KMS("DDI translation not found for level %d. Using %d instead.", level, n_entries - 1); + if (WARN_ON_ONCE(level >= n_entries)) level = n_entries - 1; - } /* Set PORT_TX_DW5 Scaling Mode Sel to 010b. */ val = I915_READ(CNL_PORT_TX_DW5_LN0(port)); @@ -1976,26 +1999,22 @@ static void cnl_ddi_vswing_program(struct drm_i915_private *dev_priv, I915_WRITE(CNL_PORT_TX_DW7_GRP(port), val); } -static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level) +static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, + int level, enum intel_output_type type) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); - struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); enum port port = intel_ddi_get_encoder_port(encoder); - int type = encoder->type; - int width = 0; - int rate = 0; + int width, rate, ln; u32 val; - int ln = 0; - if ((intel_dp) && (type == INTEL_OUTPUT_EDP || type == INTEL_OUTPUT_DP)) { - width = intel_dp->lane_count; - rate = intel_dp->link_rate; - } else if (type == INTEL_OUTPUT_HDMI) { + if (type == INTEL_OUTPUT_HDMI) { width = 4; - /* Rate is always < than 6GHz for HDMI */ + rate = 0; /* Rate is always < than 6GHz for HDMI */ } else { - MISSING_CASE(type); - return; + struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); + + width = intel_dp->lane_count; + rate = intel_dp->link_rate; } /* @@ -2004,7 +2023,7 @@ static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level) * else clear to 0b. */ val = I915_READ(CNL_PORT_PCS_DW1_LN0(port)); - if (type == INTEL_OUTPUT_EDP || type == INTEL_OUTPUT_DP) + if (type != INTEL_OUTPUT_HDMI) val |= COMMON_KEEPER_EN; else val &= ~COMMON_KEEPER_EN; @@ -2039,7 +2058,7 @@ static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level) I915_WRITE(CNL_PORT_TX_DW5_GRP(port), val); /* 5. Program swing and de-emphasis */ - cnl_ddi_vswing_program(dev_priv, level, port, type); + cnl_ddi_vswing_program(encoder, level, type); /* 6. 
Set training enable to trigger update */ val = I915_READ(CNL_PORT_TX_DW5_LN0(port)); @@ -2076,13 +2095,12 @@ u32 bxt_signal_levels(struct intel_dp *intel_dp) struct intel_digital_port *dport = dp_to_dig_port(intel_dp); struct drm_i915_private *dev_priv = to_i915(dport->base.base.dev); struct intel_encoder *encoder = &dport->base; - enum port port = dport->port; - u32 level = intel_ddi_dp_level(intel_dp); + int level = intel_ddi_dp_level(intel_dp); if (IS_CANNONLAKE(dev_priv)) - cnl_ddi_vswing_sequence(encoder, level); + cnl_ddi_vswing_sequence(encoder, level, encoder->type); else - bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type); + bxt_ddi_vswing_sequence(encoder, level, encoder->type); return 0; } @@ -2092,10 +2110,10 @@ uint32_t ddi_signal_levels(struct intel_dp *intel_dp) struct intel_digital_port *dport = dp_to_dig_port(intel_dp); struct drm_i915_private *dev_priv = to_i915(dport->base.base.dev); struct intel_encoder *encoder = &dport->base; - uint32_t level = intel_ddi_dp_level(intel_dp); + int level = intel_ddi_dp_level(intel_dp); if (IS_GEN9_BC(dev_priv)) - skl_ddi_set_iboost(encoder, level); + skl_ddi_set_iboost(encoder, level, encoder->type); return DDI_BUF_TRANS_SELECT(level); } @@ -2122,8 +2140,7 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder, * register writes. */ val = I915_READ(DPCLKA_CFGCR0); - val &= ~(DPCLKA_CFGCR0_DDI_CLK_OFF(port) | - DPCLKA_CFGCR0_DDI_CLK_SEL_MASK(port)); + val &= ~DPCLKA_CFGCR0_DDI_CLK_OFF(port); I915_WRITE(DPCLKA_CFGCR0, val); } else if (IS_GEN9_BC(dev_priv)) { /* DDI -> PLL mapping */ @@ -2141,37 +2158,52 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder, } } +static void intel_ddi_clk_disable(struct intel_encoder *encoder) +{ + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + enum port port = intel_ddi_get_encoder_port(encoder); + + if (IS_CANNONLAKE(dev_priv)) + I915_WRITE(DPCLKA_CFGCR0, I915_READ(DPCLKA_CFGCR0) | + DPCLKA_CFGCR0_DDI_CLK_OFF(port)); + else if (IS_GEN9_BC(dev_priv)) + I915_WRITE(DPLL_CTRL2, I915_READ(DPLL_CTRL2) | + DPLL_CTRL2_DDI_CLK_OFF(port)); + else if (INTEL_GEN(dev_priv) < 9) + I915_WRITE(PORT_CLK_SEL(port), PORT_CLK_SEL_NONE); +} + static void intel_ddi_pre_enable_dp(struct intel_encoder *encoder, - int link_rate, uint32_t lane_count, - struct intel_shared_dpll *pll, - bool link_mst) + const struct intel_crtc_state *crtc_state, + const struct drm_connector_state *conn_state) { struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); enum port port = intel_ddi_get_encoder_port(encoder); struct intel_digital_port *dig_port = enc_to_dig_port(&encoder->base); - uint32_t level = intel_ddi_dp_level(intel_dp); + bool is_mst = intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DP_MST); + int level = intel_ddi_dp_level(intel_dp); + + WARN_ON(is_mst && (port == PORT_A || port == PORT_E)); - WARN_ON(link_mst && (port == PORT_A || port == PORT_E)); + intel_dp_set_link_params(intel_dp, crtc_state->port_clock, + crtc_state->lane_count, is_mst); - intel_dp_set_link_params(intel_dp, link_rate, lane_count, - link_mst); - if (encoder->type == INTEL_OUTPUT_EDP) - intel_edp_panel_on(intel_dp); + intel_edp_panel_on(intel_dp); - intel_ddi_clk_select(encoder, pll); + intel_ddi_clk_select(encoder, crtc_state->shared_dpll); intel_display_power_get(dev_priv, dig_port->ddi_io_power_domain); if (IS_CANNONLAKE(dev_priv)) - cnl_ddi_vswing_sequence(encoder, level); + cnl_ddi_vswing_sequence(encoder, level, encoder->type); else if 
(IS_GEN9_LP(dev_priv)) - bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type); + bxt_ddi_vswing_sequence(encoder, level, encoder->type); else intel_prepare_dp_ddi_buffers(encoder); intel_ddi_init_dp_buf_reg(encoder); - if (!link_mst) + if (!is_mst) intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON); intel_dp_start_link_train(intel_dp); if (port != PORT_A || INTEL_GEN(dev_priv) >= 9) @@ -2179,10 +2211,8 @@ static void intel_ddi_pre_enable_dp(struct intel_encoder *encoder, } static void intel_ddi_pre_enable_hdmi(struct intel_encoder *encoder, - bool has_infoframe, const struct intel_crtc_state *crtc_state, - const struct drm_connector_state *conn_state, - const struct intel_shared_dpll *pll) + const struct drm_connector_state *conn_state) { struct intel_digital_port *intel_dig_port = enc_to_dig_port(&encoder->base); struct intel_hdmi *intel_hdmi = &intel_dig_port->hdmi; @@ -2192,84 +2222,49 @@ static void intel_ddi_pre_enable_hdmi(struct intel_encoder *encoder, struct intel_digital_port *dig_port = enc_to_dig_port(&encoder->base); intel_dp_dual_mode_set_tmds_output(intel_hdmi, true); - intel_ddi_clk_select(encoder, pll); + intel_ddi_clk_select(encoder, crtc_state->shared_dpll); intel_display_power_get(dev_priv, dig_port->ddi_io_power_domain); if (IS_CANNONLAKE(dev_priv)) - cnl_ddi_vswing_sequence(encoder, level); + cnl_ddi_vswing_sequence(encoder, level, INTEL_OUTPUT_HDMI); else if (IS_GEN9_LP(dev_priv)) - bxt_ddi_vswing_sequence(dev_priv, level, port, - INTEL_OUTPUT_HDMI); + bxt_ddi_vswing_sequence(encoder, level, INTEL_OUTPUT_HDMI); else - intel_prepare_hdmi_ddi_buffers(encoder); + intel_prepare_hdmi_ddi_buffers(encoder, level); if (IS_GEN9_BC(dev_priv)) - skl_ddi_set_iboost(encoder, level); + skl_ddi_set_iboost(encoder, level, INTEL_OUTPUT_HDMI); intel_dig_port->set_infoframes(&encoder->base, - has_infoframe, + crtc_state->has_infoframe, crtc_state, conn_state); } static void intel_ddi_pre_enable(struct intel_encoder *encoder, - const struct intel_crtc_state *pipe_config, + const struct intel_crtc_state *crtc_state, const struct drm_connector_state *conn_state) { - struct drm_crtc *crtc = pipe_config->base.crtc; - struct drm_i915_private *dev_priv = to_i915(crtc->dev); - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - int pipe = intel_crtc->pipe; - int type = encoder->type; + struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc); + struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); + enum pipe pipe = crtc->pipe; - WARN_ON(intel_crtc->config->has_pch_encoder); + WARN_ON(crtc_state->has_pch_encoder); intel_set_cpu_fifo_underrun_reporting(dev_priv, pipe, true); - if (type == INTEL_OUTPUT_DP || type == INTEL_OUTPUT_EDP) { - intel_ddi_pre_enable_dp(encoder, - pipe_config->port_clock, - pipe_config->lane_count, - pipe_config->shared_dpll, - intel_crtc_has_type(pipe_config, - INTEL_OUTPUT_DP_MST)); - } - if (type == INTEL_OUTPUT_HDMI) { - intel_ddi_pre_enable_hdmi(encoder, - pipe_config->has_infoframe, - pipe_config, conn_state, - pipe_config->shared_dpll); - } + if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) + intel_ddi_pre_enable_hdmi(encoder, crtc_state, conn_state); + else + intel_ddi_pre_enable_dp(encoder, crtc_state, conn_state); } -static void intel_ddi_post_disable(struct intel_encoder *intel_encoder, - const struct intel_crtc_state *old_crtc_state, - const struct drm_connector_state *old_conn_state) +static void intel_disable_ddi_buf(struct intel_encoder *encoder) { - struct drm_encoder *encoder = &intel_encoder->base; - struct drm_i915_private 
*dev_priv = to_i915(encoder->dev); - enum port port = intel_ddi_get_encoder_port(intel_encoder); - struct intel_digital_port *dig_port = enc_to_dig_port(encoder); - int type = intel_encoder->type; - uint32_t val; + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + enum port port = intel_ddi_get_encoder_port(encoder); bool wait = false; - - if (type == INTEL_OUTPUT_DP || type == INTEL_OUTPUT_EDP) { - /* - * old_crtc_state and old_conn_state are NULL when called from - * DP_MST. The main connector associated with this port is never - * bound to a crtc for MST. - */ - bool is_mst = !old_crtc_state; - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); - - /* - * Power down sink before disabling the port, otherwise we end - * up getting interrupts from the sink on detecting link loss. - */ - if (!is_mst) - intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_OFF); - } + u32 val; val = I915_READ(DDI_BUF_CTL(port)); if (val & DDI_BUF_CTL_ENABLE) { @@ -2285,36 +2280,75 @@ static void intel_ddi_post_disable(struct intel_encoder *intel_encoder, if (wait) intel_wait_ddi_buf_idle(dev_priv, port); +} - if (type == INTEL_OUTPUT_HDMI) { - dig_port->set_infoframes(encoder, false, - old_crtc_state, old_conn_state); - } +static void intel_ddi_post_disable_dp(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct intel_digital_port *dig_port = enc_to_dig_port(&encoder->base); + struct intel_dp *intel_dp = &dig_port->dp; + /* + * old_crtc_state and old_conn_state are NULL when called from + * DP_MST. The main connector associated with this port is never + * bound to a crtc for MST. + */ + bool is_mst = !old_crtc_state; - if (type == INTEL_OUTPUT_DP || type == INTEL_OUTPUT_EDP) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + /* + * Power down sink before disabling the port, otherwise we end + * up getting interrupts from the sink on detecting link loss. 
+ */ + if (!is_mst) + intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_OFF); - intel_edp_panel_vdd_on(intel_dp); - intel_edp_panel_off(intel_dp); - } + intel_disable_ddi_buf(encoder); - if (dig_port) - intel_display_power_put(dev_priv, dig_port->ddi_io_power_domain); + intel_edp_panel_vdd_on(intel_dp); + intel_edp_panel_off(intel_dp); - if (IS_CANNONLAKE(dev_priv)) - I915_WRITE(DPCLKA_CFGCR0, I915_READ(DPCLKA_CFGCR0) | - DPCLKA_CFGCR0_DDI_CLK_OFF(port)); - else if (IS_GEN9_BC(dev_priv)) - I915_WRITE(DPLL_CTRL2, (I915_READ(DPLL_CTRL2) | - DPLL_CTRL2_DDI_CLK_OFF(port))); - else if (INTEL_GEN(dev_priv) < 9) - I915_WRITE(PORT_CLK_SEL(port), PORT_CLK_SEL_NONE); + intel_display_power_put(dev_priv, dig_port->ddi_io_power_domain); - if (type == INTEL_OUTPUT_HDMI) { - struct intel_hdmi *intel_hdmi = enc_to_intel_hdmi(encoder); + intel_ddi_clk_disable(encoder); +} - intel_dp_dual_mode_set_tmds_output(intel_hdmi, false); - } +static void intel_ddi_post_disable_hdmi(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct intel_digital_port *dig_port = enc_to_dig_port(&encoder->base); + struct intel_hdmi *intel_hdmi = &dig_port->hdmi; + + intel_disable_ddi_buf(encoder); + + dig_port->set_infoframes(&encoder->base, false, + old_crtc_state, old_conn_state); + + intel_display_power_put(dev_priv, dig_port->ddi_io_power_domain); + + intel_ddi_clk_disable(encoder); + + intel_dp_dual_mode_set_tmds_output(intel_hdmi, false); +} + +static void intel_ddi_post_disable(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + /* + * old_crtc_state and old_conn_state are NULL when called from + * DP_MST. The main connector associated with this port is never + * bound to a crtc for MST. 
+ */ + if (old_crtc_state && + intel_crtc_has_type(old_crtc_state, INTEL_OUTPUT_HDMI)) + intel_ddi_post_disable_hdmi(encoder, + old_crtc_state, old_conn_state); + else + intel_ddi_post_disable_dp(encoder, + old_crtc_state, old_conn_state); } void intel_ddi_fdi_post_disable(struct intel_encoder *encoder, @@ -2334,7 +2368,8 @@ void intel_ddi_fdi_post_disable(struct intel_encoder *encoder, val &= ~FDI_RX_ENABLE; I915_WRITE(FDI_RX_CTL(PIPE_A), val); - intel_ddi_post_disable(encoder, old_crtc_state, old_conn_state); + intel_disable_ddi_buf(encoder); + intel_ddi_clk_disable(encoder); val = I915_READ(FDI_RX_MISC(PIPE_A)); val &= ~(FDI_RX_PWRDN_LANE1_MASK | FDI_RX_PWRDN_LANE0_MASK); @@ -2350,70 +2385,93 @@ void intel_ddi_fdi_post_disable(struct intel_encoder *encoder, I915_WRITE(FDI_RX_CTL(PIPE_A), val); } -static void intel_enable_ddi(struct intel_encoder *intel_encoder, - const struct intel_crtc_state *pipe_config, - const struct drm_connector_state *conn_state) +static void intel_enable_ddi_dp(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state, + const struct drm_connector_state *conn_state) { - struct drm_encoder *encoder = &intel_encoder->base; - struct drm_i915_private *dev_priv = to_i915(encoder->dev); - enum port port = intel_ddi_get_encoder_port(intel_encoder); - int type = intel_encoder->type; + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); + enum port port = intel_ddi_get_encoder_port(encoder); - if (type == INTEL_OUTPUT_HDMI) { - struct intel_digital_port *intel_dig_port = - enc_to_dig_port(encoder); - bool clock_ratio = pipe_config->hdmi_high_tmds_clock_ratio; - bool scrambling = pipe_config->hdmi_scrambling; - - intel_hdmi_handle_sink_scrambling(intel_encoder, - conn_state->connector, - clock_ratio, scrambling); - - /* In HDMI/DVI mode, the port width, and swing/emphasis values - * are ignored so nothing special needs to be done besides - * enabling the port. - */ - I915_WRITE(DDI_BUF_CTL(port), - intel_dig_port->saved_port_bits | - DDI_BUF_CTL_ENABLE); - } else if (type == INTEL_OUTPUT_EDP) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + if (port == PORT_A && INTEL_GEN(dev_priv) < 9) + intel_dp_stop_link_train(intel_dp); - if (port == PORT_A && INTEL_GEN(dev_priv) < 9) - intel_dp_stop_link_train(intel_dp); + intel_edp_backlight_on(crtc_state, conn_state); + intel_psr_enable(intel_dp, crtc_state); + intel_edp_drrs_enable(intel_dp, crtc_state); - intel_edp_backlight_on(pipe_config, conn_state); - intel_psr_enable(intel_dp, pipe_config); - intel_edp_drrs_enable(intel_dp, pipe_config); - } + if (crtc_state->has_audio) + intel_audio_codec_enable(encoder, crtc_state, conn_state); +} - if (pipe_config->has_audio) - intel_audio_codec_enable(intel_encoder, pipe_config, conn_state); +static void intel_enable_ddi_hdmi(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state, + const struct drm_connector_state *conn_state) +{ + struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct intel_digital_port *dig_port = enc_to_dig_port(&encoder->base); + enum port port = intel_ddi_get_encoder_port(encoder); + + intel_hdmi_handle_sink_scrambling(encoder, + conn_state->connector, + crtc_state->hdmi_high_tmds_clock_ratio, + crtc_state->hdmi_scrambling); + + /* In HDMI/DVI mode, the port width, and swing/emphasis values + * are ignored so nothing special needs to be done besides + * enabling the port. 
+ */ + I915_WRITE(DDI_BUF_CTL(port), + dig_port->saved_port_bits | DDI_BUF_CTL_ENABLE); + + if (crtc_state->has_audio) + intel_audio_codec_enable(encoder, crtc_state, conn_state); } -static void intel_disable_ddi(struct intel_encoder *intel_encoder, - const struct intel_crtc_state *old_crtc_state, - const struct drm_connector_state *old_conn_state) +static void intel_enable_ddi(struct intel_encoder *encoder, + const struct intel_crtc_state *crtc_state, + const struct drm_connector_state *conn_state) { - struct drm_encoder *encoder = &intel_encoder->base; - int type = intel_encoder->type; + if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) + intel_enable_ddi_hdmi(encoder, crtc_state, conn_state); + else + intel_enable_ddi_dp(encoder, crtc_state, conn_state); +} + +static void intel_disable_ddi_dp(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); if (old_crtc_state->has_audio) - intel_audio_codec_disable(intel_encoder); + intel_audio_codec_disable(encoder); - if (type == INTEL_OUTPUT_HDMI) { - intel_hdmi_handle_sink_scrambling(intel_encoder, - old_conn_state->connector, - false, false); - } + intel_edp_drrs_disable(intel_dp, old_crtc_state); + intel_psr_disable(intel_dp, old_crtc_state); + intel_edp_backlight_off(old_conn_state); +} + +static void intel_disable_ddi_hdmi(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + if (old_crtc_state->has_audio) + intel_audio_codec_disable(encoder); - if (type == INTEL_OUTPUT_EDP) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + intel_hdmi_handle_sink_scrambling(encoder, + old_conn_state->connector, + false, false); +} - intel_edp_drrs_disable(intel_dp, old_crtc_state); - intel_psr_disable(intel_dp, old_crtc_state); - intel_edp_backlight_off(old_conn_state); - } +static void intel_disable_ddi(struct intel_encoder *encoder, + const struct intel_crtc_state *old_crtc_state, + const struct drm_connector_state *old_conn_state) +{ + if (intel_crtc_has_type(old_crtc_state, INTEL_OUTPUT_HDMI)) + intel_disable_ddi_hdmi(encoder, old_crtc_state, old_conn_state); + else + intel_disable_ddi_dp(encoder, old_crtc_state, old_conn_state); } static void bxt_ddi_pre_pll_enable(struct intel_encoder *encoder, diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 7e91dc9a0fcf..f4a9a182868f 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -2847,7 +2847,7 @@ intel_find_initial_plane_obj(struct intel_crtc *intel_crtc, if (intel_plane_ggtt_offset(state) == plane_config->base) { fb = c->primary->fb; - drm_framebuffer_reference(fb); + drm_framebuffer_get(fb); goto valid_fb; } } @@ -2878,7 +2878,7 @@ valid_fb: intel_crtc->pipe, PTR_ERR(intel_state->vma)); intel_state->vma = NULL; - drm_framebuffer_unreference(fb); + drm_framebuffer_put(fb); return; } @@ -2899,7 +2899,7 @@ valid_fb: if (i915_gem_object_is_tiled(obj)) dev_priv->preserve_bios_swizzle = true; - drm_framebuffer_reference(fb); + drm_framebuffer_get(fb); primary->fb = primary->state->fb = fb; primary->crtc = primary->state->crtc = &intel_crtc->base; @@ -3289,7 +3289,6 @@ static void i9xx_update_primary_plane(struct intel_plane *primary, const struct intel_plane_state *plane_state) { struct drm_i915_private *dev_priv = to_i915(primary->base.dev); - struct intel_crtc *crtc = 
to_intel_crtc(crtc_state->base.crtc); const struct drm_framebuffer *fb = plane_state->base.fb; enum plane plane = primary->plane; u32 linear_offset; @@ -3298,16 +3297,14 @@ static void i9xx_update_primary_plane(struct intel_plane *primary, int x = plane_state->main.x; int y = plane_state->main.y; unsigned long irqflags; + u32 dspaddr_offset; linear_offset = intel_fb_xy_to_linear(x, y, plane_state, 0); if (INTEL_GEN(dev_priv) >= 4) - crtc->dspaddr_offset = plane_state->main.offset; + dspaddr_offset = plane_state->main.offset; else - crtc->dspaddr_offset = linear_offset; - - crtc->adjusted_x = x; - crtc->adjusted_y = y; + dspaddr_offset = linear_offset; spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); @@ -3333,18 +3330,18 @@ static void i9xx_update_primary_plane(struct intel_plane *primary, if (IS_HASWELL(dev_priv) || IS_BROADWELL(dev_priv)) { I915_WRITE_FW(DSPSURF(plane), intel_plane_ggtt_offset(plane_state) + - crtc->dspaddr_offset); + dspaddr_offset); I915_WRITE_FW(DSPOFFSET(plane), (y << 16) | x); } else if (INTEL_GEN(dev_priv) >= 4) { I915_WRITE_FW(DSPSURF(plane), intel_plane_ggtt_offset(plane_state) + - crtc->dspaddr_offset); + dspaddr_offset); I915_WRITE_FW(DSPTILEOFF(plane), (y << 16) | x); I915_WRITE_FW(DSPLINOFF(plane), linear_offset); } else { I915_WRITE_FW(DSPADDR(plane), intel_plane_ggtt_offset(plane_state) + - crtc->dspaddr_offset); + dspaddr_offset); } POSTING_READ_FW(reg); @@ -3544,100 +3541,6 @@ u32 skl_plane_ctl(const struct intel_crtc_state *crtc_state, return plane_ctl; } -static void skylake_update_primary_plane(struct intel_plane *plane, - const struct intel_crtc_state *crtc_state, - const struct intel_plane_state *plane_state) -{ - struct drm_i915_private *dev_priv = to_i915(plane->base.dev); - struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc); - const struct drm_framebuffer *fb = plane_state->base.fb; - enum plane_id plane_id = plane->id; - enum pipe pipe = plane->pipe; - u32 plane_ctl = plane_state->ctl; - unsigned int rotation = plane_state->base.rotation; - u32 stride = skl_plane_stride(fb, 0, rotation); - u32 aux_stride = skl_plane_stride(fb, 1, rotation); - u32 surf_addr = plane_state->main.offset; - int scaler_id = plane_state->scaler_id; - int src_x = plane_state->main.x; - int src_y = plane_state->main.y; - int src_w = drm_rect_width(&plane_state->base.src) >> 16; - int src_h = drm_rect_height(&plane_state->base.src) >> 16; - int dst_x = plane_state->base.dst.x1; - int dst_y = plane_state->base.dst.y1; - int dst_w = drm_rect_width(&plane_state->base.dst); - int dst_h = drm_rect_height(&plane_state->base.dst); - unsigned long irqflags; - - /* Sizes are 0 based */ - src_w--; - src_h--; - dst_w--; - dst_h--; - - crtc->dspaddr_offset = surf_addr; - - crtc->adjusted_x = src_x; - crtc->adjusted_y = src_y; - - spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - - if (IS_GEMINILAKE(dev_priv) || IS_CANNONLAKE(dev_priv)) { - I915_WRITE_FW(PLANE_COLOR_CTL(pipe, plane_id), - PLANE_COLOR_PIPE_GAMMA_ENABLE | - PLANE_COLOR_PIPE_CSC_ENABLE | - PLANE_COLOR_PLANE_GAMMA_DISABLE); - } - - I915_WRITE_FW(PLANE_CTL(pipe, plane_id), plane_ctl); - I915_WRITE_FW(PLANE_OFFSET(pipe, plane_id), (src_y << 16) | src_x); - I915_WRITE_FW(PLANE_STRIDE(pipe, plane_id), stride); - I915_WRITE_FW(PLANE_SIZE(pipe, plane_id), (src_h << 16) | src_w); - I915_WRITE_FW(PLANE_AUX_DIST(pipe, plane_id), - (plane_state->aux.offset - surf_addr) | aux_stride); - I915_WRITE_FW(PLANE_AUX_OFFSET(pipe, plane_id), - (plane_state->aux.y << 16) | plane_state->aux.x); - - if (scaler_id >= 0) { - 
uint32_t ps_ctrl = 0; - - WARN_ON(!dst_w || !dst_h); - ps_ctrl = PS_SCALER_EN | PS_PLANE_SEL(plane_id) | - crtc_state->scaler_state.scalers[scaler_id].mode; - I915_WRITE_FW(SKL_PS_CTRL(pipe, scaler_id), ps_ctrl); - I915_WRITE_FW(SKL_PS_PWR_GATE(pipe, scaler_id), 0); - I915_WRITE_FW(SKL_PS_WIN_POS(pipe, scaler_id), (dst_x << 16) | dst_y); - I915_WRITE_FW(SKL_PS_WIN_SZ(pipe, scaler_id), (dst_w << 16) | dst_h); - I915_WRITE_FW(PLANE_POS(pipe, plane_id), 0); - } else { - I915_WRITE_FW(PLANE_POS(pipe, plane_id), (dst_y << 16) | dst_x); - } - - I915_WRITE_FW(PLANE_SURF(pipe, plane_id), - intel_plane_ggtt_offset(plane_state) + surf_addr); - - POSTING_READ_FW(PLANE_SURF(pipe, plane_id)); - - spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -} - -static void skylake_disable_primary_plane(struct intel_plane *primary, - struct intel_crtc *crtc) -{ - struct drm_i915_private *dev_priv = to_i915(primary->base.dev); - enum plane_id plane_id = primary->id; - enum pipe pipe = primary->pipe; - unsigned long irqflags; - - spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); - - I915_WRITE_FW(PLANE_CTL(pipe, plane_id), 0); - I915_WRITE_FW(PLANE_SURF(pipe, plane_id), 0); - POSTING_READ_FW(PLANE_SURF(pipe, plane_id)); - - spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); -} - static int __intel_display_resume(struct drm_device *dev, struct drm_atomic_state *state, @@ -3773,6 +3676,7 @@ void intel_finish_reset(struct drm_i915_private *dev_priv) intel_pps_unlock_regs_wa(dev_priv); intel_modeset_init_hw(dev); + intel_init_clock_gating(dev_priv); spin_lock_irq(&dev_priv->irq_lock); if (dev_priv->display.hpd_irq_setup) @@ -6139,6 +6043,19 @@ struct intel_connector *intel_connector_alloc(void) return connector; } +/* + * Free the bits allocated by intel_connector_alloc. + * This should only be used after intel_connector_alloc has returned + * successfully, and before drm_connector_init returns successfully. + * Otherwise the destroy callbacks for the connector and the state should + * take care of proper cleanup/free + */ +void intel_connector_free(struct intel_connector *connector) +{ + kfree(to_intel_digital_connector_state(connector->base.state)); + kfree(connector); +} + /* Simple connector->get_hw_state implementation for encoders that support only * one connector and no cloning and hence the encoder state determines the state * of the connector. */ @@ -6522,11 +6439,9 @@ static void i9xx_update_pll_dividers(struct intel_crtc *crtc, crtc_state->dpll_hw_state.fp0 = fp; - crtc->lowfreq_avail = false; if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_LVDS) && reduced_clock) { crtc_state->dpll_hw_state.fp1 = fp2; - crtc->lowfreq_avail = true; } else { crtc_state->dpll_hw_state.fp1 = fp; } @@ -7221,15 +7136,6 @@ static void i9xx_set_pipeconf(struct intel_crtc *intel_crtc) } } - if (HAS_PIPE_CXSR(dev_priv)) { - if (intel_crtc->lowfreq_avail) { - DRM_DEBUG_KMS("enabling CxSR downclocking\n"); - pipeconf |= PIPECONF_CXSR_DOWNCLOCK; - } else { - DRM_DEBUG_KMS("disabling CxSR downclocking\n"); - } - } - if (intel_crtc->config->base.adjusted_mode.flags & DRM_MODE_FLAG_INTERLACE) { if (INTEL_GEN(dev_priv) < 4 || intel_crtc_has_type(intel_crtc->config, INTEL_OUTPUT_SDVO)) @@ -8365,8 +8271,6 @@ static int ironlake_crtc_compute_clock(struct intel_crtc *crtc, memset(&crtc_state->dpll_hw_state, 0, sizeof(crtc_state->dpll_hw_state)); - crtc->lowfreq_avail = false; - /* CPU eDP is the only output that doesn't need a PCH PLL of its own. 
*/ if (!crtc_state->has_pch_encoder) return 0; @@ -9025,8 +8929,6 @@ static int haswell_crtc_compute_clock(struct intel_crtc *crtc, } } - crtc->lowfreq_avail = false; - return 0; } @@ -9846,7 +9748,7 @@ mode_fits_in_fbdev(struct drm_device *dev, if (obj->base.size < mode->vdisplay * fb->pitches[0]) return NULL; - drm_framebuffer_reference(fb); + drm_framebuffer_get(fb); return fb; #else return NULL; @@ -10027,7 +9929,7 @@ found: if (ret) goto fail; - drm_framebuffer_unreference(fb); + drm_framebuffer_put(fb); ret = drm_atomic_set_mode_for_crtc(&crtc_state->base, mode); if (ret) @@ -10662,6 +10564,52 @@ intel_dump_m_n_config(struct intel_crtc_state *pipe_config, char *id, m_n->link_m, m_n->link_n, m_n->tu); } +#define OUTPUT_TYPE(x) [INTEL_OUTPUT_ ## x] = #x + +static const char * const output_type_str[] = { + OUTPUT_TYPE(UNUSED), + OUTPUT_TYPE(ANALOG), + OUTPUT_TYPE(DVO), + OUTPUT_TYPE(SDVO), + OUTPUT_TYPE(LVDS), + OUTPUT_TYPE(TVOUT), + OUTPUT_TYPE(HDMI), + OUTPUT_TYPE(DP), + OUTPUT_TYPE(EDP), + OUTPUT_TYPE(DSI), + OUTPUT_TYPE(UNKNOWN), + OUTPUT_TYPE(DP_MST), +}; + +#undef OUTPUT_TYPE + +static void snprintf_output_types(char *buf, size_t len, + unsigned int output_types) +{ + char *str = buf; + int i; + + str[0] = '\0'; + + for (i = 0; i < ARRAY_SIZE(output_type_str); i++) { + int r; + + if ((output_types & BIT(i)) == 0) + continue; + + r = snprintf(str, len, "%s%s", + str != buf ? "," : "", output_type_str[i]); + if (r >= len) + break; + str += r; + len -= r; + + output_types &= ~BIT(i); + } + + WARN_ON_ONCE(output_types != 0); +} + static void intel_dump_pipe_config(struct intel_crtc *crtc, struct intel_crtc_state *pipe_config, const char *context) @@ -10672,10 +10620,15 @@ static void intel_dump_pipe_config(struct intel_crtc *crtc, struct intel_plane *intel_plane; struct intel_plane_state *state; struct drm_framebuffer *fb; + char buf[64]; DRM_DEBUG_KMS("[CRTC:%d:%s]%s\n", crtc->base.base.id, crtc->base.name, context); + snprintf_output_types(buf, sizeof(buf), pipe_config->output_types); + DRM_DEBUG_KMS("output_types: %s (0x%x)\n", + buf, pipe_config->output_types); + DRM_DEBUG_KMS("cpu_transcoder: %s, pipe bpp: %i, dithering: %i\n", transcoder_name(pipe_config->cpu_transcoder), pipe_config->pipe_bpp, pipe_config->dither); @@ -13229,8 +13182,8 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) num_formats = ARRAY_SIZE(skl_primary_formats); modifiers = skl_format_modifiers_ccs; - primary->update_plane = skylake_update_primary_plane; - primary->disable_plane = skylake_disable_primary_plane; + primary->update_plane = skl_update_plane; + primary->disable_plane = skl_disable_plane; } else if (INTEL_GEN(dev_priv) >= 9) { intel_primary_formats = skl_primary_formats; num_formats = ARRAY_SIZE(skl_primary_formats); @@ -13239,8 +13192,8 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) else modifiers = skl_format_modifiers_noccs; - primary->update_plane = skylake_update_primary_plane; - primary->disable_plane = skylake_disable_primary_plane; + primary->update_plane = skl_update_plane; + primary->disable_plane = skl_disable_plane; } else if (INTEL_GEN(dev_priv) >= 4) { intel_primary_formats = i965_primary_formats; num_formats = ARRAY_SIZE(i965_primary_formats); @@ -14398,8 +14351,6 @@ void intel_modeset_init_hw(struct drm_device *dev) intel_update_cdclk(dev_priv); dev_priv->cdclk.logical = dev_priv->cdclk.actual = dev_priv->cdclk.hw; - - intel_init_clock_gating(dev_priv); } /* @@ -15111,6 +15062,15 @@ intel_modeset_setup_hw_state(struct 
drm_device *dev, struct intel_encoder *encoder; int i; + if (IS_HASWELL(dev_priv)) { + /* + * WaRsPkgCStateDisplayPMReq:hsw + * System hang if this isn't done before disabling all planes! + */ + I915_WRITE(CHICKEN_PAR1_1, + I915_READ(CHICKEN_PAR1_1) | FORCE_ARB_IDLE_PLANES); + } + intel_modeset_readout_hw_state(dev); /* HW state is read out, now we need to sanitize this mess. */ @@ -15208,6 +15168,8 @@ void intel_modeset_gem_init(struct drm_device *dev) intel_init_gt_powersave(dev_priv); + intel_init_clock_gating(dev_priv); + intel_setup_overlay(dev_priv); } diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index b0f446b68f42..aa75f55eeb61 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1007,7 +1007,7 @@ static uint32_t g4x_get_aux_send_ctl(struct intel_dp *intel_dp, else precharge = 5; - if (IS_BROADWELL(dev_priv) && intel_dig_port->port == PORT_A) + if (IS_BROADWELL(dev_priv)) timeout = DP_AUX_CH_CTL_TIME_OUT_600us; else timeout = DP_AUX_CH_CTL_TIME_OUT_400us; @@ -1032,7 +1032,7 @@ static uint32_t skl_get_aux_send_ctl(struct intel_dp *intel_dp, DP_AUX_CH_CTL_DONE | (has_aux_irq ? DP_AUX_CH_CTL_INTERRUPT : 0) | DP_AUX_CH_CTL_TIME_OUT_ERROR | - DP_AUX_CH_CTL_TIME_OUT_1600us | + DP_AUX_CH_CTL_TIME_OUT_MAX | DP_AUX_CH_CTL_RECEIVE_ERROR | (send_bytes << DP_AUX_CH_CTL_MESSAGE_SIZE_SHIFT) | DP_AUX_CH_CTL_FW_SYNC_PULSE_SKL(32) | @@ -1832,6 +1832,8 @@ found: if (!HAS_DDI(dev_priv)) intel_dp_set_clock(encoder, pipe_config); + intel_psr_compute_config(intel_dp, pipe_config); + return true; } @@ -3153,9 +3155,7 @@ intel_dp_voltage_max(struct intel_dp *intel_dp) struct drm_i915_private *dev_priv = to_i915(intel_dp_to_dev(intel_dp)); enum port port = dp_to_dig_port(intel_dp)->port; - if (IS_GEN9_LP(dev_priv)) - return DP_TRAIN_VOLTAGE_SWING_LEVEL_3; - else if (INTEL_GEN(dev_priv) >= 9) { + if (INTEL_GEN(dev_priv) >= 9) { struct intel_encoder *encoder = &dp_to_dig_port(intel_dp)->base; return intel_ddi_dp_voltage_max(encoder); } else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 3c131e2544cf..772521440a9f 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -454,32 +454,52 @@ static struct drm_connector *intel_dp_add_mst_connector(struct drm_dp_mst_topolo struct intel_dp *intel_dp = container_of(mgr, struct intel_dp, mst_mgr); struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp); struct drm_device *dev = intel_dig_port->base.base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); struct intel_connector *intel_connector; struct drm_connector *connector; - int i; + enum pipe pipe; + int ret; intel_connector = intel_connector_alloc(); if (!intel_connector) return NULL; connector = &intel_connector->base; - drm_connector_init(dev, connector, &intel_dp_mst_connector_funcs, DRM_MODE_CONNECTOR_DisplayPort); + ret = drm_connector_init(dev, connector, &intel_dp_mst_connector_funcs, + DRM_MODE_CONNECTOR_DisplayPort); + if (ret) { + intel_connector_free(intel_connector); + return NULL; + } + drm_connector_helper_add(connector, &intel_dp_mst_connector_helper_funcs); intel_connector->get_hw_state = intel_dp_mst_get_hw_state; intel_connector->mst_port = intel_dp; intel_connector->port = port; - for (i = PIPE_A; i <= PIPE_C; i++) { - drm_mode_connector_attach_encoder(&intel_connector->base, - &intel_dp->mst_encoders[i]->base.base); + for_each_pipe(dev_priv, pipe) { + struct drm_encoder 
*enc = + &intel_dp->mst_encoders[pipe]->base.base; + + ret = drm_mode_connector_attach_encoder(&intel_connector->base, + enc); + if (ret) + goto err; } drm_object_attach_property(&connector->base, dev->mode_config.path_property, 0); drm_object_attach_property(&connector->base, dev->mode_config.tile_property, 0); - drm_mode_connector_set_path_property(connector, pathprop); + ret = drm_mode_connector_set_path_property(connector, pathprop); + if (ret) + goto err; + return connector; + +err: + drm_connector_cleanup(connector); + return NULL; } static void intel_dp_register_mst_connector(struct drm_connector *connector) @@ -569,11 +589,12 @@ intel_dp_create_fake_mst_encoder(struct intel_digital_port *intel_dig_port, enum static bool intel_dp_create_fake_mst_encoders(struct intel_digital_port *intel_dig_port) { - int i; struct intel_dp *intel_dp = &intel_dig_port->dp; + struct drm_i915_private *dev_priv = to_i915(intel_dig_port->base.base.dev); + enum pipe pipe; - for (i = PIPE_A; i <= PIPE_C; i++) - intel_dp->mst_encoders[i] = intel_dp_create_fake_mst_encoder(intel_dig_port, i); + for_each_pipe(dev_priv, pipe) + intel_dp->mst_encoders[pipe] = intel_dp_create_fake_mst_encoder(intel_dig_port, pipe); return true; } diff --git a/drivers/gpu/drm/i915/intel_dpll_mgr.c b/drivers/gpu/drm/i915/intel_dpll_mgr.c index a2a3d93d67bd..df808a94c511 100644 --- a/drivers/gpu/drm/i915/intel_dpll_mgr.c +++ b/drivers/gpu/drm/i915/intel_dpll_mgr.c @@ -1996,7 +1996,7 @@ static void cnl_ddi_pll_enable(struct drm_i915_private *dev_priv, /* 3. Configure DPLL_CFGCR0 */ /* Avoid touch CFGCR1 if HDMI mode is not enabled */ - if (pll->state.hw_state.cfgcr0 & DPLL_CTRL1_HDMI_MODE(pll->id)) { + if (pll->state.hw_state.cfgcr0 & DPLL_CFGCR0_HDMI_MODE) { val = pll->state.hw_state.cfgcr1; I915_WRITE(CNL_DPLL_CFGCR1(pll->id), val); /* 4. Reab back to ensure writes completed */ diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index b87946dcc53f..47d022d48718 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -718,6 +718,9 @@ struct intel_crtc_state { struct intel_link_m_n dp_m2_n2; bool has_drrs; + bool has_psr; + bool has_psr2; + /* * Frequence the dpll for the port should run at. Differs from the * adjusted dotclock e.g. for DP or 12bpc hdmi mode. This is also @@ -800,18 +803,10 @@ struct intel_crtc { * some outputs connected to this crtc. */ bool active; - bool lowfreq_avail; u8 plane_ids_mask; unsigned long long enabled_power_domains; struct intel_overlay *overlay; - /* Display surface base address adjustement for pageflips. Note that on - * gen4+ this only adjusts up to a tile, offsets within a tile are - * handled in the hw itself (with the TILEOFF register). 
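/*
 * The MST connector path above is the goto-unwind idiom: once
 * drm_connector_init() has succeeded, every later failure must go
 * through drm_connector_cleanup() instead of a plain free. Condensed
 * sketch of the shape (error handling only, details elided):
 */
	ret = drm_connector_init(dev, connector,
				 &intel_dp_mst_connector_funcs,
				 DRM_MODE_CONNECTOR_DisplayPort);
	if (ret) {
		/* nothing registered yet, freeing the allocation suffices */
		intel_connector_free(intel_connector);
		return NULL;
	}

	ret = drm_mode_connector_set_path_property(connector, pathprop);
	if (ret)
		goto err; /* connector is live, needs full teardown */

	return connector;

err:
	drm_connector_cleanup(connector);
	return NULL;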
*/ - u32 dspaddr_offset; - int adjusted_x; - int adjusted_y; - struct intel_crtc_state *config; /* global reset count when the last flip was submitted */ @@ -1066,7 +1061,7 @@ struct intel_digital_port { void (*write_infoframe)(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len); void (*set_infoframes)(struct drm_encoder *encoder, bool enable, @@ -1360,6 +1355,7 @@ void intel_pps_unlock_regs_wa(struct drm_i915_private *dev_priv); void intel_encoder_destroy(struct drm_encoder *encoder); int intel_connector_init(struct intel_connector *); struct intel_connector *intel_connector_alloc(void); +void intel_connector_free(struct intel_connector *connector); bool intel_connector_get_hw_state(struct intel_connector *connector); void intel_connector_attach_encoder(struct intel_connector *connector, struct intel_encoder *encoder); @@ -1764,6 +1760,8 @@ void intel_psr_flush(struct drm_i915_private *dev_priv, void intel_psr_init(struct drm_i915_private *dev_priv); void intel_psr_single_frame_update(struct drm_i915_private *dev_priv, unsigned frontbuffer_bits); +void intel_psr_compute_config(struct intel_dp *intel_dp, + struct intel_crtc_state *crtc_state); /* intel_runtime_pm.c */ int intel_power_domains_init(struct drm_i915_private *); @@ -1923,6 +1921,10 @@ int intel_sprite_set_colorkey(struct drm_device *dev, void *data, struct drm_file *file_priv); void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state); void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state); +void skl_update_plane(struct intel_plane *plane, + const struct intel_crtc_state *crtc_state, + const struct intel_plane_state *plane_state); +void skl_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc); /* intel_tv.c */ void intel_tv_init(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c index 66bbedc5fa01..83f15848098a 100644 --- a/drivers/gpu/drm/i915/intel_dsi.c +++ b/drivers/gpu/drm/i915/intel_dsi.c @@ -1751,42 +1751,13 @@ void intel_dsi_init(struct drm_i915_private *dev_priv) else intel_encoder->crtc_mask = BIT(PIPE_B); - if (dev_priv->vbt.dsi.config->dual_link) { + if (dev_priv->vbt.dsi.config->dual_link) intel_dsi->ports = BIT(PORT_A) | BIT(PORT_C); - - switch (dev_priv->vbt.dsi.config->dl_dcs_backlight_ports) { - case DL_DCS_PORT_A: - intel_dsi->dcs_backlight_ports = BIT(PORT_A); - break; - case DL_DCS_PORT_C: - intel_dsi->dcs_backlight_ports = BIT(PORT_C); - break; - default: - case DL_DCS_PORT_A_AND_C: - intel_dsi->dcs_backlight_ports = BIT(PORT_A) | BIT(PORT_C); - break; - } - - switch (dev_priv->vbt.dsi.config->dl_dcs_cabc_ports) { - case DL_DCS_PORT_A: - intel_dsi->dcs_cabc_ports = BIT(PORT_A); - break; - case DL_DCS_PORT_C: - intel_dsi->dcs_cabc_ports = BIT(PORT_C); - break; - default: - case DL_DCS_PORT_A_AND_C: - intel_dsi->dcs_cabc_ports = BIT(PORT_A) | BIT(PORT_C); - break; - } - } else { + else intel_dsi->ports = BIT(port); - intel_dsi->dcs_backlight_ports = BIT(port); - intel_dsi->dcs_cabc_ports = BIT(port); - } - if (!dev_priv->vbt.dsi.config->cabc_supported) - intel_dsi->dcs_cabc_ports = 0; + intel_dsi->dcs_backlight_ports = dev_priv->vbt.dsi.bl_ports; + intel_dsi->dcs_cabc_ports = dev_priv->vbt.dsi.cabc_ports; /* Create a DSI host (and a device) for each port. 
*/ for_each_dsi_port(port, intel_dsi->ports) { diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index a59b2a30ff5a..ab5bf4e2e28e 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -25,6 +25,7 @@ #include <drm/drm_print.h> #include "i915_drv.h" +#include "i915_vgpu.h" #include "intel_ringbuffer.h" #include "intel_lrc.h" @@ -386,10 +387,6 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine) static bool csb_force_mmio(struct drm_i915_private *i915) { - /* GVT emulation depends upon intercepting CSB mmio */ - if (intel_vgpu_active(i915)) - return true; - /* * IOMMU adds unpredictable latency causing the CSB write (from the * GPU into the HWSP) to only be visible some time after the interrupt @@ -398,6 +395,10 @@ static bool csb_force_mmio(struct drm_i915_private *i915) if (intel_vtd_active()) return true; + /* Older GVT emulation depends upon intercepting CSB mmio */ + if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915)) + return true; + return false; } @@ -1252,9 +1253,12 @@ static int bxt_init_workarounds(struct intel_engine_cs *engine) } /* WaProgramL3SqcReg1DefaultForPerf:bxt */ - if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) - I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) | - L3_HIGH_PRIO_CREDITS(2)); + if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER)) { + u32 val = I915_READ(GEN8_L3SQCREG1); + val &= ~L3_PRIO_CREDITS_MASK; + val |= L3_GENERAL_PRIO_CREDITS(62) | L3_HIGH_PRIO_CREDITS(2); + I915_WRITE(GEN8_L3SQCREG1, val); + } /* WaToEnableHwFixForPushConstHWBug:bxt */ if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER)) @@ -1544,8 +1548,8 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine) if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) return false; - /* Both ports drained, no more ELSP submission? */ - if (port_request(&engine->execlists.port[0])) + /* Waiting to drain ELSP? */ + if (READ_ONCE(engine->execlists.active)) return false; /* ELSP is empty, but there are ready requests? */ @@ -1622,8 +1626,10 @@ static void print_request(struct drm_printer *m, struct drm_i915_gem_request *rq, const char *prefix) { - drm_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix, - rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno, + drm_printf(m, "%s%x%s [%x:%x] prio=%d @ %dms: %s\n", prefix, + rq->global_seqno, + i915_gem_request_completed(rq) ? "!" 
: "", + rq->ctx->hw_id, rq->fence.seqno, rq->priotree.priority, jiffies_to_msecs(jiffies - rq->emitted_jiffies), rq->timeline->common->name); @@ -1631,8 +1637,9 @@ static void print_request(struct drm_printer *m, void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m) { - struct intel_breadcrumbs *b = &engine->breadcrumbs; - struct i915_gpu_error *error = &engine->i915->gpu_error; + struct intel_breadcrumbs * const b = &engine->breadcrumbs; + const struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_gpu_error * const error = &engine->i915->gpu_error; struct drm_i915_private *dev_priv = engine->i915; struct drm_i915_gem_request *rq; struct rb_node *rb; @@ -1696,7 +1703,6 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m) if (i915_modparams.enable_execlists) { const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; - struct intel_engine_execlists * const execlists = &engine->execlists; u32 ptr, read, write; unsigned int idx; @@ -1743,18 +1749,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m) idx); } } + drm_printf(m, "\t\tHW active? 0x%x\n", execlists->active); rcu_read_unlock(); - - spin_lock_irq(&engine->timeline->lock); - for (rb = execlists->first; rb; rb = rb_next(rb)) { - struct i915_priolist *p = - rb_entry(rb, typeof(*p), node); - - list_for_each_entry(rq, &p->requests, - priotree.link) - print_request(m, rq, "\t\tQ "); - } - spin_unlock_irq(&engine->timeline->lock); } else if (INTEL_GEN(dev_priv) > 6) { drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n", I915_READ(RING_PP_DIR_BASE(engine))); @@ -1764,6 +1760,18 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m) I915_READ(RING_PP_DIR_DCLV(engine))); } + spin_lock_irq(&engine->timeline->lock); + list_for_each_entry(rq, &engine->timeline->requests, link) + print_request(m, rq, "\t\tE "); + for (rb = execlists->first; rb; rb = rb_next(rb)) { + struct i915_priolist *p = + rb_entry(rb, typeof(*p), node); + + list_for_each_entry(rq, &p->requests, priotree.link) + print_request(m, rq, "\t\tQ "); + } + spin_unlock_irq(&engine->timeline->lock); + spin_lock_irq(&b->rb_lock); for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) { struct intel_wait *w = rb_entry(rb, typeof(*w), node); diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c index 8e3a05505f49..1a0f5e0c8d10 100644 --- a/drivers/gpu/drm/i915/intel_fbc.c +++ b/drivers/gpu/drm/i915/intel_fbc.c @@ -69,9 +69,9 @@ static inline bool no_fbc_on_multiple_pipes(struct drm_i915_private *dev_priv) * address we program because it starts at the real start of the buffer, so we * have to take this into consideration here. 
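/*
 * With adjusted_x/y moving off the crtc, FBC snapshots the plane's
 * source Y and the tile-aligned Y it actually programs at atomic-check
 * time, and the fence Y offset becomes pure state-cache arithmetic.
 * Worked example with assumed numbers (not taken from the patch):
 * src.y1 >> 16 == 100 and a tile-aligned main.y == 96 give an offset
 * of 4 scanlines:
 *
 *	cache->plane.y          = plane_state->base.src.y1 >> 16;   // 100
 *	cache->plane.adjusted_y = plane_state->main.y;               //  96
 *	fence_y_offset = cache->plane.y - cache->plane.adjusted_y;   //   4
 */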
*/ -static unsigned int get_crtc_fence_y_offset(struct intel_crtc *crtc) +static unsigned int get_crtc_fence_y_offset(struct intel_fbc *fbc) { - return crtc->base.y - crtc->adjusted_y; + return fbc->state_cache.plane.y - fbc->state_cache.plane.adjusted_y; } /* @@ -727,8 +727,8 @@ static bool intel_fbc_hw_tracking_covers_screen(struct intel_crtc *crtc) intel_fbc_get_plane_source_size(&fbc->state_cache, &effective_w, &effective_h); - effective_w += crtc->adjusted_x; - effective_h += crtc->adjusted_y; + effective_w += fbc->state_cache.plane.adjusted_x; + effective_h += fbc->state_cache.plane.adjusted_y; return effective_w <= max_w && effective_h <= max_h; } @@ -757,6 +757,9 @@ static void intel_fbc_update_state_cache(struct intel_crtc *crtc, cache->plane.src_w = drm_rect_width(&plane_state->base.src) >> 16; cache->plane.src_h = drm_rect_height(&plane_state->base.src) >> 16; cache->plane.visible = plane_state->base.visible; + cache->plane.adjusted_x = plane_state->main.x; + cache->plane.adjusted_y = plane_state->main.y; + cache->plane.y = plane_state->base.src.y1 >> 16; if (!cache->plane.visible) return; @@ -888,7 +891,7 @@ static void intel_fbc_get_reg_params(struct intel_crtc *crtc, params->crtc.pipe = crtc->pipe; params->crtc.plane = crtc->plane; - params->crtc.fence_y_offset = get_crtc_fence_y_offset(crtc); + params->crtc.fence_y_offset = get_crtc_fence_y_offset(fbc); params->fb.format = cache->fb.format; params->fb.stride = cache->fb.stride; diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index f2bb8116227c..b8af35187d22 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -189,7 +189,7 @@ static int intelfb_create(struct drm_fb_helper *helper, " releasing it\n", intel_fb->base.width, intel_fb->base.height, sizes->fb_width, sizes->fb_height); - drm_framebuffer_unreference(&intel_fb->base); + drm_framebuffer_put(&intel_fb->base); intel_fb = ifbdev->fb = NULL; } if (!intel_fb || WARN_ON(!intel_fb->obj)) { @@ -627,7 +627,7 @@ static bool intel_fbdev_init_bios(struct drm_device *dev, ifbdev->preferred_bpp = fb->base.format->cpp[0] * 8; ifbdev->fb = fb; - drm_framebuffer_reference(&ifbdev->fb->base); + drm_framebuffer_get(&ifbdev->fb->base); /* Final pass to check if any active pipes don't have fbs */ for_each_crtc(dev, crtc) { diff --git a/drivers/gpu/drm/i915/intel_guc.c b/drivers/gpu/drm/i915/intel_guc.c index 9e18c4fb9909..10037c0fdf95 100644 --- a/drivers/gpu/drm/i915/intel_guc.c +++ b/drivers/gpu/drm/i915/intel_guc.c @@ -67,6 +67,99 @@ void intel_guc_init_early(struct intel_guc *guc) guc->notify = gen8_guc_raise_irq; } +static u32 get_gt_type(struct drm_i915_private *dev_priv) +{ + /* XXX: GT type based on PCI device ID? field seems unused by fw */ + return 0; +} + +static u32 get_core_family(struct drm_i915_private *dev_priv) +{ + u32 gen = INTEL_GEN(dev_priv); + + switch (gen) { + case 9: + return GUC_CORE_FAMILY_GEN9; + + default: + MISSING_CASE(gen); + return GUC_CORE_FAMILY_UNKNOWN; + } +} + +/* + * Initialise the GuC parameter block before starting the firmware + * transfer. These parameters are read by the firmware on startup + * and cannot be changed thereafter. 
+ */ +void intel_guc_init_params(struct intel_guc *guc) +{ + struct drm_i915_private *dev_priv = guc_to_i915(guc); + u32 params[GUC_CTL_MAX_DWORDS]; + int i; + + memset(params, 0, sizeof(params)); + + params[GUC_CTL_DEVICE_INFO] |= + (get_gt_type(dev_priv) << GUC_CTL_GT_TYPE_SHIFT) | + (get_core_family(dev_priv) << GUC_CTL_CORE_FAMILY_SHIFT); + + /* + * GuC ARAT increment is 10 ns. GuC default scheduler quantum is one + * second. This ARAT is calculated by: + * Scheduler-Quantum-in-ns / ARAT-increment-in-ns = 1000000000 / 10 + */ + params[GUC_CTL_ARAT_HIGH] = 0; + params[GUC_CTL_ARAT_LOW] = 100000000; + + params[GUC_CTL_WA] |= GUC_CTL_WA_UK_BY_DRIVER; + + params[GUC_CTL_FEATURE] |= GUC_CTL_DISABLE_SCHEDULER | + GUC_CTL_VCS2_ENABLED; + + params[GUC_CTL_LOG_PARAMS] = guc->log.flags; + + if (i915_modparams.guc_log_level >= 0) { + params[GUC_CTL_DEBUG] = + i915_modparams.guc_log_level << GUC_LOG_VERBOSITY_SHIFT; + } else { + params[GUC_CTL_DEBUG] = GUC_LOG_DISABLED; + } + + /* If GuC submission is enabled, set up additional parameters here */ + if (i915_modparams.enable_guc_submission) { + u32 ads = guc_ggtt_offset(guc->ads_vma) >> PAGE_SHIFT; + u32 pgs = guc_ggtt_offset(dev_priv->guc.stage_desc_pool); + u32 ctx_in_16 = GUC_MAX_STAGE_DESCRIPTORS / 16; + + params[GUC_CTL_DEBUG] |= ads << GUC_ADS_ADDR_SHIFT; + params[GUC_CTL_DEBUG] |= GUC_ADS_ENABLED; + + pgs >>= PAGE_SHIFT; + params[GUC_CTL_CTXINFO] = (pgs << GUC_CTL_BASE_ADDR_SHIFT) | + (ctx_in_16 << GUC_CTL_CTXNUM_IN16_SHIFT); + + params[GUC_CTL_FEATURE] |= GUC_CTL_KERNEL_SUBMISSIONS; + + /* Unmask this bit to enable the GuC's internal scheduler */ + params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER; + } + + /* + * All SOFT_SCRATCH registers are in FORCEWAKE_BLITTER domain and + * they are power context saved so it's ok to release forcewake + * when we are done here and take it again at xfer time. + */ + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_BLITTER); + + I915_WRITE(SOFT_SCRATCH(0), 0); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + I915_WRITE(SOFT_SCRATCH(1 + i), params[i]); + + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_BLITTER); +} + int intel_guc_send_nop(struct intel_guc *guc, const u32 *action, u32 len) { WARN(1, "Unexpected send: action=%#x\n", *action); @@ -263,3 +356,14 @@ err: i915_gem_object_put(obj); return vma; } + +u32 intel_guc_wopcm_size(struct drm_i915_private *dev_priv) +{ + u32 wopcm_size = GUC_WOPCM_TOP; + + /* On BXT, the top of WOPCM is reserved for RC6 context */ + if (IS_GEN9_LP(dev_priv)) + wopcm_size -= BXT_GUC_WOPCM_RC6_RESERVED; + + return wopcm_size; +} diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index aa9a7b55be6e..418450b1ae27 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -26,6 +26,7 @@ #define _INTEL_GUC_H_ #include "intel_uncore.h" +#include "intel_guc_fw.h" #include "intel_guc_fwif.h" #include "intel_guc_ct.h" #include "intel_guc_log.h" @@ -33,6 +34,11 @@ #include "i915_guc_reg.h" #include "i915_vma.h" +/* + * Top level structure of GuC. It handles firmware loading and manages client + * pool and doorbells. intel_guc owns an i915_guc_client to replace the legacy + * ExecList submission. + */ struct intel_guc { struct intel_uc_fw fw; struct intel_guc_log log; @@ -83,6 +89,12 @@ static inline void intel_guc_notify(struct intel_guc *guc) guc->notify(guc); } +/* + * GuC does not allow any gfx GGTT address that falls into range [0, WOPCM_TOP), + * which is reserved for Boot ROM, SRAM and WOPCM. 
Currently this top address is + 512K. In order to exclude 0-512K address space from GGTT, all gfx objects + * used by the GuC are pinned with PIN_OFFSET_BIAS along with the size of WOPCM. + */ static inline u32 guc_ggtt_offset(struct i915_vma *vma) { u32 offset = i915_ggtt_offset(vma); @@ -95,6 +107,7 @@ void intel_guc_init_early(struct intel_guc *guc); void intel_guc_init_send_regs(struct intel_guc *guc); +void intel_guc_init_params(struct intel_guc *guc); int intel_guc_send_nop(struct intel_guc *guc, const u32 *action, u32 len); int intel_guc_send_mmio(struct intel_guc *guc, const u32 *action, u32 len); int intel_guc_sample_forcewake(struct intel_guc *guc); @@ -102,9 +115,6 @@ int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset); int intel_guc_suspend(struct drm_i915_private *dev_priv); int intel_guc_resume(struct drm_i915_private *dev_priv); struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size); - -int intel_guc_select_fw(struct intel_guc *guc); -int intel_guc_init_hw(struct intel_guc *guc); u32 intel_guc_wopcm_size(struct drm_i915_private *dev_priv); #endif diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_fw.c index c7a800a3798d..ef67a36354c5 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_fw.c @@ -26,31 +26,9 @@ * Dave Gordon <david.s.gordon@intel.com> * Alex Dai <yu.dai@intel.com> */ -#include "i915_drv.h" -#include "intel_uc.h" -/** - * DOC: GuC-specific firmware loader - * - * intel_guc: - * Top level structure of guc. It handles firmware loading and manages client - * pool and doorbells. intel_guc owns a i915_guc_client to replace the legacy - * ExecList submission. - * - * Firmware versioning: - * The firmware build process will generate a version header file with major and - * minor version defined. The versions are built into CSS header of firmware. - * i915 kernel driver set the minimal firmware version required per platform. - * The firmware installation package will install (symbolic link) proper version - * of firmware. - * - * GuC address space: - * GuC does not allow any gfx GGTT address that falls into range [0, WOPCM_TOP), - * which is reserved for Boot ROM, SRAM and WOPCM. Currently this top address is - * 512K. In order to exclude 0-512K address space from GGTT, all gfx objects - * used by GuC is pinned with PIN_OFFSET_BIAS along with size of WOPCM. - * - */ +#include "intel_guc_fw.h" +#include "i915_drv.h" #define SKL_FW_MAJOR 6 #define SKL_FW_MINOR 1 @@ -78,88 +56,45 @@ MODULE_FIRMWARE(I915_KBL_GUC_UCODE); #define I915_GLK_GUC_UCODE GUC_FW_PATH(glk, GLK_FW_MAJOR, GLK_FW_MINOR) - -static u32 get_gttype(struct drm_i915_private *dev_priv) -{ - /* XXX: GT type based on PCI device ID? field seems unused by fw */ - return 0; -} - -static u32 get_core_family(struct drm_i915_private *dev_priv) -{ - u32 gen = INTEL_GEN(dev_priv); - - switch (gen) { - case 9: - return GUC_CORE_FAMILY_GEN9; - - default: - MISSING_CASE(gen); - return GUC_CORE_FAMILY_UNKNOWN; - } -} - -/* - * Initialise the GuC parameter block before starting the firmware - * transfer. These parameters are read by the firmware on startup - * and cannot be changed thereafter. 
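/*
 * The guc_ggtt_offset() contract above is established at pin time:
 * every object the GuC must address is pinned with an offset bias so
 * it lands above WOPCM_TOP in the GGTT. This is the pinning call the
 * firmware-transfer paths in this diff rely on:
 */
	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
				       PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
	if (IS_ERR(vma))
		return PTR_ERR(vma);
	/* from here on, i915_ggtt_offset(vma) >= GUC_WOPCM_TOP holds */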
+/** + * intel_guc_fw_select() - selects GuC firmware for uploading + * + * @guc: intel_guc struct + * + * Return: zero when the firmware is known, non-zero otherwise */ -static void guc_params_init(struct drm_i915_private *dev_priv) +int intel_guc_fw_select(struct intel_guc *guc) { - struct intel_guc *guc = &dev_priv->guc; - u32 params[GUC_CTL_MAX_DWORDS]; - int i; - - memset(&params, 0, sizeof(params)); - - params[GUC_CTL_DEVICE_INFO] |= - (get_gttype(dev_priv) << GUC_CTL_GTTYPE_SHIFT) | - (get_core_family(dev_priv) << GUC_CTL_COREFAMILY_SHIFT); - - /* - * GuC ARAT increment is 10 ns. GuC default scheduler quantum is one - * second. This ARAR is calculated by: - * Scheduler-Quantum-in-ns / ARAT-increment-in-ns = 1000000000 / 10 - */ - params[GUC_CTL_ARAT_HIGH] = 0; - params[GUC_CTL_ARAT_LOW] = 100000000; - - params[GUC_CTL_WA] |= GUC_CTL_WA_UK_BY_DRIVER; - - params[GUC_CTL_FEATURE] |= GUC_CTL_DISABLE_SCHEDULER | - GUC_CTL_VCS2_ENABLED; - - params[GUC_CTL_LOG_PARAMS] = guc->log.flags; - - if (i915_modparams.guc_log_level >= 0) { - params[GUC_CTL_DEBUG] = - i915_modparams.guc_log_level << GUC_LOG_VERBOSITY_SHIFT; - } else - params[GUC_CTL_DEBUG] = GUC_LOG_DISABLED; - - /* If GuC submission is enabled, set up additional parameters here */ - if (i915_modparams.enable_guc_submission) { - u32 ads = guc_ggtt_offset(guc->ads_vma) >> PAGE_SHIFT; - u32 pgs = guc_ggtt_offset(dev_priv->guc.stage_desc_pool); - u32 ctx_in_16 = GUC_MAX_STAGE_DESCRIPTORS / 16; - - params[GUC_CTL_DEBUG] |= ads << GUC_ADS_ADDR_SHIFT; - params[GUC_CTL_DEBUG] |= GUC_ADS_ENABLED; - - pgs >>= PAGE_SHIFT; - params[GUC_CTL_CTXINFO] = (pgs << GUC_CTL_BASE_ADDR_SHIFT) | - (ctx_in_16 << GUC_CTL_CTXNUM_IN16_SHIFT); + struct drm_i915_private *dev_priv = guc_to_i915(guc); - params[GUC_CTL_FEATURE] |= GUC_CTL_KERNEL_SUBMISSIONS; + intel_uc_fw_init(&guc->fw, INTEL_UC_FW_TYPE_GUC); - /* Unmask this bit to enable the GuC's internal scheduler */ - params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER; + if (i915_modparams.guc_firmware_path) { + guc->fw.path = i915_modparams.guc_firmware_path; + guc->fw.major_ver_wanted = 0; + guc->fw.minor_ver_wanted = 0; + } else if (IS_SKYLAKE(dev_priv)) { + guc->fw.path = I915_SKL_GUC_UCODE; + guc->fw.major_ver_wanted = SKL_FW_MAJOR; + guc->fw.minor_ver_wanted = SKL_FW_MINOR; + } else if (IS_BROXTON(dev_priv)) { + guc->fw.path = I915_BXT_GUC_UCODE; + guc->fw.major_ver_wanted = BXT_FW_MAJOR; + guc->fw.minor_ver_wanted = BXT_FW_MINOR; + } else if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { + guc->fw.path = I915_KBL_GUC_UCODE; + guc->fw.major_ver_wanted = KBL_FW_MAJOR; + guc->fw.minor_ver_wanted = KBL_FW_MINOR; + } else if (IS_GEMINILAKE(dev_priv)) { + guc->fw.path = I915_GLK_GUC_UCODE; + guc->fw.major_ver_wanted = GLK_FW_MAJOR; + guc->fw.minor_ver_wanted = GLK_FW_MINOR; + } else { + DRM_ERROR("No GuC firmware known for platform with GuC!\n"); + return -ENOENT; } - I915_WRITE(SOFT_SCRATCH(0), 0); - - for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) - I915_WRITE(SOFT_SCRATCH(1 + i), params[i]); + return 0; } /* @@ -250,38 +185,16 @@ static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv, return ret; } -u32 intel_guc_wopcm_size(struct drm_i915_private *dev_priv) -{ - u32 wopcm_size = GUC_WOPCM_TOP; - - /* On BXT, the top of WOPCM is reserved for RC6 context */ - if (IS_GEN9_LP(dev_priv)) - wopcm_size -= BXT_GUC_WOPCM_RC6_RESERVED; - - return wopcm_size; -} - /* * Load the GuC firmware blob into the MinuteIA. 
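/*
 * Note how intel_guc_fw_select() encodes "accept any version" by
 * zeroing major/minor_ver_wanted when a firmware path is forced via
 * i915.guc_firmware_path. The common fetch code is then expected to
 * skip the version match, along these lines (a sketch, not the exact
 * intel_uc_fw_fetch() text):
 */
	if (uc_fw->major_ver_wanted == 0 && uc_fw->minor_ver_wanted == 0) {
		/* user-forced path: any blob version is accepted */
	} else if (uc_fw->major_ver_found != uc_fw->major_ver_wanted ||
		   uc_fw->minor_ver_found < uc_fw->minor_ver_wanted) {
		/* reject: blob does not meet the per-platform minimum */
	}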
*/ -static int guc_ucode_xfer(struct drm_i915_private *dev_priv) +static int guc_ucode_xfer(struct intel_uc_fw *guc_fw, struct i915_vma *vma) { - struct intel_uc_fw *guc_fw = &dev_priv->guc.fw; - struct i915_vma *vma; + struct intel_guc *guc = container_of(guc_fw, struct intel_guc, fw); + struct drm_i915_private *dev_priv = guc_to_i915(guc); int ret; - ret = i915_gem_object_set_to_gtt_domain(guc_fw->obj, false); - if (ret) { - DRM_DEBUG_DRIVER("set-domain failed %d\n", ret); - return ret; - } - - vma = i915_gem_object_ggtt_pin(guc_fw->obj, NULL, 0, 0, - PIN_OFFSET_BIAS | GUC_WOPCM_TOP); - if (IS_ERR(vma)) { - DRM_DEBUG_DRIVER("pin failed %d\n", (int)PTR_ERR(vma)); - return PTR_ERR(vma); - } + GEM_BUG_ON(guc_fw->type != INTEL_UC_FW_TYPE_GUC); intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); @@ -312,23 +225,15 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) I915_WRITE(GUC_ARAT_C6DIS, 0x1FF); } - guc_params_init(dev_priv); - ret = guc_ucode_xfer_dma(dev_priv, vma); intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); - /* - * We keep the object pages for reuse during resume. But we can unpin it - * now that DMA has completed, so it doesn't continue to take up space. - */ - i915_vma_unpin(vma); - return ret; } /** - * intel_guc_init_hw() - finish preparing the GuC for activity + * intel_guc_fw_upload() - finish preparing the GuC for activity * @guc: intel_guc structure * * Called during driver loading and also after a GPU reset. @@ -340,78 +245,7 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) * * Return: non-zero code on error */ -int intel_guc_init_hw(struct intel_guc *guc) -{ - struct drm_i915_private *dev_priv = guc_to_i915(guc); - const char *fw_path = guc->fw.path; - int ret; - - DRM_DEBUG_DRIVER("GuC fw status: path %s, fetch %s, load %s\n", - fw_path, - intel_uc_fw_status_repr(guc->fw.fetch_status), - intel_uc_fw_status_repr(guc->fw.load_status)); - - if (guc->fw.fetch_status != INTEL_UC_FIRMWARE_SUCCESS) - return -EIO; - - guc->fw.load_status = INTEL_UC_FIRMWARE_PENDING; - - DRM_DEBUG_DRIVER("GuC fw status: fetch %s, load %s\n", - intel_uc_fw_status_repr(guc->fw.fetch_status), - intel_uc_fw_status_repr(guc->fw.load_status)); - - ret = guc_ucode_xfer(dev_priv); - - if (ret) - return -EAGAIN; - - guc->fw.load_status = INTEL_UC_FIRMWARE_SUCCESS; - - DRM_INFO("GuC %s (firmware %s [version %u.%u])\n", - i915_modparams.enable_guc_submission ? 
"submission enabled" : - "loaded", - guc->fw.path, - guc->fw.major_ver_found, guc->fw.minor_ver_found); - - return 0; -} - -/** - * intel_guc_select_fw() - selects GuC firmware for loading - * @guc: intel_guc struct - * - * Return: zero when we know firmware, non-zero in other case - */ -int intel_guc_select_fw(struct intel_guc *guc) +int intel_guc_fw_upload(struct intel_guc *guc) { - struct drm_i915_private *dev_priv = guc_to_i915(guc); - - intel_uc_fw_init(&guc->fw, INTEL_UC_FW_TYPE_GUC); - - if (i915_modparams.guc_firmware_path) { - guc->fw.path = i915_modparams.guc_firmware_path; - guc->fw.major_ver_wanted = 0; - guc->fw.minor_ver_wanted = 0; - } else if (IS_SKYLAKE(dev_priv)) { - guc->fw.path = I915_SKL_GUC_UCODE; - guc->fw.major_ver_wanted = SKL_FW_MAJOR; - guc->fw.minor_ver_wanted = SKL_FW_MINOR; - } else if (IS_BROXTON(dev_priv)) { - guc->fw.path = I915_BXT_GUC_UCODE; - guc->fw.major_ver_wanted = BXT_FW_MAJOR; - guc->fw.minor_ver_wanted = BXT_FW_MINOR; - } else if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { - guc->fw.path = I915_KBL_GUC_UCODE; - guc->fw.major_ver_wanted = KBL_FW_MAJOR; - guc->fw.minor_ver_wanted = KBL_FW_MINOR; - } else if (IS_GEMINILAKE(dev_priv)) { - guc->fw.path = I915_GLK_GUC_UCODE; - guc->fw.major_ver_wanted = GLK_FW_MAJOR; - guc->fw.minor_ver_wanted = GLK_FW_MINOR; - } else { - DRM_ERROR("No GuC firmware known for platform with GuC!\n"); - return -ENOENT; - } - - return 0; + return intel_uc_fw_upload(&guc->fw, guc_ucode_xfer); } diff --git a/drivers/gpu/drm/i915/intel_guc_fw.h b/drivers/gpu/drm/i915/intel_guc_fw.h new file mode 100644 index 000000000000..023f5baa9dd6 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_guc_fw.h @@ -0,0 +1,33 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#ifndef _INTEL_GUC_FW_H_ +#define _INTEL_GUC_FW_H_ + +struct intel_guc; + +int intel_guc_fw_select(struct intel_guc *guc); +int intel_guc_fw_upload(struct intel_guc *guc); + +#endif diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h index 1c0a2a3de121..80c507435458 100644 --- a/drivers/gpu/drm/i915/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/intel_guc_fwif.h @@ -82,8 +82,8 @@ #define GUC_CTL_ARAT_LOW 2 #define GUC_CTL_DEVICE_INFO 3 -#define GUC_CTL_GTTYPE_SHIFT 0 -#define GUC_CTL_COREFAMILY_SHIFT 7 +#define GUC_CTL_GT_TYPE_SHIFT 0 +#define GUC_CTL_CORE_FAMILY_SHIFT 7 #define GUC_CTL_LOG_PARAMS 4 #define GUC_LOG_VALID (1 << 0) diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index e6f8f30ce7bd..5132dc814788 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -70,7 +70,7 @@ static struct intel_hdmi *intel_attached_hdmi(struct drm_connector *connector) return enc_to_intel_hdmi(&intel_attached_encoder(connector)->base); } -static u32 g4x_infoframe_index(enum hdmi_infoframe_type type) +static u32 g4x_infoframe_index(unsigned int type) { switch (type) { case HDMI_INFOFRAME_TYPE_AVI: @@ -85,7 +85,7 @@ static u32 g4x_infoframe_index(enum hdmi_infoframe_type type) } } -static u32 g4x_infoframe_enable(enum hdmi_infoframe_type type) +static u32 g4x_infoframe_enable(unsigned int type) { switch (type) { case HDMI_INFOFRAME_TYPE_AVI: @@ -100,9 +100,11 @@ static u32 g4x_infoframe_enable(enum hdmi_infoframe_type type) } } -static u32 hsw_infoframe_enable(enum hdmi_infoframe_type type) +static u32 hsw_infoframe_enable(unsigned int type) { switch (type) { + case DP_SDP_VSC: + return VIDEO_DIP_ENABLE_VSC_HSW; case HDMI_INFOFRAME_TYPE_AVI: return VIDEO_DIP_ENABLE_AVI_HSW; case HDMI_INFOFRAME_TYPE_SPD: @@ -118,10 +120,12 @@ static u32 hsw_infoframe_enable(enum hdmi_infoframe_type type) static i915_reg_t hsw_dip_data_reg(struct drm_i915_private *dev_priv, enum transcoder cpu_transcoder, - enum hdmi_infoframe_type type, + unsigned int type, int i) { switch (type) { + case DP_SDP_VSC: + return HSW_TVIDEO_DIP_VSC_DATA(cpu_transcoder, i); case HDMI_INFOFRAME_TYPE_AVI: return HSW_TVIDEO_DIP_AVI_DATA(cpu_transcoder, i); case HDMI_INFOFRAME_TYPE_SPD: @@ -136,7 +140,7 @@ hsw_dip_data_reg(struct drm_i915_private *dev_priv, static void g4x_write_infoframe(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len) { const uint32_t *data = frame; @@ -191,7 +195,7 @@ static bool g4x_infoframe_enabled(struct drm_encoder *encoder, static void ibx_write_infoframe(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len) { const uint32_t *data = frame; @@ -251,7 +255,7 @@ static bool ibx_infoframe_enabled(struct drm_encoder *encoder, static void cpt_write_infoframe(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len) { const uint32_t *data = frame; @@ -309,7 +313,7 @@ static bool cpt_infoframe_enabled(struct drm_encoder *encoder, static void vlv_write_infoframe(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len) { const uint32_t *data = frame; @@ -368,7 +372,7 @@ static bool vlv_infoframe_enabled(struct 
drm_encoder *encoder, static void hsw_write_infoframe(struct drm_encoder *encoder, const struct intel_crtc_state *crtc_state, - enum hdmi_infoframe_type type, + unsigned int type, const void *frame, ssize_t len) { const uint32_t *data = frame; @@ -377,6 +381,8 @@ static void hsw_write_infoframe(struct drm_encoder *encoder, enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; i915_reg_t ctl_reg = HSW_TVIDEO_DIP_CTL(cpu_transcoder); i915_reg_t data_reg; + int data_size = type == DP_SDP_VSC ? + VIDEO_DIP_VSC_DATA_SIZE : VIDEO_DIP_DATA_SIZE; int i; u32 val = I915_READ(ctl_reg); @@ -392,7 +398,7 @@ static void hsw_write_infoframe(struct drm_encoder *encoder, data++; } /* Write every possible data byte to force correct ECC calculation. */ - for (; i < VIDEO_DIP_DATA_SIZE; i += 4) + for (; i < data_size; i += 4) I915_WRITE(hsw_dip_data_reg(dev_priv, cpu_transcoder, type, i >> 2), 0); mmiowb(); diff --git a/drivers/gpu/drm/i915/intel_huc.c b/drivers/gpu/drm/i915/intel_huc.c index 4b4cf56d29ad..c8a48cbc2b7d 100644 --- a/drivers/gpu/drm/i915/intel_huc.c +++ b/drivers/gpu/drm/i915/intel_huc.c @@ -78,6 +78,42 @@ MODULE_FIRMWARE(I915_KBL_HUC_UCODE); GLK_HUC_FW_MINOR, GLK_BLD_NUM) /** + * intel_huc_select_fw() - selects HuC firmware for loading + * @huc: intel_huc struct + */ +void intel_huc_select_fw(struct intel_huc *huc) +{ + struct drm_i915_private *dev_priv = huc_to_i915(huc); + + intel_uc_fw_init(&huc->fw, INTEL_UC_FW_TYPE_HUC); + + if (i915_modparams.huc_firmware_path) { + huc->fw.path = i915_modparams.huc_firmware_path; + huc->fw.major_ver_wanted = 0; + huc->fw.minor_ver_wanted = 0; + } else if (IS_SKYLAKE(dev_priv)) { + huc->fw.path = I915_SKL_HUC_UCODE; + huc->fw.major_ver_wanted = SKL_HUC_FW_MAJOR; + huc->fw.minor_ver_wanted = SKL_HUC_FW_MINOR; + } else if (IS_BROXTON(dev_priv)) { + huc->fw.path = I915_BXT_HUC_UCODE; + huc->fw.major_ver_wanted = BXT_HUC_FW_MAJOR; + huc->fw.minor_ver_wanted = BXT_HUC_FW_MINOR; + } else if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { + huc->fw.path = I915_KBL_HUC_UCODE; + huc->fw.major_ver_wanted = KBL_HUC_FW_MAJOR; + huc->fw.minor_ver_wanted = KBL_HUC_FW_MINOR; + } else if (IS_GEMINILAKE(dev_priv)) { + huc->fw.path = I915_GLK_HUC_UCODE; + huc->fw.major_ver_wanted = GLK_HUC_FW_MAJOR; + huc->fw.minor_ver_wanted = GLK_HUC_FW_MINOR; + } else { + DRM_ERROR("No HuC firmware known for platform with HuC!\n"); + return; + } +} + +/** * huc_ucode_xfer() - DMA's the firmware * @dev_priv: the drm_i915_private device * @@ -85,26 +121,15 @@ MODULE_FIRMWARE(I915_KBL_HUC_UCODE); * * Return: 0 on success, non-zero on failure */ -static int huc_ucode_xfer(struct drm_i915_private *dev_priv) +static int huc_ucode_xfer(struct intel_uc_fw *huc_fw, struct i915_vma *vma) { - struct intel_uc_fw *huc_fw = &dev_priv->huc.fw; - struct i915_vma *vma; + struct intel_huc *huc = container_of(huc_fw, struct intel_huc, fw); + struct drm_i915_private *dev_priv = huc_to_i915(huc); unsigned long offset = 0; u32 size; int ret; - ret = i915_gem_object_set_to_gtt_domain(huc_fw->obj, false); - if (ret) { - DRM_DEBUG_DRIVER("set-domain failed %d\n", ret); - return ret; - } - - vma = i915_gem_object_ggtt_pin(huc_fw->obj, NULL, 0, 0, - PIN_OFFSET_BIAS | GUC_WOPCM_TOP); - if (IS_ERR(vma)) { - DRM_DEBUG_DRIVER("pin failed %d\n", (int)PTR_ERR(vma)); - return PTR_ERR(vma); - } + GEM_BUG_ON(huc_fw->type != INTEL_UC_FW_TYPE_HUC); intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); @@ -135,52 +160,10 @@ static int huc_ucode_xfer(struct drm_i915_private *dev_priv) 
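/*
 * huc_ucode_xfer() (like its GuC counterpart) now takes the generic
 * (uc_fw, vma) pair and recovers its device with container_of(), so
 * intel_uc_fw_upload() can own the object pinning and load-status
 * bookkeeping that the two paths previously duplicated:
 */
	struct intel_huc *huc = container_of(huc_fw, struct intel_huc, fw);
	struct drm_i915_private *dev_priv = huc_to_i915(huc);

	/* and the per-device entry point shrinks to a single call: */
	intel_uc_fw_upload(&huc->fw, huc_ucode_xfer);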
intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); - /* - * We keep the object pages for reuse during resume. But we can unpin it - * now that DMA has completed, so it doesn't continue to take up space. - */ - i915_vma_unpin(vma); - return ret; } /** - * intel_huc_select_fw() - selects HuC firmware for loading - * @huc: intel_huc struct - */ -void intel_huc_select_fw(struct intel_huc *huc) -{ - struct drm_i915_private *dev_priv = huc_to_i915(huc); - - intel_uc_fw_init(&huc->fw, INTEL_UC_FW_TYPE_HUC); - - if (i915_modparams.huc_firmware_path) { - huc->fw.path = i915_modparams.huc_firmware_path; - huc->fw.major_ver_wanted = 0; - huc->fw.minor_ver_wanted = 0; - } else if (IS_SKYLAKE(dev_priv)) { - huc->fw.path = I915_SKL_HUC_UCODE; - huc->fw.major_ver_wanted = SKL_HUC_FW_MAJOR; - huc->fw.minor_ver_wanted = SKL_HUC_FW_MINOR; - } else if (IS_BROXTON(dev_priv)) { - huc->fw.path = I915_BXT_HUC_UCODE; - huc->fw.major_ver_wanted = BXT_HUC_FW_MAJOR; - huc->fw.minor_ver_wanted = BXT_HUC_FW_MINOR; - } else if (IS_KABYLAKE(dev_priv) || IS_COFFEELAKE(dev_priv)) { - huc->fw.path = I915_KBL_HUC_UCODE; - huc->fw.major_ver_wanted = KBL_HUC_FW_MAJOR; - huc->fw.minor_ver_wanted = KBL_HUC_FW_MINOR; - } else if (IS_GEMINILAKE(dev_priv)) { - huc->fw.path = I915_GLK_HUC_UCODE; - huc->fw.major_ver_wanted = GLK_HUC_FW_MAJOR; - huc->fw.minor_ver_wanted = GLK_HUC_FW_MINOR; - } else { - DRM_ERROR("No HuC firmware known for platform with HuC!\n"); - return; - } -} - -/** * intel_huc_init_hw() - load HuC uCode to device * @huc: intel_huc structure * @@ -194,33 +177,7 @@ void intel_huc_select_fw(struct intel_huc *huc) */ void intel_huc_init_hw(struct intel_huc *huc) { - struct drm_i915_private *dev_priv = huc_to_i915(huc); - int err; - - DRM_DEBUG_DRIVER("%s fw status: fetch %s, load %s\n", - huc->fw.path, - intel_uc_fw_status_repr(huc->fw.fetch_status), - intel_uc_fw_status_repr(huc->fw.load_status)); - - if (huc->fw.fetch_status != INTEL_UC_FIRMWARE_SUCCESS) - return; - - huc->fw.load_status = INTEL_UC_FIRMWARE_PENDING; - - err = huc_ucode_xfer(dev_priv); - - huc->fw.load_status = err ? - INTEL_UC_FIRMWARE_FAIL : INTEL_UC_FIRMWARE_SUCCESS; - - DRM_DEBUG_DRIVER("%s fw status: fetch %s, load %s\n", - huc->fw.path, - intel_uc_fw_status_repr(huc->fw.fetch_status), - intel_uc_fw_status_repr(huc->fw.load_status)); - - if (huc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS) - DRM_ERROR("Failed to complete HuC uCode load with ret %d\n", err); - - return; + intel_uc_fw_upload(&huc->fw, huc_ucode_xfer); } /** diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index fbfcf88d7fe3..d36e25607435 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -575,7 +575,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * the state of the GPU is known (idle). 
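/*
 * Preemption bookkeeping in the execlists path now uses a dedicated
 * bit instead of the old bool: set when the preempt context is
 * injected, cleared by the CSB handler once preemption completes.
 * This lets intel_engine_is_idle() test all submission sources with
 * one read:
 */
	inject_preempt_context(engine);
	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);

	/* ... in the CSB irq handler, after the preempt event ... */
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_PREEMPT);

	/* ... and in intel_engine_is_idle() ... */
	if (READ_ONCE(engine->execlists.active))
		return false; /* user or preempt submission in flight */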
*/ inject_preempt_context(engine); - execlists->preempt = true; + execlists_set_active(execlists, + EXECLISTS_ACTIVE_PREEMPT); goto unlock; } else { /* @@ -683,8 +684,10 @@ done: unlock: spin_unlock_irq(&engine->timeline->lock); - if (submit) + if (submit) { + execlists_set_active(execlists, EXECLISTS_ACTIVE_USER); execlists_submit_ports(engine); + } } static void @@ -696,6 +699,7 @@ execlist_cancel_port_requests(struct intel_engine_execlists *execlists) while (num_ports-- && port_isset(port)) { struct drm_i915_gem_request *rq = port_request(port); + GEM_BUG_ON(!execlists->active); execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_PREEMPTED); i915_gem_request_put(rq); @@ -730,7 +734,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) { INIT_LIST_HEAD(&rq->priotree.link); - rq->priotree.priority = INT_MAX; dma_fence_set_error(&rq->fence, -EIO); __i915_gem_request_submit(rq); @@ -793,7 +796,6 @@ static void intel_lrc_irq_handler(unsigned long data) &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; unsigned int head, tail; - /* However GVT emulation depends upon intercepting CSB mmio */ if (unlikely(execlists->csb_use_mmio)) { buf = (u32 * __force) (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0))); @@ -862,15 +864,21 @@ static void intel_lrc_irq_handler(unsigned long data) unwind_incomplete_requests(engine); spin_unlock_irq(&engine->timeline->lock); - GEM_BUG_ON(!execlists->preempt); - execlists->preempt = false; + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)); + execlists_clear_active(execlists, + EXECLISTS_ACTIVE_PREEMPT); continue; } if (status & GEN8_CTX_STATUS_PREEMPTED && - execlists->preempt) + execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)) continue; + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_USER)); + /* Check the context/desc id for this event matches */ GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); @@ -882,7 +890,6 @@ static void intel_lrc_irq_handler(unsigned long data) execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); trace_i915_gem_request_out(rq); - rq->priotree.priority = INT_MAX; i915_gem_request_put(rq); execlists_port_complete(execlists, port); @@ -893,6 +900,9 @@ static void intel_lrc_irq_handler(unsigned long data) /* After the final element, the hw should be idle */ GEM_BUG_ON(port_count(port) == 0 && !(status & GEN8_CTX_STATUS_ACTIVE_IDLE)); + if (port_count(port) == 0) + execlists_clear_active(execlists, + EXECLISTS_ACTIVE_USER); } if (head != execlists->csb_head) { @@ -902,7 +912,7 @@ static void intel_lrc_irq_handler(unsigned long data) } } - if (!execlists->preempt) + if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)) execlists_dequeue(engine); intel_uncore_forcewake_put(dev_priv, execlists->fw_domains); @@ -1094,6 +1104,7 @@ execlists_context_pin(struct intel_engine_cs *engine, i915_ggtt_offset(ce->ring->vma); ce->state->obj->mm.dirty = true; + ce->state->obj->pin_global++; i915_gem_context_get(ctx); out: @@ -1121,6 +1132,7 @@ static void execlists_context_unpin(struct intel_engine_cs *engine, intel_ring_unpin(ce->ring); + ce->state->obj->pin_global--; i915_gem_object_unpin_map(ce->state->obj); i915_vma_unpin(ce->state); @@ -1459,7 +1471,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift); clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); execlists->csb_head = -1; - 
execlists->preempt = false; + execlists->active = 0; /* After a GPU reset, we may have requests to replay */ if (!i915_modparams.enable_guc_submission && execlists->first) diff --git a/drivers/gpu/drm/i915/intel_lspcon.c b/drivers/gpu/drm/i915/intel_lspcon.c index beb9baaf2f2e..dcbc786479f9 100644 --- a/drivers/gpu/drm/i915/intel_lspcon.c +++ b/drivers/gpu/drm/i915/intel_lspcon.c @@ -56,7 +56,7 @@ static enum drm_lspcon_mode lspcon_get_current_mode(struct intel_lspcon *lspcon) struct i2c_adapter *adapter = &lspcon_to_intel_dp(lspcon)->aux.ddc; if (drm_lspcon_get_mode(adapter, ¤t_mode)) { - DRM_ERROR("Error reading LSPCON mode\n"); + DRM_DEBUG_KMS("Error reading LSPCON mode\n"); return DRM_LSPCON_MODE_INVALID; } return current_mode; @@ -68,16 +68,15 @@ static enum drm_lspcon_mode lspcon_wait_mode(struct intel_lspcon *lspcon, enum drm_lspcon_mode current_mode; current_mode = lspcon_get_current_mode(lspcon); - if (current_mode == mode || current_mode == DRM_LSPCON_MODE_INVALID) + if (current_mode == mode) goto out; DRM_DEBUG_KMS("Waiting for LSPCON mode %s to settle\n", lspcon_mode_name(mode)); - wait_for((current_mode = lspcon_get_current_mode(lspcon)) == mode || - current_mode == DRM_LSPCON_MODE_INVALID, 100); + wait_for((current_mode = lspcon_get_current_mode(lspcon)) == mode, 100); if (current_mode != mode) - DRM_DEBUG_KMS("LSPCON mode hasn't settled\n"); + DRM_ERROR("LSPCON mode hasn't settled\n"); out: DRM_DEBUG_KMS("Current LSPCON mode %s\n", @@ -133,6 +132,7 @@ static bool lspcon_wake_native_aux_ch(struct intel_lspcon *lspcon) static bool lspcon_probe(struct intel_lspcon *lspcon) { + int retry; enum drm_dp_dual_mode_type adaptor_type; struct i2c_adapter *adapter = &lspcon_to_intel_dp(lspcon)->aux.ddc; enum drm_lspcon_mode expected_mode; @@ -141,10 +141,18 @@ static bool lspcon_probe(struct intel_lspcon *lspcon) DRM_LSPCON_MODE_PCON : DRM_LSPCON_MODE_LS; /* Lets probe the adaptor and check its type */ - adaptor_type = drm_dp_dual_mode_detect(adapter); + for (retry = 0; retry < 6; retry++) { + if (retry) + usleep_range(500, 1000); + + adaptor_type = drm_dp_dual_mode_detect(adapter); + if (adaptor_type == DRM_DP_DUAL_MODE_LSPCON) + break; + } + if (adaptor_type != DRM_DP_DUAL_MODE_LSPCON) { DRM_DEBUG_KMS("No LSPCON detected, found %s\n", - drm_dp_get_dual_mode_type_name(adaptor_type)); + drm_dp_get_dual_mode_type_name(adaptor_type)); return false; } diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 2fcff9788b6f..aa12a44e9a76 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -3133,7 +3133,11 @@ static int ilk_compute_intermediate_wm(struct drm_device *dev, struct intel_crtc_state *newstate) { struct intel_pipe_wm *a = &newstate->wm.ilk.intermediate; - struct intel_pipe_wm *b = &intel_crtc->wm.active.ilk; + struct intel_atomic_state *intel_state = + to_intel_atomic_state(newstate->base.state); + const struct intel_crtc_state *oldstate = + intel_atomic_get_old_crtc_state(intel_state, intel_crtc); + const struct intel_pipe_wm *b = &oldstate->wm.ilk.optimal; int level, max_level = ilk_wm_max_level(to_i915(dev)); /* @@ -3142,6 +3146,9 @@ static int ilk_compute_intermediate_wm(struct drm_device *dev, * and after the vblank. 
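/*
 * The intermediate watermarks are now merged against the old optimal
 * state fetched from the atomic transaction instead of mutable data
 * stashed in the crtc. The merge takes the safe side of each field;
 * roughly (the per-level max/AND details are assumed, not shown in
 * this hunk):
 */
	*a = newstate->wm.ilk.optimal;
	a->pipe_enabled |= b->pipe_enabled;	/* b: old optimal wm */
	a->sprites_enabled |= b->sprites_enabled;
	for (level = 0; level <= max_level; level++) {
		a->wm[level].enable &= b->wm[level].enable;
		a->wm[level].pri_val = max(a->wm[level].pri_val,
					   b->wm[level].pri_val);
	}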
*/ *a = newstate->wm.ilk.optimal; + if (!newstate->base.active || drm_atomic_crtc_needs_modeset(&newstate->base)) + return 0; + a->pipe_enabled |= b->pipe_enabled; a->sprites_enabled |= b->sprites_enabled; a->sprites_scaled |= b->sprites_scaled; @@ -5755,12 +5762,30 @@ void vlv_wm_sanitize(struct drm_i915_private *dev_priv) mutex_unlock(&dev_priv->wm.wm_mutex); } +/* + * FIXME should probably kill this and improve + * the real watermark readout/sanitation instead + */ +static void ilk_init_lp_watermarks(struct drm_i915_private *dev_priv) +{ + I915_WRITE(WM3_LP_ILK, I915_READ(WM3_LP_ILK) & ~WM1_LP_SR_EN); + I915_WRITE(WM2_LP_ILK, I915_READ(WM2_LP_ILK) & ~WM1_LP_SR_EN); + I915_WRITE(WM1_LP_ILK, I915_READ(WM1_LP_ILK) & ~WM1_LP_SR_EN); + + /* + * Don't touch WM1S_LP_EN here. + * Doing so could cause underruns. + */ +} + void ilk_wm_get_hw_state(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); struct ilk_wm_values *hw = &dev_priv->wm.hw; struct drm_crtc *crtc; + ilk_init_lp_watermarks(dev_priv); + for_each_crtc(dev, crtc) ilk_pipe_wm_get_hw_state(crtc); @@ -6591,7 +6616,7 @@ static void gen9_enable_rc6(struct drm_i915_private *dev_priv) { struct intel_engine_cs *engine; enum intel_engine_id id; - uint32_t rc6_mask = 0; + u32 rc6_mode, rc6_mask = 0; /* 1a: Software RC state - RC0 */ I915_WRITE(GEN6_RC_STATE, 0); @@ -6629,8 +6654,15 @@ static void gen9_enable_rc6(struct drm_i915_private *dev_priv) rc6_mask = GEN6_RC_CTL_RC6_ENABLE; DRM_INFO("RC6 %s\n", onoff(rc6_mask & GEN6_RC_CTL_RC6_ENABLE)); I915_WRITE(GEN6_RC6_THRESHOLD, 37500); /* 37.5/125ms per EI */ + + /* WaRsUseTimeoutMode:cnl (pre-prod) */ + if (IS_CNL_REVID(dev_priv, CNL_REVID_A0, CNL_REVID_C0)) + rc6_mode = GEN7_RC_CTL_TO_MODE; + else + rc6_mode = GEN6_RC_CTL_EI_MODE(1); + I915_WRITE(GEN6_RC_CONTROL, - GEN6_RC_CTL_HW_ENABLE | GEN6_RC_CTL_EI_MODE(1) | rc6_mask); + GEN6_RC_CTL_HW_ENABLE | rc6_mode | rc6_mask); /* * 3b: Enable Coarse Power Gating only when RC6 is enabled. @@ -8200,18 +8232,6 @@ static void g4x_disable_trickle_feed(struct drm_i915_private *dev_priv) } } -static void ilk_init_lp_watermarks(struct drm_i915_private *dev_priv) -{ - I915_WRITE(WM3_LP_ILK, I915_READ(WM3_LP_ILK) & ~WM1_LP_SR_EN); - I915_WRITE(WM2_LP_ILK, I915_READ(WM2_LP_ILK) & ~WM1_LP_SR_EN); - I915_WRITE(WM1_LP_ILK, I915_READ(WM1_LP_ILK) & ~WM1_LP_SR_EN); - - /* - * Don't touch WM1S_LP_EN here. - * Doing so could cause underruns. - */ -} - static void ilk_init_clock_gating(struct drm_i915_private *dev_priv) { uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE; @@ -8245,8 +8265,6 @@ static void ilk_init_clock_gating(struct drm_i915_private *dev_priv) (I915_READ(DISP_ARB_CTL) | DISP_FBC_WM_DIS)); - ilk_init_lp_watermarks(dev_priv); - /* * Based on the document from hardware guys the following bits * should be set unconditionally in order to enable FBC. 
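/*
 * Both GEN8_L3SQCREG1 updates in this series (the helper here and the
 * bxt workaround earlier) replace a blind register write with a
 * read-modify-write, so only the priority-credit field changes and
 * the remaining bits keep their hardware defaults:
 */
	val = I915_READ(GEN8_L3SQCREG1);
	val &= ~L3_PRIO_CREDITS_MASK;		/* clear just our field */
	val |= L3_GENERAL_PRIO_CREDITS(general_prio_credits);
	val |= L3_HIGH_PRIO_CREDITS(high_prio_credits);
	I915_WRITE(GEN8_L3SQCREG1, val);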
@@ -8359,8 +8377,6 @@ static void gen6_init_clock_gating(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_GT_MODE, _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4)); - ilk_init_lp_watermarks(dev_priv); - I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB)); @@ -8477,14 +8493,17 @@ static void gen8_set_l3sqc_credits(struct drm_i915_private *dev_priv, int high_prio_credits) { u32 misccpctl; + u32 val; /* WaTempDisableDOPClkGating:bdw */ misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); - I915_WRITE(GEN8_L3SQCREG1, - L3_GENERAL_PRIO_CREDITS(general_prio_credits) | - L3_HIGH_PRIO_CREDITS(high_prio_credits)); + val = I915_READ(GEN8_L3SQCREG1); + val &= ~L3_PRIO_CREDITS_MASK; + val |= L3_GENERAL_PRIO_CREDITS(general_prio_credits); + val |= L3_HIGH_PRIO_CREDITS(high_prio_credits); + I915_WRITE(GEN8_L3SQCREG1, val); /* * Wait at least 100 clocks before re-enabling clock gating. @@ -8584,8 +8603,6 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv) I915_GTT_PAGE_SIZE_2M); enum pipe pipe; - ilk_init_lp_watermarks(dev_priv); - /* WaSwitchSolVfFArbitrationPriority:bdw */ I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL); @@ -8636,8 +8653,6 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv) static void hsw_init_clock_gating(struct drm_i915_private *dev_priv) { - ilk_init_lp_watermarks(dev_priv); - /* L3 caching of data atomics doesn't work -- disable it. */ I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); I915_WRITE(HSW_ROW_CHICKEN3, @@ -8681,10 +8696,6 @@ static void hsw_init_clock_gating(struct drm_i915_private *dev_priv) /* WaSwitchSolVfFArbitrationPriority:hsw */ I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL); - /* WaRsPkgCStateDisplayPMReq:hsw */ - I915_WRITE(CHICKEN_PAR1_1, - I915_READ(CHICKEN_PAR1_1) | FORCE_ARB_IDLE_PLANES); - lpt_init_clock_gating(dev_priv); } @@ -8692,8 +8703,6 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv) { uint32_t snpcr; - ilk_init_lp_watermarks(dev_priv); - I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE); /* WaDisableEarlyCull:ivb */ diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c index 5419cda83ba8..6e3b430fccdc 100644 --- a/drivers/gpu/drm/i915/intel_psr.c +++ b/drivers/gpu/drm/i915/intel_psr.c @@ -58,6 +58,9 @@ static bool is_edp_psr(struct intel_dp *intel_dp) { + if (!intel_dp_is_edp(intel_dp)) + return false; + return intel_dp->psr_dpcd[0] & DP_PSR_IS_SUPPORTED; } @@ -72,37 +75,6 @@ static bool vlv_is_psr_active_on_pipe(struct drm_device *dev, int pipe) (val == VLV_EDP_PSR_ACTIVE_SF_UPDATE); } -static void intel_psr_write_vsc(struct intel_dp *intel_dp, - const struct edp_vsc_psr *vsc_psr) -{ - struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp); - struct drm_device *dev = dig_port->base.base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct intel_crtc *crtc = to_intel_crtc(dig_port->base.base.crtc); - enum transcoder cpu_transcoder = crtc->config->cpu_transcoder; - i915_reg_t ctl_reg = HSW_TVIDEO_DIP_CTL(cpu_transcoder); - uint32_t *data = (uint32_t *) vsc_psr; - unsigned int i; - - /* As per BSPec (Pipe Video Data Island Packet), we need to disable - the video DIP being updated before program video DIP data buffer - registers for DIP being updated. 
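/*
 * The helper being removed below hard-coded the rule its comment
 * spells out: disable the DIP, rewrite the full data buffer (padding
 * the unused dwords so the ECC is recomputed), then re-enable it. The
 * shared hsw_write_infoframe(), with its type widened to also accept
 * DP SDP identifiers, preserves that ordering, which is why PSR can
 * now emit its VSC SDP through the common vfunc:
 */
	intel_dig_port->write_infoframe(&intel_dig_port->base.base,
					crtc_state, DP_SDP_VSC,
					&psr_vsc, sizeof(psr_vsc));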
*/ - I915_WRITE(ctl_reg, 0); - POSTING_READ(ctl_reg); - - for (i = 0; i < sizeof(*vsc_psr); i += 4) { - I915_WRITE(HSW_TVIDEO_DIP_VSC_DATA(cpu_transcoder, - i >> 2), *data); - data++; - } - for (; i < VIDEO_DIP_VSC_DATA_SIZE; i += 4) - I915_WRITE(HSW_TVIDEO_DIP_VSC_DATA(cpu_transcoder, - i >> 2), 0); - - I915_WRITE(ctl_reg, VIDEO_DIP_ENABLE_VSC_HSW); - POSTING_READ(ctl_reg); -} - static void vlv_psr_setup_vsc(struct intel_dp *intel_dp, const struct intel_crtc_state *crtc_state) { @@ -149,7 +121,8 @@ static void hsw_psr_setup_vsc(struct intel_dp *intel_dp, psr_vsc.sdp_header.HB3 = 0x8; } - intel_psr_write_vsc(intel_dp, &psr_vsc); + intel_dig_port->write_infoframe(&intel_dig_port->base.base, crtc_state, + DP_SDP_VSC, &psr_vsc, sizeof(psr_vsc)); } static void vlv_psr_enable_sink(struct intel_dp *intel_dp) @@ -376,22 +349,25 @@ static void hsw_psr_activate(struct intel_dp *intel_dp) hsw_activate_psr1(intel_dp); } -static bool intel_psr_match_conditions(struct intel_dp *intel_dp) +void intel_psr_compute_config(struct intel_dp *intel_dp, + struct intel_crtc_state *crtc_state) { struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp); - struct drm_device *dev = dig_port->base.base.dev; - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_crtc *crtc = dig_port->base.base.crtc; - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); + struct drm_i915_private *dev_priv = to_i915(dig_port->base.base.dev); const struct drm_display_mode *adjusted_mode = - &intel_crtc->config->base.adjusted_mode; + &crtc_state->base.adjusted_mode; int psr_setup_time; - lockdep_assert_held(&dev_priv->psr.lock); - WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); - WARN_ON(!drm_modeset_is_locked(&crtc->mutex)); + if (!HAS_PSR(dev_priv)) + return; + + if (!is_edp_psr(intel_dp)) + return; - dev_priv->psr.source_ok = false; + if (!i915_modparams.enable_psr) { + DRM_DEBUG_KMS("PSR disable by flag\n"); + return; + } /* * HSW spec explicitly says PSR is tied to port A. @@ -402,66 +378,70 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp) */ if (HAS_DDI(dev_priv) && dig_port->port != PORT_A) { DRM_DEBUG_KMS("PSR condition failed: Port not supported\n"); - return false; - } - - if (!i915_modparams.enable_psr) { - DRM_DEBUG_KMS("PSR disable by flag\n"); - return false; + return; } if ((IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) && !dev_priv->psr.link_standby) { DRM_ERROR("PSR condition failed: Link off requested but not supported on this platform\n"); - return false; + return; } if (IS_HASWELL(dev_priv) && - I915_READ(HSW_STEREO_3D_CTL(intel_crtc->config->cpu_transcoder)) & + I915_READ(HSW_STEREO_3D_CTL(crtc_state->cpu_transcoder)) & S3D_ENABLE) { DRM_DEBUG_KMS("PSR condition failed: Stereo 3D is Enabled\n"); - return false; + return; } if (IS_HASWELL(dev_priv) && adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) { DRM_DEBUG_KMS("PSR condition failed: Interlaced is Enabled\n"); - return false; + return; } psr_setup_time = drm_dp_psr_setup_time(intel_dp->psr_dpcd); if (psr_setup_time < 0) { DRM_DEBUG_KMS("PSR condition failed: Invalid PSR setup time (0x%02x)\n", intel_dp->psr_dpcd[1]); - return false; + return; } if (intel_usecs_to_scanlines(adjusted_mode, psr_setup_time) > adjusted_mode->crtc_vtotal - adjusted_mode->crtc_vdisplay - 1) { DRM_DEBUG_KMS("PSR condition failed: PSR setup time (%d us) too long\n", psr_setup_time); - return false; + return; + } + + /* + * FIXME psr2_support is messed up. 
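/*
 * PSR eligibility moves into the atomic check phase: the old
 * match_conditions tests now run in intel_psr_compute_config() and are
 * recorded in the crtc state, which the commit-time hooks merely
 * consume. Schematically:
 */
	/* check phase, from intel_dp_compute_config() */
	intel_psr_compute_config(intel_dp, pipe_config);

	/* commit phase, in intel_psr_enable() */
	if (!crtc_state->has_psr)
		return;
	dev_priv->psr.psr2_support = crtc_state->has_psr2;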
It's both computed + * dynamically during PSR enable, and extracted from sink + * caps during eDP detection. + */ + if (!dev_priv->psr.psr2_support) { + crtc_state->has_psr = true; + return; } /* PSR2 is restricted to work with panel resolutions upto 3200x2000 */ - if (dev_priv->psr.psr2_support && - (intel_crtc->config->pipe_src_w > 3200 || - intel_crtc->config->pipe_src_h > 2000)) { - dev_priv->psr.psr2_support = false; - return false; + if (adjusted_mode->crtc_hdisplay > 3200 || + adjusted_mode->crtc_vdisplay > 2000) { + DRM_DEBUG_KMS("PSR2 disabled, panel resolution too big\n"); + return; } /* * FIXME:enable psr2 only for y-cordinate psr2 panels * After gtc implementation , remove this restriction. */ - if (!dev_priv->psr.y_cord_support && dev_priv->psr.psr2_support) { + if (!dev_priv->psr.y_cord_support) { DRM_DEBUG_KMS("PSR2 disabled, panel does not support Y coordinate\n"); - return false; + return; } - dev_priv->psr.source_ok = true; - return true; + crtc_state->has_psr = true; + crtc_state->has_psr2 = true; } static void intel_psr_activate(struct intel_dp *intel_dp) @@ -531,13 +511,8 @@ void intel_psr_enable(struct intel_dp *intel_dp, struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = to_i915(dev); - if (!HAS_PSR(dev_priv)) - return; - - if (!is_edp_psr(intel_dp)) { - DRM_DEBUG_KMS("PSR not supported by this panel\n"); + if (!crtc_state->has_psr) return; - } WARN_ON(dev_priv->drrs.dp); mutex_lock(&dev_priv->psr.lock); @@ -546,8 +521,8 @@ void intel_psr_enable(struct intel_dp *intel_dp, goto unlock; } - if (!intel_psr_match_conditions(intel_dp)) - goto unlock; + dev_priv->psr.psr2_support = crtc_state->has_psr2; + dev_priv->psr.source_ok = true; dev_priv->psr.busy_frontbuffer_bits = 0; @@ -668,7 +643,7 @@ void intel_psr_disable(struct intel_dp *intel_dp, struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = to_i915(dev); - if (!HAS_PSR(dev_priv)) + if (!old_crtc_state->has_psr) return; mutex_lock(&dev_priv->psr.lock); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 4285f09ff8b8..8da1bde442dd 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -484,11 +484,6 @@ static bool stop_ring(struct intel_engine_cs *engine) I915_WRITE_HEAD(engine, 0); I915_WRITE_TAIL(engine, 0); - if (INTEL_GEN(dev_priv) > 2) { - (void)I915_READ_CTL(engine); - I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING)); - } - return (I915_READ_HEAD(engine) & HEAD_ADDR) == 0; } @@ -570,6 +565,9 @@ static int init_ring_common(struct intel_engine_cs *engine) intel_engine_init_hangcheck(engine); + if (INTEL_GEN(dev_priv) > 2) + I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING)); + out: intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); @@ -1246,6 +1244,8 @@ int intel_ring_pin(struct intel_ring *ring, if (IS_ERR(addr)) goto err; + vma->obj->pin_global++; + ring->vaddr = addr; return 0; @@ -1277,6 +1277,7 @@ void intel_ring_unpin(struct intel_ring *ring) i915_gem_object_unpin_map(ring->vma->obj); ring->vaddr = NULL; + ring->vma->obj->pin_global--; i915_vma_unpin(ring->vma); } @@ -1441,6 +1442,7 @@ intel_ring_context_pin(struct intel_engine_cs *engine, goto err; ce->state->obj->mm.dirty = true; + ce->state->obj->pin_global++; } /* The kernel context is only used as a placeholder for flushing the @@ -1475,8 +1477,10 @@ static void intel_ring_context_unpin(struct intel_engine_cs *engine, if (--ce->pin_count) return; - if (ce->state) + 
if (ce->state) { + ce->state->obj->pin_global--; i915_vma_unpin(ce->state); + } i915_gem_context_put(ctx); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 17186f067408..6a42ed618a28 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -241,9 +241,17 @@ struct intel_engine_execlists { } port[EXECLIST_MAX_PORTS]; /** - * @preempt: are we currently handling a preempting context switch? + * @active: is the HW active? We consider the HW as active after + * submitting any context for execution and until we have seen the + * last context completion event. After that, we do not expect any + * more events until we submit, and so can park the HW. + * + * As we have a small number of different sources from which we feed + * the HW, we track the state of each inside a single bitfield. */ - bool preempt; + unsigned int active; +#define EXECLISTS_ACTIVE_USER 0 +#define EXECLISTS_ACTIVE_PREEMPT 1 /** * @port_mask: number of execlist ports - 1 @@ -525,6 +533,27 @@ struct intel_engine_cs { u32 (*get_cmd_length_mask)(u32 cmd_header); }; +static inline void +execlists_set_active(struct intel_engine_execlists *execlists, + unsigned int bit) +{ + __set_bit(bit, (unsigned long *)&execlists->active); +} + +static inline void +execlists_clear_active(struct intel_engine_execlists *execlists, + unsigned int bit) +{ + __clear_bit(bit, (unsigned long *)&execlists->active); +} + +static inline bool +execlists_is_active(const struct intel_engine_execlists *execlists, + unsigned int bit) +{ + return test_bit(bit, (unsigned long *)&execlists->active); +} + static inline unsigned int execlists_num_ports(const struct intel_engine_execlists * const execlists) { @@ -538,6 +567,7 @@ execlists_port_complete(struct intel_engine_execlists * const execlists, const unsigned int m = execlists->port_mask; GEM_BUG_ON(port_index(port, execlists) != 0); + GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER)); memmove(port, port + 1, m * sizeof(struct execlist_port)); memset(port + m, 0, sizeof(struct execlist_port)); diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c index 86fc9b529f2d..4fcf80ca91dd 100644 --- a/drivers/gpu/drm/i915/intel_sprite.c +++ b/drivers/gpu/drm/i915/intel_sprite.c @@ -230,7 +230,7 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) #endif } -static void +void skl_update_plane(struct intel_plane *plane, const struct intel_crtc_state *crtc_state, const struct intel_plane_state *plane_state) @@ -311,7 +311,7 @@ skl_update_plane(struct intel_plane *plane, spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); } -static void +void skl_disable_plane(struct intel_plane *plane, struct intel_crtc *crtc) { struct drm_i915_private *dev_priv = to_i915(plane->base.dev); diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c index 7b938e822fde..25bd162f38d2 100644 --- a/drivers/gpu/drm/i915/intel_uc.c +++ b/drivers/gpu/drm/i915/intel_uc.c @@ -68,7 +68,7 @@ void intel_uc_sanitize_options(struct drm_i915_private *dev_priv) if (HAS_HUC_UCODE(dev_priv)) intel_huc_select_fw(&dev_priv->huc); - if (intel_guc_select_fw(&dev_priv->guc)) + if (intel_guc_fw_select(&dev_priv->guc)) i915_modparams.enable_guc_loading = 0; } @@ -195,7 +195,8 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) goto err_submission; intel_huc_init_hw(&dev_priv->huc); - ret = intel_guc_init_hw(&dev_priv->guc); + intel_guc_init_params(guc); + ret = 
intel_guc_fw_upload(guc); if (ret == 0 || ret != -EAGAIN) break; @@ -221,6 +222,12 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv) goto err_interrupts; } + dev_info(dev_priv->drm.dev, "GuC %s (firmware %s [version %u.%u])\n", + i915_modparams.enable_guc_submission ? "submission enabled" : + "loaded", + guc->fw.path, + guc->fw.major_ver_found, guc->fw.minor_ver_found); + return 0; /* @@ -243,12 +250,14 @@ err_submission: err_guc: i915_ggtt_disable_guc(dev_priv); - DRM_ERROR("GuC init failed\n"); if (i915_modparams.enable_guc_loading > 1 || - i915_modparams.enable_guc_submission > 1) + i915_modparams.enable_guc_submission > 1) { + DRM_ERROR("GuC init failed. Firmware loading disabled.\n"); ret = -EIO; - else + } else { + DRM_NOTE("GuC init failed. Firmware loading disabled.\n"); ret = 0; + } if (i915_modparams.enable_guc_submission) { i915_modparams.enable_guc_submission = 0; @@ -256,7 +265,6 @@ err_guc: } i915_modparams.enable_guc_loading = 0; - DRM_NOTE("GuC firmware loading disabled\n"); return ret; } diff --git a/drivers/gpu/drm/i915/intel_uc_fw.c b/drivers/gpu/drm/i915/intel_uc_fw.c index 766b1cbdfbd7..973888e94cba 100644 --- a/drivers/gpu/drm/i915/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/intel_uc_fw.c @@ -23,6 +23,7 @@ */ #include <linux/firmware.h> +#include <drm/drm_print.h> #include "intel_uc_fw.h" #include "i915_drv.h" @@ -45,26 +46,33 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, size_t size; int err; + DRM_DEBUG_DRIVER("%s fw fetch %s\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); + if (!uc_fw->path) return; uc_fw->fetch_status = INTEL_UC_FIRMWARE_PENDING; - - DRM_DEBUG_DRIVER("before requesting firmware: uC fw fetch status %s\n", + DRM_DEBUG_DRIVER("%s fw fetch %s\n", + intel_uc_fw_type_repr(uc_fw->type), intel_uc_fw_status_repr(uc_fw->fetch_status)); err = request_firmware(&fw, uc_fw->path, &pdev->dev); - if (err) - goto fail; - if (!fw) + if (err) { + DRM_DEBUG_DRIVER("%s fw request_firmware err=%d\n", + intel_uc_fw_type_repr(uc_fw->type), err); goto fail; + } - DRM_DEBUG_DRIVER("fetch uC fw from %s succeeded, fw %p\n", - uc_fw->path, fw); + DRM_DEBUG_DRIVER("%s fw size %zu ptr %p\n", + intel_uc_fw_type_repr(uc_fw->type), fw->size, fw); /* Check the size of the blob before examining buffer contents */ if (fw->size < sizeof(struct uc_css_header)) { - DRM_NOTE("Firmware header is missing\n"); + DRM_WARN("%s: Unexpected firmware size (%zu, min %zu)\n", + intel_uc_fw_type_repr(uc_fw->type), + fw->size, sizeof(struct uc_css_header)); + err = -ENODATA; goto fail; } @@ -77,7 +85,9 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, sizeof(u32); if (uc_fw->header_size != sizeof(struct uc_css_header)) { - DRM_NOTE("CSS header definition mismatch\n"); + DRM_WARN("%s: Mismatched firmware header definition\n", + intel_uc_fw_type_repr(uc_fw->type)); + err = -ENOEXEC; goto fail; } @@ -85,9 +95,20 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, uc_fw->ucode_offset = uc_fw->header_offset + uc_fw->header_size; uc_fw->ucode_size = (css->size_dw - css->header_size_dw) * sizeof(u32); + /* Header and uCode will be loaded to WOPCM */ + size = uc_fw->header_size + uc_fw->ucode_size; + if (size > intel_guc_wopcm_size(dev_priv)) { + DRM_WARN("%s: Firmware is too large to fit in WOPCM\n", + intel_uc_fw_type_repr(uc_fw->type)); + err = -E2BIG; + goto fail; + } + /* now RSA */ if (css->key_size_dw != UOS_RSA_SCRATCH_MAX_COUNT) { - DRM_NOTE("RSA key size is bad\n"); + DRM_WARN("%s: Mismatched firmware RSA key size (%u)\n", + 
intel_uc_fw_type_repr(uc_fw->type), css->key_size_dw); + err = -ENOEXEC; goto fail; } uc_fw->rsa_offset = uc_fw->ucode_offset + uc_fw->ucode_size; @@ -96,7 +117,9 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, /* At least, it should have header, uCode and RSA. Size of all three. */ size = uc_fw->header_size + uc_fw->ucode_size + uc_fw->rsa_size; if (fw->size < size) { - DRM_NOTE("Missing firmware components\n"); + DRM_WARN("%s: Truncated firmware (%zu, expected %zu)\n", + intel_uc_fw_type_repr(uc_fw->type), fw->size, size); + err = -ENOEXEC; goto fail; } @@ -108,14 +131,6 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, */ switch (uc_fw->type) { case INTEL_UC_FW_TYPE_GUC: - /* Header and uCode will be loaded to WOPCM. Size of the two. */ - size = uc_fw->header_size + uc_fw->ucode_size; - - /* Top 32k of WOPCM is reserved (8K stack + 24k RC6 context). */ - if (size > intel_guc_wopcm_size(dev_priv)) { - DRM_ERROR("Firmware is too large to fit in WOPCM\n"); - goto fail; - } uc_fw->major_ver_found = css->guc.sw_version >> 16; uc_fw->minor_ver_found = css->guc.sw_version & 0xFFFF; break; @@ -126,17 +141,21 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, break; default: - DRM_ERROR("Unknown firmware type %d\n", uc_fw->type); - err = -ENOEXEC; - goto fail; + MISSING_CASE(uc_fw->type); + break; } + DRM_DEBUG_DRIVER("%s fw version %u.%u (wanted %u.%u)\n", + intel_uc_fw_type_repr(uc_fw->type), + uc_fw->major_ver_found, uc_fw->minor_ver_found, + uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted); + if (uc_fw->major_ver_wanted == 0 && uc_fw->minor_ver_wanted == 0) { - DRM_NOTE("Skipping %s firmware version check\n", + DRM_NOTE("%s: Skipping firmware version check\n", intel_uc_fw_type_repr(uc_fw->type)); } else if (uc_fw->major_ver_found != uc_fw->major_ver_wanted || uc_fw->minor_ver_found < uc_fw->minor_ver_wanted) { - DRM_NOTE("%s firmware version %d.%d, required %d.%d\n", + DRM_NOTE("%s: Wrong firmware version (%u.%u, required %u.%u)\n", intel_uc_fw_type_repr(uc_fw->type), uc_fw->major_ver_found, uc_fw->minor_ver_found, uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted); @@ -144,34 +163,115 @@ void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, goto fail; } - DRM_DEBUG_DRIVER("firmware version %d.%d OK (minimum %d.%d)\n", - uc_fw->major_ver_found, uc_fw->minor_ver_found, - uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted); - obj = i915_gem_object_create_from_data(dev_priv, fw->data, fw->size); if (IS_ERR(obj)) { err = PTR_ERR(obj); + DRM_DEBUG_DRIVER("%s fw object_create err=%d\n", + intel_uc_fw_type_repr(uc_fw->type), err); goto fail; } uc_fw->obj = obj; uc_fw->size = fw->size; - - DRM_DEBUG_DRIVER("uC fw fetch status SUCCESS, obj %p\n", - uc_fw->obj); + uc_fw->fetch_status = INTEL_UC_FIRMWARE_SUCCESS; + DRM_DEBUG_DRIVER("%s fw fetch %s\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uc_fw_status_repr(uc_fw->fetch_status)); release_firmware(fw); - uc_fw->fetch_status = INTEL_UC_FIRMWARE_SUCCESS; return; fail: - DRM_WARN("Failed to fetch valid uC firmware from %s (error %d)\n", - uc_fw->path, err); - DRM_DEBUG_DRIVER("uC fw fetch status FAIL; err %d, fw %p, obj %p\n", - err, fw, uc_fw->obj); + uc_fw->fetch_status = INTEL_UC_FIRMWARE_FAIL; + DRM_DEBUG_DRIVER("%s fw fetch %s\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uc_fw_status_repr(uc_fw->fetch_status)); + + DRM_WARN("%s: Failed to fetch firmware %s (error %d)\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + DRM_INFO("%s: Firmware can be downloaded from %s\n", + 
intel_uc_fw_type_repr(uc_fw->type), INTEL_UC_FIRMWARE_URL); release_firmware(fw); /* OK even if fw is NULL */ - uc_fw->fetch_status = INTEL_UC_FIRMWARE_FAIL; +} + +/** + * intel_uc_fw_upload - load uC firmware using custom loader + * + * @uc_fw: uC firmware + * @loader: custom uC firmware loader function + * + * Loads uC firmware using custom loader and updates internal flags. + */ +int intel_uc_fw_upload(struct intel_uc_fw *uc_fw, + int (*xfer)(struct intel_uc_fw *uc_fw, + struct i915_vma *vma)) +{ + struct i915_vma *vma; + int err; + + DRM_DEBUG_DRIVER("%s fw load %s\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); + + if (uc_fw->fetch_status != INTEL_UC_FIRMWARE_SUCCESS) + return -EIO; + + uc_fw->load_status = INTEL_UC_FIRMWARE_PENDING; + DRM_DEBUG_DRIVER("%s fw load %s\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uc_fw_status_repr(uc_fw->load_status)); + + /* Pin object with firmware */ + err = i915_gem_object_set_to_gtt_domain(uc_fw->obj, false); + if (err) { + DRM_DEBUG_DRIVER("%s fw set-domain err=%d\n", + intel_uc_fw_type_repr(uc_fw->type), err); + goto fail; + } + + vma = i915_gem_object_ggtt_pin(uc_fw->obj, NULL, 0, 0, + PIN_OFFSET_BIAS | GUC_WOPCM_TOP); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + DRM_DEBUG_DRIVER("%s fw ggtt-pin err=%d\n", + intel_uc_fw_type_repr(uc_fw->type), err); + goto fail; + } + + /* Call custom loader */ + err = xfer(uc_fw, vma); + + /* + * We keep the object pages for reuse during resume. But we can unpin it + * now that DMA has completed, so it doesn't continue to take up space. + */ + i915_vma_unpin(vma); + + if (err) + goto fail; + + uc_fw->load_status = INTEL_UC_FIRMWARE_SUCCESS; + DRM_DEBUG_DRIVER("%s fw load %s\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uc_fw_status_repr(uc_fw->load_status)); + + DRM_INFO("%s: Loaded firmware %s (version %u.%u)\n", + intel_uc_fw_type_repr(uc_fw->type), + uc_fw->path, + uc_fw->major_ver_found, uc_fw->minor_ver_found); + + return 0; + +fail: + uc_fw->load_status = INTEL_UC_FIRMWARE_FAIL; + DRM_DEBUG_DRIVER("%s fw load %s\n", + intel_uc_fw_type_repr(uc_fw->type), + intel_uc_fw_status_repr(uc_fw->load_status)); + + DRM_WARN("%s: Failed to load firmware %s (error %d)\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + + return err; } /** @@ -191,3 +291,28 @@ void intel_uc_fw_fini(struct intel_uc_fw *uc_fw) uc_fw->fetch_status = INTEL_UC_FIRMWARE_NONE; } + +/** + * intel_uc_fw_dump - dump information about uC firmware + * @uc_fw: uC firmware + * @p: the &drm_printer + * + * Pretty printer for uC firmware. 
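/*
 * Expected caller shape for intel_uc_fw_upload() above: the caller owns
 * the hardware-specific DMA and hands it in as the xfer callback. The
 * callback body here is a placeholder sketch, not the real GuC/HuC
 * transfer implementation.
 */
static int example_xfer(struct intel_uc_fw *uc_fw, struct i915_vma *vma)
{
	/* program the DMA engine using i915_ggtt_offset(vma), then wait */
	return 0;
}

static int example_fw_load(struct intel_uc_fw *uc_fw)
{
	return intel_uc_fw_upload(uc_fw, example_xfer);
}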
+ */ +void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p) +{ + drm_printf(p, "%s firmware: %s\n", + intel_uc_fw_type_repr(uc_fw->type), uc_fw->path); + drm_printf(p, "\tstatus: fetch %s, load %s\n", + intel_uc_fw_status_repr(uc_fw->fetch_status), + intel_uc_fw_status_repr(uc_fw->load_status)); + drm_printf(p, "\tversion: wanted %u.%u, found %u.%u\n", + uc_fw->major_ver_wanted, uc_fw->minor_ver_wanted, + uc_fw->major_ver_found, uc_fw->minor_ver_found); + drm_printf(p, "\theader: offset %u, size %u\n", + uc_fw->header_offset, uc_fw->header_size); + drm_printf(p, "\tuCode: offset %u, size %u\n", + uc_fw->ucode_offset, uc_fw->ucode_size); + drm_printf(p, "\tRSA: offset %u, size %u\n", + uc_fw->rsa_offset, uc_fw->rsa_size); +} diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h b/drivers/gpu/drm/i915/intel_uc_fw.h index c3e9af4b9bf0..132903669391 100644 --- a/drivers/gpu/drm/i915/intel_uc_fw.h +++ b/drivers/gpu/drm/i915/intel_uc_fw.h @@ -25,7 +25,12 @@ #ifndef _INTEL_UC_FW_H_ #define _INTEL_UC_FW_H_ +struct drm_printer; struct drm_i915_private; +struct i915_vma; + +/* Home of GuC, HuC and DMC firmwares */ +#define INTEL_UC_FIRMWARE_URL "https://01.org/linuxgraphics/downloads/firmware" enum intel_uc_fw_status { INTEL_UC_FIRMWARE_FAIL = -1, @@ -50,6 +55,11 @@ struct intel_uc_fw { enum intel_uc_fw_status fetch_status; enum intel_uc_fw_status load_status; + /* + * The firmware build process will generate a version header file with major and + * minor version defined. The versions are built into CSS header of firmware. + * i915 kernel driver set the minimal firmware version required per platform. + */ u16 major_ver_wanted; u16 minor_ver_wanted; u16 major_ver_found; @@ -102,6 +112,10 @@ void intel_uc_fw_init(struct intel_uc_fw *uc_fw, enum intel_uc_fw_type type) void intel_uc_fw_fetch(struct drm_i915_private *dev_priv, struct intel_uc_fw *uc_fw); +int intel_uc_fw_upload(struct intel_uc_fw *uc_fw, + int (*xfer)(struct intel_uc_fw *uc_fw, + struct i915_vma *vma)); void intel_uc_fw_fini(struct intel_uc_fw *uc_fw); +void intel_uc_fw_dump(struct intel_uc_fw *uc_fw, struct drm_printer *p); #endif diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 983617b5b338..20e3c65c0999 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1403,6 +1403,9 @@ static void i915_stop_engines(struct drm_i915_private *dev_priv, struct intel_engine_cs *engine; enum intel_engine_id id; + if (INTEL_GEN(dev_priv) < 3) + return; + for_each_engine_masked(engine, dev_priv, engine_mask, id) gen3_stop_engine(engine); } @@ -1742,16 +1745,12 @@ static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv) int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask) { - reset_func reset; + reset_func reset = intel_get_gpu_reset(dev_priv); int retry; int ret; might_sleep(); - reset = intel_get_gpu_reset(dev_priv); - if (reset == NULL) - return -ENODEV; - /* If the power well sleeps during the reset, the reset * request may be dropped and never completes (causing -EIO). 
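/*
 * Condensed shape of the retry loop after this refactor (the retry bound
 * shown is illustrative; see the hunk below for the actual change):
 * engines are stopped on every attempt even when no reset backend exists,
 * and a NULL reset function now yields -ENODEV from inside the loop
 * instead of via an early return.
 */
for (retry = 0; retry < 3; retry++) {
	i915_stop_engines(dev_priv, engine_mask);

	ret = -ENODEV;
	if (reset)
		ret = reset(dev_priv, engine_mask);
	if (ret != -ETIMEDOUT)
		break;

	cond_resched();
}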
*/ @@ -1771,7 +1770,9 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask) */ i915_stop_engines(dev_priv, engine_mask); - ret = reset(dev_priv, engine_mask); + ret = -ENODEV; + if (reset) + ret = reset(dev_priv, engine_mask); if (ret != -ETIMEDOUT) break; diff --git a/drivers/gpu/drm/i915/intel_vbt_defs.h b/drivers/gpu/drm/i915/intel_vbt_defs.h index 404569c9fdfc..f225c288a121 100644 --- a/drivers/gpu/drm/i915/intel_vbt_defs.h +++ b/drivers/gpu/drm/i915/intel_vbt_defs.h @@ -306,6 +306,14 @@ struct bdb_general_features { #define LEGACY_CHILD_DEVICE_CONFIG_SIZE 33 +/* DDC Bus DDI Type 155+ */ +enum vbt_gmbus_ddi { + DDC_BUS_DDI_B = 0x1, + DDC_BUS_DDI_C, + DDC_BUS_DDI_D, + DDC_BUS_DDI_F, +}; + /* * The child device config, aka the display device data structure, provides a * description of a port and its configuration on the platform. diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c index c53f8474113a..5cc8101bb2b1 100644 --- a/drivers/gpu/drm/i915/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/selftests/huge_pages.c @@ -609,7 +609,7 @@ static int igt_mock_ppgtt_huge_fill(void *arg) bool single = false; LIST_HEAD(objects); IGT_TIMEOUT(end_time); - int err; + int err = -ENODEV; for_each_prime_number_from(page_num, 1, max_pages) { struct drm_i915_gem_object *obj; @@ -1157,7 +1157,7 @@ static int igt_ppgtt_exhaust_huge(void *arg) unsigned int size_mask; unsigned int page_mask; int n, i; - int err; + int err = -ENODEV; /* * Sanity check creating objects with a varying mix of page sizes -- diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c index fb0a58fc8348..def5052862ae 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c @@ -417,7 +417,7 @@ static int fake_aliasing_ppgtt_enable(struct drm_i915_private *i915) if (err) return err; - list_for_each_entry(obj, &i915->mm.bound_list, global_link) { + list_for_each_entry(obj, &i915->mm.bound_list, mm.link) { struct i915_vma *vma; vma = i915_vma_instance(obj, &i915->ggtt.base, NULL); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c index 5ea373221f49..f463105ff48d 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c @@ -24,6 +24,9 @@ #include "../i915_selftest.h" +#include "lib_sw_fence.h" +#include "mock_context.h" +#include "mock_drm.h" #include "mock_gem_device.h" static int populate_ggtt(struct drm_i915_private *i915) @@ -47,7 +50,7 @@ static int populate_ggtt(struct drm_i915_private *i915) if (!list_empty(&i915->mm.unbound_list)) { size = 0; - list_for_each_entry(obj, &i915->mm.unbound_list, global_link) + list_for_each_entry(obj, &i915->mm.unbound_list, mm.link) size++; pr_err("Found %lld objects unbound!\n", size); @@ -74,10 +77,10 @@ static void cleanup_objects(struct drm_i915_private *i915) { struct drm_i915_gem_object *obj, *on; - list_for_each_entry_safe(obj, on, &i915->mm.unbound_list, global_link) + list_for_each_entry_safe(obj, on, &i915->mm.unbound_list, mm.link) i915_gem_object_put(obj); - list_for_each_entry_safe(obj, on, &i915->mm.bound_list, global_link) + list_for_each_entry_safe(obj, on, &i915->mm.bound_list, mm.link) i915_gem_object_put(obj); mutex_unlock(&i915->drm.struct_mutex); @@ -149,8 +152,6 @@ static int igt_overcommit(void *arg) goto cleanup; } - list_move(&obj->global_link, 
&i915->mm.unbound_list); - vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); if (!IS_ERR(vma) || PTR_ERR(vma) != -ENOSPC) { pr_err("Failed to evict+insert, i915_gem_object_ggtt_pin returned err=%d\n", (int)PTR_ERR(vma)); @@ -325,6 +326,148 @@ cleanup: return err; } +static int igt_evict_contexts(void *arg) +{ + const u64 PRETEND_GGTT_SIZE = 16ull << 20; + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct reserved { + struct drm_mm_node node; + struct reserved *next; + } *reserved = NULL; + struct drm_mm_node hole; + unsigned long count; + int err; + + /* + * The purpose of this test is to verify that we will trigger an + * eviction in the GGTT when constructing a request that requires + * additional space in the GGTT for pinning the context. This space + * is not directly tied to the request so reclaiming it requires + * extra work. + * + * As such this test is only meaningful for full-ppgtt environments + * where the GTT space of the request is separate from the GGTT + * allocation required to build the request. + */ + if (!USES_FULL_PPGTT(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + /* Reserve a block so that we know we have enough to fit a few rq */ + memset(&hole, 0, sizeof(hole)); + err = i915_gem_gtt_insert(&i915->ggtt.base, &hole, + PRETEND_GGTT_SIZE, 0, I915_COLOR_UNEVICTABLE, + 0, i915->ggtt.base.total, + PIN_NOEVICT); + if (err) + goto out_locked; + + /* Make the GGTT appear small by filling it with unevictable nodes */ + count = 0; + do { + struct reserved *r; + + r = kcalloc(1, sizeof(*r), GFP_KERNEL); + if (!r) { + err = -ENOMEM; + goto out_locked; + } + + if (i915_gem_gtt_insert(&i915->ggtt.base, &r->node, + 1ul << 20, 0, I915_COLOR_UNEVICTABLE, + 0, i915->ggtt.base.total, + PIN_NOEVICT)) { + kfree(r); + break; + } + + r->next = reserved; + reserved = r; + + count++; + } while (1); + drm_mm_remove_node(&hole); + mutex_unlock(&i915->drm.struct_mutex); + pr_info("Filled GGTT with %lu 1MiB nodes\n", count); + + /* Overfill the GGTT with context objects and so try to evict one. 
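/*
 * The holding pattern used below, in miniature: each new request's submit
 * fence is chained to a single on-stack fence, so no request executes
 * (and no context unpins) until onstack_fence_fini() commits the fence.
 * A hypothetical reduced form:
 */
static void example_hold_request(struct drm_i915_gem_request *rq)
{
	struct i915_sw_fence fence;

	onstack_fence_init(&fence);

	/* chain as many requests as needed before committing */
	if (i915_sw_fence_await_sw_fence_gfp(&rq->submit, &fence,
					     GFP_KERNEL) >= 0)
		i915_add_request(rq);

	onstack_fence_fini(&fence);	/* all held requests may now run */
}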
*/ + for_each_engine(engine, i915, id) { + struct i915_sw_fence fence; + struct drm_file *file; + + file = mock_file(i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + count = 0; + mutex_lock(&i915->drm.struct_mutex); + onstack_fence_init(&fence); + do { + struct drm_i915_gem_request *rq; + struct i915_gem_context *ctx; + + ctx = live_context(i915, file); + if (!ctx) + break; + + /* We will need some GGTT space for the rq's context */ + igt_evict_ctl.fail_if_busy = true; + rq = i915_gem_request_alloc(engine, ctx); + igt_evict_ctl.fail_if_busy = false; + + if (IS_ERR(rq)) { + /* When full, fail_if_busy will trigger EBUSY */ + if (PTR_ERR(rq) != -EBUSY) { + pr_err("Unexpected error from request alloc (ctx hw id %u, on %s): %d\n", + ctx->hw_id, engine->name, + (int)PTR_ERR(rq)); + err = PTR_ERR(rq); + } + break; + } + + /* Keep every request/ctx pinned until we are full */ + err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, + &fence, + GFP_KERNEL); + if (err < 0) + break; + + i915_add_request(rq); + count++; + err = 0; + } while(1); + mutex_unlock(&i915->drm.struct_mutex); + + onstack_fence_fini(&fence); + pr_info("Submitted %lu contexts/requests on %s\n", + count, engine->name); + + mock_file_free(i915, file); + if (err) + break; + } + + mutex_lock(&i915->drm.struct_mutex); +out_locked: + while (reserved) { + struct reserved *next = reserved->next; + + drm_mm_remove_node(&reserved->node); + kfree(reserved); + + reserved = next; + } + if (drm_mm_node_allocated(&hole)) + drm_mm_remove_node(&hole); + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} + int i915_gem_evict_mock_selftests(void) { static const struct i915_subtest tests[] = { @@ -348,3 +491,12 @@ int i915_gem_evict_mock_selftests(void) drm_dev_unref(&i915->drm); return err; } + +int i915_gem_evict_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_evict_contexts), + }; + + return i915_subtests(tests, i915); +} diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index 64acd7eccc5c..54a73534b37e 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -15,6 +15,7 @@ selftest(objects, i915_gem_object_live_selftests) selftest(dmabuf, i915_gem_dmabuf_live_selftests) selftest(coherency, i915_gem_coherency_live_selftests) selftest(gtt, i915_gem_gtt_live_selftests) +selftest(evict, i915_gem_evict_live_selftests) selftest(hugepages, i915_gem_huge_page_live_selftests) selftest(contexts, i915_gem_context_live_selftests) selftest(hangcheck, intel_hangcheck_live_selftests) diff --git a/drivers/gpu/drm/i915/selftests/i915_sw_fence.c b/drivers/gpu/drm/i915/selftests/i915_sw_fence.c index 19d145d6bf52..ea01d0fe3ace 100644 --- a/drivers/gpu/drm/i915/selftests/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/selftests/i915_sw_fence.c @@ -24,6 +24,7 @@ #include <linux/completion.h> #include <linux/delay.h> +#include <linux/prime_numbers.h> #include "../i915_selftest.h" @@ -565,6 +566,46 @@ err_in: return ret; } +static int test_timer(void *arg) +{ + unsigned long target, delay; + struct timed_fence tf; + + timed_fence_init(&tf, target = jiffies); + if (!i915_sw_fence_done(&tf.fence)) { + pr_err("Fence with immediate expiration not signaled\n"); + goto err; + } + timed_fence_fini(&tf); + + for_each_prime_number(delay, i915_selftest.timeout_jiffies/2) { + timed_fence_init(&tf, target = jiffies + delay); + if (i915_sw_fence_done(&tf.fence)) { + 
pr_err("Fence with future expiration (%lu jiffies) already signaled\n", delay); + goto err; + } + + i915_sw_fence_wait(&tf.fence); + if (!i915_sw_fence_done(&tf.fence)) { + pr_err("Fence not signaled after wait\n"); + goto err; + } + if (time_before(jiffies, target)) { + pr_err("Fence signaled too early, target=%lu, now=%lu\n", + target, jiffies); + goto err; + } + + timed_fence_fini(&tf); + } + + return 0; + +err: + timed_fence_fini(&tf); + return -EINVAL; +} + int i915_sw_fence_mock_selftests(void) { static const struct i915_subtest tests[] = { @@ -576,6 +617,7 @@ int i915_sw_fence_mock_selftests(void) SUBTEST(test_C_AB), SUBTEST(test_chain), SUBTEST(test_ipc), + SUBTEST(test_timer), }; return i915_subtests(tests, NULL); diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c new file mode 100644 index 000000000000..3790fdf44a1a --- /dev/null +++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c @@ -0,0 +1,78 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include "lib_sw_fence.h" + +/* Small library of different fence types useful for writing tests */ + +static int __i915_sw_fence_call +nop_fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) +{ + return NOTIFY_DONE; +} + +void __onstack_fence_init(struct i915_sw_fence *fence, + const char *name, + struct lock_class_key *key) +{ + debug_fence_init_onstack(fence); + + __init_waitqueue_head(&fence->wait, name, key); + atomic_set(&fence->pending, 1); + fence->flags = (unsigned long)nop_fence_notify; +} + +void onstack_fence_fini(struct i915_sw_fence *fence) +{ + i915_sw_fence_commit(fence); + i915_sw_fence_fini(fence); +} + +static void timed_fence_wake(unsigned long data) +{ + struct timed_fence *tf = (struct timed_fence *)data; + + i915_sw_fence_commit(&tf->fence); +} + +void timed_fence_init(struct timed_fence *tf, unsigned long expires) +{ + onstack_fence_init(&tf->fence); + + setup_timer_on_stack(&tf->timer, timed_fence_wake, (unsigned long)tf); + + if (time_after(expires, jiffies)) + mod_timer(&tf->timer, expires); + else + i915_sw_fence_commit(&tf->fence); +} + +void timed_fence_fini(struct timed_fence *tf) +{ + if (del_timer_sync(&tf->timer)) + i915_sw_fence_commit(&tf->fence); + + destroy_timer_on_stack(&tf->timer); + i915_sw_fence_fini(&tf->fence); +} diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.h b/drivers/gpu/drm/i915/selftests/lib_sw_fence.h new file mode 100644 index 000000000000..474aafb92ae1 --- /dev/null +++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.h @@ -0,0 +1,42 @@ +/* + * lib_sw_fence.h - library routines for testing N:M synchronisation points + * + * Copyright (C) 2017 Intel Corporation + * + * This file is released under the GPLv2. + * + */ + +#ifndef _LIB_SW_FENCE_H_ +#define _LIB_SW_FENCE_H_ + +#include <linux/timer.h> + +#include "../i915_sw_fence.h" + +#ifdef CONFIG_LOCKDEP +#define onstack_fence_init(fence) \ +do { \ + static struct lock_class_key __key; \ + \ + __onstack_fence_init((fence), #fence, &__key); \ +} while (0) +#else +#define onstack_fence_init(fence) \ + __onstack_fence_init((fence), NULL, NULL) +#endif + +void __onstack_fence_init(struct i915_sw_fence *fence, + const char *name, + struct lock_class_key *key); +void onstack_fence_fini(struct i915_sw_fence *fence); + +struct timed_fence { + struct i915_sw_fence fence; + struct timer_list timer; +}; + +void timed_fence_init(struct timed_fence *tf, unsigned long expires); +void timed_fence_fini(struct timed_fence *tf); + +#endif /* _LIB_SW_FENCE_H_ */ diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c index 098ce643ad07..bbf80d42e793 100644 --- a/drivers/gpu/drm/i915/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/selftests/mock_context.c @@ -73,11 +73,7 @@ err_put: void mock_context_close(struct i915_gem_context *ctx) { - i915_gem_context_set_closed(ctx); - - i915_ppgtt_close(&ctx->ppgtt->base); - - i915_gem_context_put(ctx); + context_close(ctx); } void mock_init_contexts(struct drm_i915_private *i915) diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c index fc0fd7498689..331c2b09869e 100644 --- a/drivers/gpu/drm/i915/selftests/mock_engine.c +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c @@ -32,9 +32,9 @@ static struct mock_request *first_request(struct mock_engine *engine) link); } -static void hw_delay_complete(unsigned long data) +static void hw_delay_complete(struct timer_list *t) { - struct mock_engine *engine = 
(typeof(engine))data; + struct mock_engine *engine = from_timer(engine, t, hw_delay); struct mock_request *request; spin_lock(&engine->hw_lock); @@ -161,9 +161,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, /* fake hw queue */ spin_lock_init(&engine->hw_lock); - setup_timer(&engine->hw_delay, - hw_delay_complete, - (unsigned long)engine); + timer_setup(&engine->hw_delay, hw_delay_complete, 0); INIT_LIST_HEAD(&engine->hw_queue); return &engine->base; diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 33008fa1be9b..d0b26dd80076 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -8,6 +8,7 @@ msm-y := \ adreno/a4xx_gpu.o \ adreno/a5xx_gpu.o \ adreno/a5xx_power.o \ + adreno/a5xx_preempt.o \ hdmi/hdmi.o \ hdmi/hdmi_audio.o \ hdmi/hdmi_bridge.o \ @@ -57,7 +58,8 @@ msm-y := \ msm_iommu.o \ msm_perf.o \ msm_rd.o \ - msm_ringbuffer.o + msm_ringbuffer.o \ + msm_submitqueue.o msm-$(CONFIG_DRM_FBDEV_EMULATION) += msm_fbdev.o msm-$(CONFIG_COMMON_CLK) += mdp/mdp4/mdp4_lvds_pll.o diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index 7791313405b5..4baef2738178 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -44,7 +44,7 @@ static bool a3xx_idle(struct msm_gpu *gpu); static bool a3xx_me_init(struct msm_gpu *gpu) { - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT3(ring, CP_ME_INIT, 17); OUT_RING(ring, 0x000003f7); @@ -65,7 +65,7 @@ static bool a3xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); return a3xx_idle(gpu); } @@ -339,7 +339,7 @@ static void a3xx_destroy(struct msm_gpu *gpu) static bool a3xx_idle(struct msm_gpu *gpu) { /* wait for ringbuffer to drain: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, gpu->rb[0])) return false; /* then wait for GPU to finish: */ @@ -444,9 +444,9 @@ static const struct adreno_gpu_funcs funcs = { .pm_suspend = msm_gpu_pm_suspend, .pm_resume = msm_gpu_pm_resume, .recover = a3xx_recover, - .last_fence = adreno_last_fence, .submit = adreno_submit, .flush = adreno_flush, + .active_ring = adreno_active_ring, .irq = a3xx_irq, .destroy = a3xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -492,7 +492,7 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) adreno_gpu->registers = a3xx_registers; adreno_gpu->reg_offsets = a3xx_register_offsets; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 58341ef6f15b..8199a4b9f2fa 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -116,7 +116,7 @@ static void a4xx_enable_hwcg(struct msm_gpu *gpu) static bool a4xx_me_init(struct msm_gpu *gpu) { - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT3(ring, CP_ME_INIT, 17); OUT_RING(ring, 0x000003f7); @@ -137,7 +137,7 @@ static bool a4xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); return a4xx_idle(gpu); } @@ -337,7 +337,7 @@ static void a4xx_destroy(struct msm_gpu *gpu) static bool a4xx_idle(struct msm_gpu *gpu) { /* wait for ringbuffer to drain: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, gpu->rb[0])) return false; /* then 
wait for GPU to finish: */ @@ -532,9 +532,9 @@ static const struct adreno_gpu_funcs funcs = { .pm_suspend = a4xx_pm_suspend, .pm_resume = a4xx_pm_resume, .recover = a4xx_recover, - .last_fence = adreno_last_fence, .submit = adreno_submit, .flush = adreno_flush, + .active_ring = adreno_active_ring, .irq = a4xx_irq, .destroy = a4xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -574,7 +574,7 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) adreno_gpu->registers = a4xx_registers; adreno_gpu->reg_offsets = a4xx_register_offsets; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 17c59d839e6f..a1f4eeeb73e2 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -26,8 +26,9 @@ static void a5xx_dump(struct msm_gpu *gpu); #define GPU_PAS_ID 13 -static int zap_shader_load_mdt(struct device *dev, const char *fwname) +static int zap_shader_load_mdt(struct msm_gpu *gpu, const char *fwname) { + struct device *dev = &gpu->pdev->dev; const struct firmware *fw; struct device_node *np; struct resource r; @@ -55,10 +56,10 @@ static int zap_shader_load_mdt(struct device *dev, const char *fwname) mem_size = resource_size(&r); /* Request the MDT file for the firmware */ - ret = request_firmware(&fw, fwname, dev); - if (ret) { + fw = adreno_request_fw(to_adreno_gpu(gpu), fwname); + if (IS_ERR(fw)) { DRM_DEV_ERROR(dev, "Unable to load %s\n", fwname); - return ret; + return PTR_ERR(fw); } /* Figure out how much memory we need */ @@ -75,9 +76,26 @@ static int zap_shader_load_mdt(struct device *dev, const char *fwname) goto out; } - /* Load the rest of the MDT */ - ret = qcom_mdt_load(dev, fw, fwname, GPU_PAS_ID, mem_region, mem_phys, - mem_size); + /* + * Load the rest of the MDT + * + * Note that we could be dealing with two different paths, since + * with upstream linux-firmware it would be in a qcom/ subdir.. + * adreno_request_fw() handles this, but qcom_mdt_load() does + * not. 
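/*
 * Equivalent of the "qcom/" prefixing done in the zap-shader hunk below,
 * pulled out as a self-contained helper (illustrative only; the patch
 * itself open-codes it with an on-stack buffer):
 */
static char *example_prefixed_fw_name(const char *fwname)
{
	return kasprintf(GFP_KERNEL, "qcom/%s", fwname);	/* kfree() later */
}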
But since we've already gotten thru adreno_request_fw() + * we know which of the two cases it is: + */ + if (to_adreno_gpu(gpu)->fwloc == FW_LOCATION_LEGACY) { + ret = qcom_mdt_load(dev, fw, fwname, GPU_PAS_ID, + mem_region, mem_phys, mem_size); + } else { + char newname[strlen("qcom/") + strlen(fwname) + 1]; + + sprintf(newname, "qcom/%s", fwname); + + ret = qcom_mdt_load(dev, fw, newname, GPU_PAS_ID, + mem_region, mem_phys, mem_size); + } if (ret) goto out; @@ -95,14 +113,65 @@ out: return ret; } +static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + uint32_t wptr; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + + /* Copy the shadow to the actual register */ + ring->cur = ring->next; + + /* Make sure to wrap wptr if we need to */ + wptr = get_wptr(ring); + + spin_unlock_irqrestore(&ring->lock, flags); + + /* Make sure everything is posted before making a decision */ + mb(); + + /* Update HW if this is the current ring and we are not in preempt */ + if (a5xx_gpu->cur_ring == ring && !a5xx_in_preempt(a5xx_gpu)) + gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr); +} + static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); struct msm_drm_private *priv = gpu->dev->dev_private; - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x02); + + /* Turn off protected mode to write to special registers */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 0); + + /* Set the save preemption record for the ring/command */ + OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2); + OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[submit->ring->id])); + OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[submit->ring->id])); + + /* Turn back on protected mode */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 1); + + /* Enable local preemption for finegrain preemption */ + OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x02); + + /* Allow CP_CONTEXT_SWITCH_YIELD packets in the IB2 */ + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x02); + + /* Submit the commands */ for (i = 0; i < submit->nr_cmds; i++) { switch (submit->cmd[i].type) { case MSM_SUBMIT_CMD_IB_TARGET_BUF: @@ -120,16 +189,54 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } } + /* + * Write the render mode to NULL (0) to indicate to the CP that the IBs + * are done rendering - otherwise a lucky preemption would start + * replaying from the last checkpoint + */ + OUT_PKT7(ring, CP_SET_RENDER_MODE, 5); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + + /* Turn off IB level preemptions */ + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x01); + + /* Write the fence to the scratch register */ OUT_PKT4(ring, REG_A5XX_CP_SCRATCH_REG(2), 1); - OUT_RING(ring, submit->fence->seqno); + OUT_RING(ring, submit->seqno); + /* + * Execute a CACHE_FLUSH_TS event. 
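/*
 * rbmemptr() used below resolves a GPU address inside the ring's shared
 * memptrs block; a sketch of the layout it assumes (field set abridged,
 * names follow msm_ringbuffer.h but are shown here only for illustration):
 */
struct msm_rbmemptrs {
	volatile uint32_t rptr;
	volatile uint32_t fence;
};

#define rbmemptr(ring, member) \
	((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member))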
This will ensure that the + * timestamp is written to the memory and then triggers the interrupt + */ OUT_PKT7(ring, CP_EVENT_WRITE, 4); OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31)); - OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, fence))); - OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, fence))); - OUT_RING(ring, submit->fence->seqno); + OUT_RING(ring, lower_32_bits(rbmemptr(ring, fence))); + OUT_RING(ring, upper_32_bits(rbmemptr(ring, fence))); + OUT_RING(ring, submit->seqno); - gpu->funcs->flush(gpu); + /* Yield the floor on command completion */ + OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4); + /* + * If dword[2:1] are non zero, they specify an address for the CP to + * write the value of dword[3] to on preemption complete. Write 0 to + * skip the write + */ + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x00); + /* Data value - not used if the address above is 0 */ + OUT_RING(ring, 0x01); + /* Set bit 0 to trigger an interrupt on preempt complete */ + OUT_RING(ring, 0x01); + + a5xx_flush(gpu, ring); + + /* Check to see if we need to start preemption */ + a5xx_preempt_trigger(gpu); } static const struct { @@ -245,7 +352,7 @@ void a5xx_set_hwcg(struct msm_gpu *gpu, bool state) static int a5xx_me_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT7(ring, CP_ME_INIT, 8); @@ -276,11 +383,54 @@ static int a5xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); + return a5xx_idle(gpu, ring) ? 0 : -EINVAL; +} + +static int a5xx_preempt_start(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_ringbuffer *ring = gpu->rb[0]; + + if (gpu->nr_rings == 1) + return 0; + + /* Turn off protected mode to write to special registers */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 0); + + /* Set the save preemption record for the ring/command */ + OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2); + OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[ring->id])); + OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[ring->id])); + + /* Turn back on protected mode */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 1); + + OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x00); + + OUT_PKT7(ring, CP_PREEMPT_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x01); - return a5xx_idle(gpu) ? 0 : -EINVAL; + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x01); + + /* Yield the floor on command completion */ + OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4); + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x01); + OUT_RING(ring, 0x01); + + gpu->funcs->flush(gpu, ring); + + return a5xx_idle(gpu, ring) ? 
0 : -EINVAL; } + static struct drm_gem_object *a5xx_ucode_load_bo(struct msm_gpu *gpu, const struct firmware *fw, u64 *iova) { @@ -381,7 +531,7 @@ static int a5xx_zap_shader_init(struct msm_gpu *gpu) return -ENODEV; } - ret = zap_shader_load_mdt(&pdev->dev, adreno_gpu->info->zapfw); + ret = zap_shader_load_mdt(gpu, adreno_gpu->info->zapfw); loaded = !ret; @@ -396,6 +546,7 @@ static int a5xx_zap_shader_init(struct msm_gpu *gpu) A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW | \ A5XX_RBBM_INT_0_MASK_CP_HW_ERROR | \ A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT | \ + A5XX_RBBM_INT_0_MASK_CP_SW | \ A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \ A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \ A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) @@ -536,13 +687,14 @@ static int a5xx_hw_init(struct msm_gpu *gpu) REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_HI, 0x00000000); gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_TRUSTED_SIZE, 0x00000000); - /* Load the GPMU firmware before starting the HW init */ - a5xx_gpmu_ucode_init(gpu); - ret = adreno_hw_init(gpu); if (ret) return ret; + a5xx_preempt_hw_init(gpu); + + a5xx_gpmu_ucode_init(gpu); + ret = a5xx_ucode_init(gpu); if (ret) return ret; @@ -565,11 +717,11 @@ static int a5xx_hw_init(struct msm_gpu *gpu) * ticking correctly */ if (adreno_is_a530(adreno_gpu)) { - OUT_PKT7(gpu->rb, CP_EVENT_WRITE, 1); - OUT_RING(gpu->rb, 0x0F); + OUT_PKT7(gpu->rb[0], CP_EVENT_WRITE, 1); + OUT_RING(gpu->rb[0], 0x0F); - gpu->funcs->flush(gpu); - if (!a5xx_idle(gpu)) + gpu->funcs->flush(gpu, gpu->rb[0]); + if (!a5xx_idle(gpu, gpu->rb[0])) return -EINVAL; } @@ -582,11 +734,11 @@ static int a5xx_hw_init(struct msm_gpu *gpu) */ ret = a5xx_zap_shader_init(gpu); if (!ret) { - OUT_PKT7(gpu->rb, CP_SET_SECURE_MODE, 1); - OUT_RING(gpu->rb, 0x00000000); + OUT_PKT7(gpu->rb[0], CP_SET_SECURE_MODE, 1); + OUT_RING(gpu->rb[0], 0x00000000); - gpu->funcs->flush(gpu); - if (!a5xx_idle(gpu)) + gpu->funcs->flush(gpu, gpu->rb[0]); + if (!a5xx_idle(gpu, gpu->rb[0])) return -EINVAL; } else { /* Print a warning so if we die, we know why */ @@ -595,6 +747,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_RBBM_SECVID_TRUST_CNTL, 0x0); } + /* Last step - yield the ringbuffer */ + a5xx_preempt_start(gpu); + return 0; } @@ -625,6 +780,8 @@ static void a5xx_destroy(struct msm_gpu *gpu) DBG("%s", gpu->name); + a5xx_preempt_fini(gpu); + if (a5xx_gpu->pm4_bo) { if (a5xx_gpu->pm4_iova) msm_gem_put_iova(a5xx_gpu->pm4_bo, gpu->aspace); @@ -660,18 +817,27 @@ static inline bool _a5xx_check_idle(struct msm_gpu *gpu) A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT); } -bool a5xx_idle(struct msm_gpu *gpu) +bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + + if (ring != a5xx_gpu->cur_ring) { + WARN(1, "Tried to idle a non-current ringbuffer\n"); + return false; + } + /* wait for CP to drain ringbuffer: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, ring)) return false; if (spin_until(_a5xx_check_idle(gpu))) { - DRM_ERROR("%s: %ps: timeout waiting for GPU to idle: status %8.8X irq %8.8X\n", + DRM_ERROR("%s: %ps: timeout waiting for GPU to idle: status %8.8X irq %8.8X rptr/wptr %d/%d\n", gpu->name, __builtin_return_address(0), gpu_read(gpu, REG_A5XX_RBBM_STATUS), - gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS)); - + gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS), + gpu_read(gpu, REG_A5XX_CP_RB_RPTR), + gpu_read(gpu, REG_A5XX_CP_RB_WPTR)); return false; } @@ -802,9 +968,10 @@ static void a5xx_fault_detect_irq(struct 
msm_gpu *gpu) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; + struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); - dev_err(dev->dev, "gpu fault fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n", - gpu->funcs->last_fence(gpu), + dev_err(dev->dev, "gpu fault ring %d fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n", + ring ? ring->id : -1, ring ? ring->seqno : 0, gpu_read(gpu, REG_A5XX_RBBM_STATUS), gpu_read(gpu, REG_A5XX_CP_RB_RPTR), gpu_read(gpu, REG_A5XX_CP_RB_WPTR), @@ -854,8 +1021,13 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu) if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) a5xx_gpmu_err_irq(gpu); - if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) + if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) { + a5xx_preempt_trigger(gpu); msm_gpu_retire(gpu); + } + + if (status & A5XX_RBBM_INT_0_MASK_CP_SW) + a5xx_preempt_irq(gpu); return IRQ_HANDLED; } @@ -985,6 +1157,14 @@ static void a5xx_show(struct msm_gpu *gpu, struct seq_file *m) } #endif +static struct msm_ringbuffer *a5xx_active_ring(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + + return a5xx_gpu->cur_ring; +} + static const struct adreno_gpu_funcs funcs = { .base = { .get_param = adreno_get_param, @@ -992,9 +1172,9 @@ static const struct adreno_gpu_funcs funcs = { .pm_suspend = a5xx_pm_suspend, .pm_resume = a5xx_pm_resume, .recover = a5xx_recover, - .last_fence = adreno_last_fence, .submit = a5xx_submit, - .flush = adreno_flush, + .flush = a5xx_flush, + .active_ring = a5xx_active_ring, .irq = a5xx_irq, .destroy = a5xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -1030,7 +1210,7 @@ struct msm_gpu *a5xx_gpu_init(struct drm_device *dev) a5xx_gpu->lm_leakage = 0x4E001A; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 4); if (ret) { a5xx_destroy(&(a5xx_gpu->base.base)); return ERR_PTR(ret); @@ -1039,5 +1219,8 @@ struct msm_gpu *a5xx_gpu_init(struct drm_device *dev) if (gpu->aspace) msm_mmu_set_fault_handler(gpu->aspace->mmu, gpu, a5xx_fault_handler); + /* Set up the preemption specific bits and pieces for each ringbuffer */ + a5xx_preempt_init(gpu); + return gpu; } diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h index e94451685bf8..6fb8c2f9b9e4 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 The Linux Foundation. All rights reserved. +/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -35,10 +35,100 @@ struct a5xx_gpu { uint32_t gpmu_dwords; uint32_t lm_leakage; + + struct msm_ringbuffer *cur_ring; + struct msm_ringbuffer *next_ring; + + struct drm_gem_object *preempt_bo[MSM_GPU_MAX_RINGS]; + struct a5xx_preempt_record *preempt[MSM_GPU_MAX_RINGS]; + uint64_t preempt_iova[MSM_GPU_MAX_RINGS]; + + atomic_t preempt_state; + struct timer_list preempt_timer; }; #define to_a5xx_gpu(x) container_of(x, struct a5xx_gpu, base) +/* + * In order to do lockless preemption we use a simple state machine to progress + * through the process. + * + * PREEMPT_NONE - no preemption in progress. Next state START. + * PREEMPT_START - The trigger is evaulating if preemption is possible. 
Next + * states: TRIGGERED, NONE + * PREEMPT_ABORT - An intermediate state before moving back to NONE. Next + * state: NONE. + * PREEMPT_TRIGGERED: A preemption has been executed on the hardware. Next + * states: FAULTED, PENDING + * PREEMPT_FAULTED: A preemption timed out (never completed). This will trigger + * recovery. Next state: N/A + * PREEMPT_PENDING: Preemption complete interrupt fired - the callback is + * checking the success of the operation. Next state: FAULTED, NONE. + */ + +enum preempt_state { + PREEMPT_NONE = 0, + PREEMPT_START, + PREEMPT_ABORT, + PREEMPT_TRIGGERED, + PREEMPT_FAULTED, + PREEMPT_PENDING, +}; + +/* + * struct a5xx_preempt_record is a shared buffer between the microcode and the + * CPU to store the state for preemption. The record itself is much larger + * (64k) but most of that is used by the CP for storage. + * + * There is a preemption record assigned per ringbuffer. When the CPU triggers a + * preemption, it fills out the record with the useful information (wptr, ring + * base, etc) and the microcode uses that information to set up the CP following + * the preemption. When a ring is switched out, the CP will save the ringbuffer + * state back to the record. In this way, once the records are properly set up + * the CPU can quickly switch back and forth between ringbuffers by only + * updating a few registers (often only the wptr). + * + * These are the CPU aware registers in the record: + * @magic: Must always be 0x27C4BAFC + * @info: Type of the record - written 0 by the CPU, updated by the CP + * @data: Data field from SET_RENDER_MODE or a checkpoint. Written and used by + * the CP + * @cntl: Value of RB_CNTL written by CPU, save/restored by CP + * @rptr: Value of RB_RPTR written by CPU, save/restored by CP + * @wptr: Value of RB_WPTR written by CPU, save/restored by CP + * @rptr_addr: Value of RB_RPTR_ADDR written by CPU, save/restored by CP + * @rbase: Value of RB_BASE written by CPU, save/restored by CP + * @counter: GPU address of the storage area for the performance counters + */ +struct a5xx_preempt_record { + uint32_t magic; + uint32_t info; + uint32_t data; + uint32_t cntl; + uint32_t rptr; + uint32_t wptr; + uint64_t rptr_addr; + uint64_t rbase; + uint64_t counter; +}; + +/* Magic identifier for the preemption record */ +#define A5XX_PREEMPT_RECORD_MAGIC 0x27C4BAFCUL + +/* + * Even though the structure above is only a few bytes, we need a full 64k to + * store the entire preemption record from the CP + */ +#define A5XX_PREEMPT_RECORD_SIZE (64 * 1024) + +/* + * The preemption counter block is a storage area for the value of the + * preemption counters that are saved immediately before context switch. We + * append it on to the end of the allocation for the preemption record. 
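/*
 * Resulting layout of the single BO allocated per ring (see
 * preempt_init_ring() below); sizes are the macros defined here, the
 * offset is implied by ptr->counter = iova + A5XX_PREEMPT_RECORD_SIZE:
 *
 *	iova + 0   : CP save/restore record   (A5XX_PREEMPT_RECORD_SIZE, 64K)
 *	iova + 64K : performance counter area (A5XX_PREEMPT_COUNTER_SIZE)
 *
 * A compile-time guard one could add (hypothetical, not in the patch):
 */
_Static_assert(sizeof(struct a5xx_preempt_record) <= A5XX_PREEMPT_RECORD_SIZE,
	       "CPU-visible record must fit in the CP record area");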
+ */ +#define A5XX_PREEMPT_COUNTER_SIZE (16 * 4) + + int a5xx_power_init(struct msm_gpu *gpu); void a5xx_gpmu_ucode_init(struct msm_gpu *gpu); @@ -55,7 +145,22 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs, return -ETIMEDOUT; } -bool a5xx_idle(struct msm_gpu *gpu); +bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); void a5xx_set_hwcg(struct msm_gpu *gpu, bool state); +void a5xx_preempt_init(struct msm_gpu *gpu); +void a5xx_preempt_hw_init(struct msm_gpu *gpu); +void a5xx_preempt_trigger(struct msm_gpu *gpu); +void a5xx_preempt_irq(struct msm_gpu *gpu); +void a5xx_preempt_fini(struct msm_gpu *gpu); + +/* Return true if we are in a preempt state */ +static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu) +{ + int preempt_state = atomic_read(&a5xx_gpu->preempt_state); + + return !(preempt_state == PREEMPT_NONE || + preempt_state == PREEMPT_ABORT); +} + #endif /* __A5XX_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index 04aab1dcae2b..e5700bbf09dd 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -173,7 +173,7 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; if (!a5xx_gpu->gpmu_dwords) return 0; @@ -192,9 +192,9 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu) OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); OUT_RING(ring, 1); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); - if (!a5xx_idle(gpu)) { + if (!a5xx_idle(gpu, ring)) { DRM_ERROR("%s: Unable to load GPMU firmware. GPMU will not be active\n", gpu->name); return -EINVAL; @@ -264,7 +264,8 @@ void a5xx_gpmu_ucode_init(struct msm_gpu *gpu) return; /* Get the firmware */ - if (request_firmware(&fw, adreno_gpu->info->gpmufw, drm->dev)) { + fw = adreno_request_fw(adreno_gpu, adreno_gpu->info->gpmufw); + if (IS_ERR(fw)) { DRM_ERROR("%s: Could not get GPMU firmware. GPMU will not be active\n", gpu->name); return; diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c new file mode 100644 index 000000000000..40f4840ef98e --- /dev/null +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -0,0 +1,305 @@ +/* Copyright (c) 2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "msm_gem.h" +#include "a5xx_gpu.h" + +/* + * Try to transition the preemption state from old to new. Return + * true on success or false if the original state wasn't 'old' + */ +static inline bool try_preempt_state(struct a5xx_gpu *a5xx_gpu, + enum preempt_state old, enum preempt_state new) +{ + enum preempt_state cur = atomic_cmpxchg(&a5xx_gpu->preempt_state, + old, new); + + return (cur == old); +} + +/* + * Force the preemption state to the specified state. 
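/*
 * The legal transitions, collected into one table as the trigger, irq and
 * timer paths below actually perform them (informational sketch only; the
 * driver enforces them implicitly via try_preempt_state() and the forced
 * set_preempt_state()):
 */
static const unsigned int preempt_legal_next[] = {
	[PREEMPT_NONE]      = BIT(PREEMPT_START),
	[PREEMPT_START]     = BIT(PREEMPT_TRIGGERED) | BIT(PREEMPT_ABORT),
	[PREEMPT_ABORT]     = BIT(PREEMPT_NONE),
	[PREEMPT_TRIGGERED] = BIT(PREEMPT_PENDING) | BIT(PREEMPT_FAULTED),
	[PREEMPT_PENDING]   = BIT(PREEMPT_NONE) | BIT(PREEMPT_FAULTED),
	/* PREEMPT_FAULTED has no successor; GPU recovery takes over */
};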
This is used in cases + * where the current state is known and won't change + */ +static inline void set_preempt_state(struct a5xx_gpu *gpu, + enum preempt_state new) +{ + /* + * preempt_state may be read by other cores trying to trigger a + * preemption or in the interrupt handler so barriers are needed + * before... + */ + smp_mb__before_atomic(); + atomic_set(&gpu->preempt_state, new); + /* ... and after*/ + smp_mb__after_atomic(); +} + +/* Write the most recent wptr for the given ring into the hardware */ +static inline void update_wptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring) +{ + unsigned long flags; + uint32_t wptr; + + if (!ring) + return; + + spin_lock_irqsave(&ring->lock, flags); + wptr = get_wptr(ring); + spin_unlock_irqrestore(&ring->lock, flags); + + gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr); +} + +/* Return the highest priority ringbuffer with something in it */ +static struct msm_ringbuffer *get_next_ring(struct msm_gpu *gpu) +{ + unsigned long flags; + int i; + + for (i = 0; i < gpu->nr_rings; i++) { + bool empty; + struct msm_ringbuffer *ring = gpu->rb[i]; + + spin_lock_irqsave(&ring->lock, flags); + empty = (get_wptr(ring) == ring->memptrs->rptr); + spin_unlock_irqrestore(&ring->lock, flags); + + if (!empty) + return ring; + } + + return NULL; +} + +static void a5xx_preempt_timer(unsigned long data) +{ + struct a5xx_gpu *a5xx_gpu = (struct a5xx_gpu *) data; + struct msm_gpu *gpu = &a5xx_gpu->base.base; + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + + if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_FAULTED)) + return; + + dev_err(dev->dev, "%s: preemption timed out\n", gpu->name); + queue_work(priv->wq, &gpu->recover_work); +} + +/* Try to trigger a preemption switch */ +void a5xx_preempt_trigger(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + unsigned long flags; + struct msm_ringbuffer *ring; + + if (gpu->nr_rings == 1) + return; + + /* + * Try to start preemption by moving from NONE to START. If + * unsuccessful, a preemption is already in flight + */ + if (!try_preempt_state(a5xx_gpu, PREEMPT_NONE, PREEMPT_START)) + return; + + /* Get the next ring to preempt to */ + ring = get_next_ring(gpu); + + /* + * If no ring is populated or the highest priority ring is the current + * one do nothing except to update the wptr to the latest and greatest + */ + if (!ring || (a5xx_gpu->cur_ring == ring)) { + /* + * Its possible that while a preemption request is in progress + * from an irq context, a user context trying to submit might + * fail to update the write pointer, because it determines + * that the preempt state is not PREEMPT_NONE. + * + * Close the race by introducing an intermediate + * state PREEMPT_ABORT to let the submit path + * know that the ringbuffer is not going to change + * and can safely update the write pointer. 
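/*
 * Submit-side counterpart of the window described above: a5xx_flush()
 * earlier in this patch only pokes the wptr register when no switch is in
 * flight, and a5xx_in_preempt() deliberately reports false for both
 * PREEMPT_NONE and PREEMPT_ABORT, so an aborted trigger never stalls a
 * submit. Condensed from a5xx_flush():
 */
if (a5xx_gpu->cur_ring == ring && !a5xx_in_preempt(a5xx_gpu))
	gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);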
+ */ + + set_preempt_state(a5xx_gpu, PREEMPT_ABORT); + update_wptr(gpu, a5xx_gpu->cur_ring); + set_preempt_state(a5xx_gpu, PREEMPT_NONE); + return; + } + + /* Make sure the wptr doesn't update while we're in motion */ + spin_lock_irqsave(&ring->lock, flags); + a5xx_gpu->preempt[ring->id]->wptr = get_wptr(ring); + spin_unlock_irqrestore(&ring->lock, flags); + + /* Set the address of the incoming preemption record */ + gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_LO, + REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_HI, + a5xx_gpu->preempt_iova[ring->id]); + + a5xx_gpu->next_ring = ring; + + /* Start a timer to catch a stuck preemption */ + mod_timer(&a5xx_gpu->preempt_timer, jiffies + msecs_to_jiffies(10000)); + + /* Set the preemption state to triggered */ + set_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED); + + /* Make sure everything is written before hitting the button */ + wmb(); + + /* And actually start the preemption */ + gpu_write(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL, 1); +} + +void a5xx_preempt_irq(struct msm_gpu *gpu) +{ + uint32_t status; + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + + if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_PENDING)) + return; + + /* Delete the preemption watchdog timer */ + del_timer(&a5xx_gpu->preempt_timer); + + /* + * The hardware should be setting CP_CONTEXT_SWITCH_CNTL to zero before + * firing the interrupt, but there is a non-zero chance of a hardware + * condition or a software race that could set it again before we have a + * chance to finish. If that happens, log and go for recovery + */ + status = gpu_read(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL); + if (unlikely(status)) { + set_preempt_state(a5xx_gpu, PREEMPT_FAULTED); + dev_err(dev->dev, "%s: Preemption failed to complete\n", + gpu->name); + queue_work(priv->wq, &gpu->recover_work); + return; + } + + a5xx_gpu->cur_ring = a5xx_gpu->next_ring; + a5xx_gpu->next_ring = NULL; + + update_wptr(gpu, a5xx_gpu->cur_ring); + + set_preempt_state(a5xx_gpu, PREEMPT_NONE); +} + +void a5xx_preempt_hw_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + int i; + + for (i = 0; i < gpu->nr_rings; i++) { + a5xx_gpu->preempt[i]->wptr = 0; + a5xx_gpu->preempt[i]->rptr = 0; + a5xx_gpu->preempt[i]->rbase = gpu->rb[i]->iova; + } + + /* Write a 0 to signal that we aren't switching pagetables */ + gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO, + REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI, 0); + + /* Reset the preemption state */ + set_preempt_state(a5xx_gpu, PREEMPT_NONE); + + /* Always come up on rb 0 */ + a5xx_gpu->cur_ring = gpu->rb[0]; +} + +static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu, + struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = &a5xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; + struct a5xx_preempt_record *ptr; + struct drm_gem_object *bo = NULL; + u64 iova = 0; + + ptr = msm_gem_kernel_new(gpu->dev, + A5XX_PREEMPT_RECORD_SIZE + A5XX_PREEMPT_COUNTER_SIZE, + MSM_BO_UNCACHED, gpu->aspace, &bo, &iova); + + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + a5xx_gpu->preempt_bo[ring->id] = bo; + a5xx_gpu->preempt_iova[ring->id] = iova; + a5xx_gpu->preempt[ring->id] = ptr; + + /* Set up the defaults on the preemption record */ + + ptr->magic = A5XX_PREEMPT_RECORD_MAGIC; + ptr->info = 0; + ptr->data = 0; + ptr->cntl =
MSM_GPU_RB_CNTL_DEFAULT; + ptr->rptr_addr = rbmemptr(ring, rptr); + ptr->counter = iova + A5XX_PREEMPT_RECORD_SIZE; + + return 0; +} + +void a5xx_preempt_fini(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + int i; + + for (i = 0; i < gpu->nr_rings; i++) { + if (!a5xx_gpu->preempt_bo[i]) + continue; + + msm_gem_put_vaddr(a5xx_gpu->preempt_bo[i]); + + if (a5xx_gpu->preempt_iova[i]) + msm_gem_put_iova(a5xx_gpu->preempt_bo[i], gpu->aspace); + + drm_gem_object_unreference(a5xx_gpu->preempt_bo[i]); + a5xx_gpu->preempt_bo[i] = NULL; + } +} + +void a5xx_preempt_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + int i; + + /* No preemption if we only have one ring */ + if (gpu->nr_rings <= 1) + return; + + for (i = 0; i < gpu->nr_rings; i++) { + if (preempt_init_ring(a5xx_gpu, gpu->rb[i])) { + /* + * On any failure our adventure is over. Clean up and + * set nr_rings to 1 to force preemption off + */ + a5xx_preempt_fini(gpu); + gpu->nr_rings = 1; + + return; + } + } + + setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer, + (unsigned long) a5xx_gpu); +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index c75c4df4bc39..05022ea2a007 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -125,51 +125,24 @@ struct msm_gpu *adreno_load_gpu(struct drm_device *dev) { struct msm_drm_private *priv = dev->dev_private; struct platform_device *pdev = priv->gpu_pdev; - struct adreno_platform_config *config; - struct adreno_rev rev; - const struct adreno_info *info; - struct msm_gpu *gpu = NULL; + struct msm_gpu *gpu = platform_get_drvdata(priv->gpu_pdev); + int ret; - if (!pdev) { + if (!gpu) { dev_err(dev->dev, "no adreno device\n"); return NULL; } - config = pdev->dev.platform_data; - rev = config->rev; - info = adreno_info(config->rev); - - if (!info) { - dev_warn(dev->dev, "Unknown GPU revision: %u.%u.%u.%u\n", - rev.core, rev.major, rev.minor, rev.patchid); + pm_runtime_get_sync(&pdev->dev); + mutex_lock(&dev->struct_mutex); + ret = msm_gpu_hw_init(gpu); + mutex_unlock(&dev->struct_mutex); + pm_runtime_put_sync(&pdev->dev); + if (ret) { + dev_err(dev->dev, "gpu hw init failed: %d\n", ret); return NULL; } - DBG("Found GPU: %u.%u.%u.%u", rev.core, rev.major, - rev.minor, rev.patchid); - - gpu = info->init(dev); - if (IS_ERR(gpu)) { - dev_warn(dev->dev, "failed to load adreno gpu\n"); - gpu = NULL; - /* not fatal */ - } - - if (gpu) { - int ret; - - pm_runtime_get_sync(&pdev->dev); - mutex_lock(&dev->struct_mutex); - ret = msm_gpu_hw_init(gpu); - mutex_unlock(&dev->struct_mutex); - pm_runtime_put_sync(&pdev->dev); - if (ret) { - dev_err(dev->dev, "gpu hw init failed: %d\n", ret); - gpu->funcs->destroy(gpu); - gpu = NULL; - } - } - return gpu; } @@ -282,6 +255,9 @@ static int adreno_get_pwrlevels(struct device *dev, static int adreno_bind(struct device *dev, struct device *master, void *data) { static struct adreno_platform_config config = {}; + const struct adreno_info *info; + struct drm_device *drm = dev_get_drvdata(master); + struct msm_gpu *gpu; u32 val; int ret; @@ -302,13 +278,39 @@ static int adreno_bind(struct device *dev, struct device *master, void *data) return ret; dev->platform_data = &config; - set_gpu_pdev(dev_get_drvdata(master), to_platform_device(dev)); + set_gpu_pdev(drm, to_platform_device(dev)); + + info 
= adreno_info(config.rev); + + if (!info) { + dev_warn(drm->dev, "Unknown GPU revision: %u.%u.%u.%u\n", + config.rev.core, config.rev.major, + config.rev.minor, config.rev.patchid); + return -ENXIO; + } + + DBG("Found GPU: %u.%u.%u.%u", config.rev.core, config.rev.major, + config.rev.minor, config.rev.patchid); + + gpu = info->init(drm); + if (IS_ERR(gpu)) { + dev_warn(drm->dev, "failed to load adreno gpu\n"); + return PTR_ERR(gpu); + } + + dev_set_drvdata(dev, gpu); + return 0; } static void adreno_unbind(struct device *dev, struct device *master, void *data) { + struct msm_gpu *gpu = dev_get_drvdata(dev); + + gpu->funcs->pm_suspend(gpu); + gpu->funcs->destroy(gpu); + set_gpu_pdev(dev_get_drvdata(master), NULL); } diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index c8b4ac254bb5..e2ffecce59a3 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -21,8 +21,6 @@ #include "msm_gem.h" #include "msm_mmu.h" -#define RB_SIZE SZ_32K -#define RB_BLKSIZE 32 int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) { @@ -58,72 +56,181 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) return ret; } return -EINVAL; + case MSM_PARAM_NR_RINGS: + *value = gpu->nr_rings; + return 0; default: DBG("%s: invalid param: %u", gpu->name, param); return -EINVAL; } } +const struct firmware * +adreno_request_fw(struct adreno_gpu *adreno_gpu, const char *fwname) +{ + struct drm_device *drm = adreno_gpu->base.dev; + const struct firmware *fw = NULL; + char newname[strlen("qcom/") + strlen(fwname) + 1]; + int ret; + + sprintf(newname, "qcom/%s", fwname); + + /* + * Try first to load from qcom/$fwfile using a direct load (to avoid + * a potential timeout waiting for usermode helper) + */ + if ((adreno_gpu->fwloc == FW_LOCATION_UNKNOWN) || + (adreno_gpu->fwloc == FW_LOCATION_NEW)) { + + ret = request_firmware_direct(&fw, newname, drm->dev); + if (!ret) { + dev_info(drm->dev, "loaded %s from new location\n", + newname); + adreno_gpu->fwloc = FW_LOCATION_NEW; + return fw; + } else if (adreno_gpu->fwloc != FW_LOCATION_UNKNOWN) { + dev_err(drm->dev, "failed to load %s: %d\n", + newname, ret); + return ERR_PTR(ret); + } + } + + /* + * Then try the legacy location without qcom/ prefix + */ + if ((adreno_gpu->fwloc == FW_LOCATION_UNKNOWN) || + (adreno_gpu->fwloc == FW_LOCATION_LEGACY)) { + + ret = request_firmware_direct(&fw, fwname, drm->dev); + if (!ret) { + dev_info(drm->dev, "loaded %s from legacy location\n", + newname); + adreno_gpu->fwloc = FW_LOCATION_LEGACY; + return fw; + } else if (adreno_gpu->fwloc != FW_LOCATION_UNKNOWN) { + dev_err(drm->dev, "failed to load %s: %d\n", + fwname, ret); + return ERR_PTR(ret); + } + } + + /* + * Finally fall back to request_firmware() for cases where the + * usermode helper is needed (I think mainly android) + */ + if ((adreno_gpu->fwloc == FW_LOCATION_UNKNOWN) || + (adreno_gpu->fwloc == FW_LOCATION_HELPER)) { + + ret = request_firmware(&fw, newname, drm->dev); + if (!ret) { + dev_info(drm->dev, "loaded %s with helper\n", + newname); + adreno_gpu->fwloc = FW_LOCATION_HELPER; + return fw; + } else if (adreno_gpu->fwloc != FW_LOCATION_UNKNOWN) { + dev_err(drm->dev, "failed to load %s: %d\n", + newname, ret); + return ERR_PTR(ret); + } + } + + dev_err(drm->dev, "failed to load %s\n", fwname); + return ERR_PTR(-ENOENT); +} + +static int adreno_load_fw(struct adreno_gpu *adreno_gpu) +{ + const struct firmware *fw; + + if (adreno_gpu->pm4) + return 0; 
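The adreno_request_fw() helper above implements a three-tier search: the new qcom/ path with a direct load, then the legacy bare path, and finally the usermode helper; fwloc records whichever tier succeeded, so every later file must come from the same place. A rough userspace analogue of that probe-once-then-pin policy (paths, names and the fopen() framing are illustrative assumptions, not the kernel API):

#include <stdio.h>

enum fwloc { LOC_UNKNOWN, LOC_NEW, LOC_LEGACY };
static enum fwloc fwloc = LOC_UNKNOWN;

/* Try /lib/firmware/qcom/<name> first, then /lib/firmware/<name>; once a
 * location works, all later loads are pinned to it, like adreno_gpu->fwloc */
static FILE *load_fw(const char *name)
{
        char path[256];
        FILE *f;

        if (fwloc == LOC_UNKNOWN || fwloc == LOC_NEW) {
                snprintf(path, sizeof(path), "/lib/firmware/qcom/%s", name);
                f = fopen(path, "rb");
                if (f) {
                        fwloc = LOC_NEW;
                        return f;
                }
                if (fwloc == LOC_NEW) /* pinned location failed: hard error */
                        return NULL;
        }

        if (fwloc == LOC_UNKNOWN || fwloc == LOC_LEGACY) {
                snprintf(path, sizeof(path), "/lib/firmware/%s", name);
                f = fopen(path, "rb");
                if (f) {
                        fwloc = LOC_LEGACY;
                        return f;
                }
        }
        return NULL; /* the kernel would fall back to the helper here */
}

int main(void)
{
        FILE *f = load_fw("a530_pm4.fw");
        if (f)
                fclose(f);
        return 0;
}

Once a tier is latched, a missing file becomes a hard error instead of silently falling through, which keeps a mixed install from loading PM4 and PFP from different places.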
+ + fw = adreno_request_fw(adreno_gpu, adreno_gpu->info->pm4fw); + if (IS_ERR(fw)) + return PTR_ERR(fw); + adreno_gpu->pm4 = fw; + + fw = adreno_request_fw(adreno_gpu, adreno_gpu->info->pfpfw); + if (IS_ERR(fw)) { + release_firmware(adreno_gpu->pm4); + adreno_gpu->pm4 = NULL; + return PTR_ERR(fw); + } + adreno_gpu->pfp = fw; + + return 0; +} + int adreno_hw_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - int ret; + int ret, i; DBG("%s", gpu->name); - ret = msm_gem_get_iova(gpu->rb->bo, gpu->aspace, &gpu->rb_iova); - if (ret) { - gpu->rb_iova = 0; - dev_err(gpu->dev->dev, "could not map ringbuffer: %d\n", ret); + ret = adreno_load_fw(adreno_gpu); + if (ret) return ret; - } - /* reset ringbuffer: */ - gpu->rb->cur = gpu->rb->start; + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + if (!ring) + continue; + + ret = msm_gem_get_iova(ring->bo, gpu->aspace, &ring->iova); + if (ret) { + ring->iova = 0; + dev_err(gpu->dev->dev, + "could not map ringbuffer %d: %d\n", i, ret); + return ret; + } + + ring->cur = ring->start; + ring->next = ring->start; - /* reset completed fence seqno: */ - adreno_gpu->memptrs->fence = gpu->fctx->completed_fence; - adreno_gpu->memptrs->rptr = 0; + /* reset completed fence seqno: */ + ring->memptrs->fence = ring->seqno; + ring->memptrs->rptr = 0; + } - /* Setup REG_CP_RB_CNTL: */ + /* + * Set up REG_CP_RB_CNTL. The same value is used across targets (with + * the exception of A430, which disables the RPTR shadow) - the calculation + * for the ringbuffer size and block size is moved to msm_gpu.h for the + * pre-processor to deal with, and the A430 variant is ORed in here + */ adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_CNTL, - /* size is log2(quad-words): */ - AXXX_CP_RB_CNTL_BUFSZ(ilog2(gpu->rb->size / 8)) | - AXXX_CP_RB_CNTL_BLKSZ(ilog2(RB_BLKSIZE / 8)) | - (adreno_is_a430(adreno_gpu) ? AXXX_CP_RB_CNTL_NO_UPDATE : 0)); + MSM_GPU_RB_CNTL_DEFAULT | + (adreno_is_a430(adreno_gpu) ?
AXXX_CP_RB_CNTL_NO_UPDATE : 0)); - /* Setup ringbuffer address: */ + /* Setup ringbuffer address - use ringbuffer[0] for GPU init */ adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_BASE, - REG_ADRENO_CP_RB_BASE_HI, gpu->rb_iova); + REG_ADRENO_CP_RB_BASE_HI, gpu->rb[0]->iova); if (!adreno_is_a430(adreno_gpu)) { adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_RPTR_ADDR, REG_ADRENO_CP_RB_RPTR_ADDR_HI, - rbmemptr(adreno_gpu, rptr)); + rbmemptr(gpu->rb[0], rptr)); } return 0; } -static uint32_t get_wptr(struct msm_ringbuffer *ring) -{ - return ring->cur - ring->start; -} - /* Use this helper to read rptr, since a430 doesn't update rptr in memory */ -static uint32_t get_rptr(struct adreno_gpu *adreno_gpu) +static uint32_t get_rptr(struct adreno_gpu *adreno_gpu, + struct msm_ringbuffer *ring) { if (adreno_is_a430(adreno_gpu)) - return adreno_gpu->memptrs->rptr = adreno_gpu_read( + return ring->memptrs->rptr = adreno_gpu_read( adreno_gpu, REG_ADRENO_CP_RB_RPTR); else - return adreno_gpu->memptrs->rptr; + return ring->memptrs->rptr; } -uint32_t adreno_last_fence(struct msm_gpu *gpu) +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu) { - struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - return adreno_gpu->memptrs->fence; + return gpu->rb[0]; } void adreno_recover(struct msm_gpu *gpu) @@ -149,7 +256,7 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct msm_drm_private *priv = gpu->dev->dev_private; - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = submit->ring; unsigned i; for (i = 0; i < submit->nr_cmds; i++) { @@ -164,7 +271,7 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, case MSM_SUBMIT_CMD_BUF: OUT_PKT3(ring, adreno_is_a430(adreno_gpu) ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2); - OUT_RING(ring, submit->cmd[i].iova); + OUT_RING(ring, lower_32_bits(submit->cmd[i].iova)); OUT_RING(ring, submit->cmd[i].size); OUT_PKT2(ring); break; @@ -172,7 +279,7 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } OUT_PKT0(ring, REG_AXXX_CP_SCRATCH_REG2, 1); - OUT_RING(ring, submit->fence->seqno); + OUT_RING(ring, submit->seqno); if (adreno_is_a3xx(adreno_gpu) || adreno_is_a4xx(adreno_gpu)) { /* Flush HLSQ lazy updates to make sure there is nothing @@ -188,8 +295,8 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, OUT_PKT3(ring, CP_EVENT_WRITE, 3); OUT_RING(ring, CACHE_FLUSH_TS); - OUT_RING(ring, rbmemptr(adreno_gpu, fence)); - OUT_RING(ring, submit->fence->seqno); + OUT_RING(ring, rbmemptr(ring, fence)); + OUT_RING(ring, submit->seqno); /* we could maybe be clever and only CP_COND_EXEC the interrupt: */ OUT_PKT3(ring, CP_INTERRUPT, 1); @@ -215,20 +322,23 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } #endif - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); } -void adreno_flush(struct msm_gpu *gpu) +void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); uint32_t wptr; + /* Copy the shadow to the actual register */ + ring->cur = ring->next; + /* * Mask wptr value that we calculate to fit in the HW range. 
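To make that arithmetic concrete: the wptr the driver programs is a dword offset wrapped into a power-of-two ring, and ring_freewords() reserves one slot so that rptr == wptr always means empty rather than full. A self-checking sketch with an assumed 32 KiB ring (the driver takes the real size from MSM_GPU_RINGBUFFER_SZ):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RING_DWORDS (32768 / 4) /* assumed 32 KiB ring, counted in dwords */

/* Wrap a raw dword offset into the ring; mirrors get_wptr() */
static uint32_t wptr_of(uint32_t cur_off)
{
        return cur_off % RING_DWORDS;
}

/* Free dwords between wptr and rptr; mirrors ring_freewords(). The "- 1"
 * keeps one slot unused so a full ring never looks identical to an empty
 * one. */
static uint32_t freewords(uint32_t rptr, uint32_t wptr)
{
        return (rptr + (RING_DWORDS - 1) - wptr) % RING_DWORDS;
}

int main(void)
{
        assert(freewords(0, 0) == RING_DWORDS - 1); /* empty */
        assert(freewords(10, 9) == 0);              /* full */
        /* a command that fits exactly leaves cur == end; wptr wraps to 0 */
        assert(wptr_of(RING_DWORDS) == 0);
        printf("ring arithmetic ok\n");
        return 0;
}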
This is * to account for the possibility that the last command fit exactly into * the ringbuffer and rb->next hasn't wrapped to zero yet */ - wptr = get_wptr(gpu->rb) & ((gpu->rb->size / 4) - 1); + wptr = get_wptr(ring); /* ensure writes to ringbuffer have hit system memory: */ mb(); @@ -236,17 +346,19 @@ void adreno_flush(struct msm_gpu *gpu) adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_WPTR, wptr); } -bool adreno_idle(struct msm_gpu *gpu) +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t wptr = get_wptr(gpu->rb); + uint32_t wptr = get_wptr(ring); /* wait for CP to drain ringbuffer: */ - if (!spin_until(get_rptr(adreno_gpu) == wptr)) + if (!spin_until(get_rptr(adreno_gpu, ring) == wptr)) return true; /* TODO maybe we need to reset GPU here to recover from hang? */ - DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name); + DRM_ERROR("%s: timeout waiting to drain ringbuffer %d rptr/wptr = %X/%X\n", + gpu->name, ring->id, get_rptr(adreno_gpu, ring), wptr); + return false; } @@ -261,10 +373,16 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - seq_printf(m, "fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->fctx->last_fence); - seq_printf(m, "rptr: %d\n", get_rptr(adreno_gpu)); - seq_printf(m, "rb wptr: %d\n", get_wptr(gpu->rb)); + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + seq_printf(m, "rb %d: fence: %d/%d\n", i, + ring->memptrs->fence, ring->seqno); + + seq_printf(m, " rptr: %d\n", + get_rptr(adreno_gpu, ring)); + seq_printf(m, "rb wptr: %d\n", get_wptr(ring)); + } /* dump these out in a form that can be parsed by demsm: */ seq_printf(m, "IO:region %s 00000000 00020000\n", gpu->name); @@ -290,16 +408,23 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) void adreno_dump_info(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + int i; printk("revision: %d (%d.%d.%d.%d)\n", adreno_gpu->info->revn, adreno_gpu->rev.core, adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - printk("fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->fctx->last_fence); - printk("rptr: %d\n", get_rptr(adreno_gpu)); - printk("rb wptr: %d\n", get_wptr(gpu->rb)); + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + printk("rb %d: fence: %d/%d\n", i, + ring->memptrs->fence, + ring->seqno); + + printk("rptr: %d\n", get_rptr(adreno_gpu, ring)); + printk("rb wptr: %d\n", get_wptr(ring)); + } } /* would be nice to not have to duplicate the _show() stuff with printk(): */ @@ -322,28 +447,31 @@ void adreno_dump(struct msm_gpu *gpu) } } -static uint32_t ring_freewords(struct msm_gpu *gpu) +static uint32_t ring_freewords(struct msm_ringbuffer *ring) { - struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t size = gpu->rb->size / 4; - uint32_t wptr = get_wptr(gpu->rb); - uint32_t rptr = get_rptr(adreno_gpu); + struct adreno_gpu *adreno_gpu = to_adreno_gpu(ring->gpu); + uint32_t size = MSM_GPU_RINGBUFFER_SZ >> 2; + /* Use ring->next to calculate free size */ + uint32_t wptr = ring->next - ring->start; + uint32_t rptr = get_rptr(adreno_gpu, ring); return (rptr + (size - 1) - wptr) % size; } -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords) +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords) { - if (spin_until(ring_freewords(gpu) >= ndwords)) - DRM_ERROR("%s: timeout waiting for 
ringbuffer space\n", gpu->name); + if (spin_until(ring_freewords(ring) >= ndwords)) + DRM_DEV_ERROR(ring->gpu->dev->dev, + "timeout waiting for space in ringbuffer %d\n", + ring->id); } int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct adreno_gpu *adreno_gpu, const struct adreno_gpu_funcs *funcs) + struct adreno_gpu *adreno_gpu, + const struct adreno_gpu_funcs *funcs, int nr_rings) { struct adreno_platform_config *config = pdev->dev.platform_data; struct msm_gpu_config adreno_gpu_config = { 0 }; struct msm_gpu *gpu = &adreno_gpu->base; - int ret; adreno_gpu->funcs = funcs; adreno_gpu->info = adreno_info(config->rev); @@ -366,59 +494,20 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, adreno_gpu_config.va_start = SZ_16M; adreno_gpu_config.va_end = 0xffffffff; - adreno_gpu_config.ringsz = RB_SIZE; + adreno_gpu_config.nr_rings = nr_rings; pm_runtime_set_autosuspend_delay(&pdev->dev, DRM_MSM_INACTIVE_PERIOD); pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_enable(&pdev->dev); - ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, + return msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, adreno_gpu->info->name, &adreno_gpu_config); - if (ret) - return ret; - - ret = request_firmware(&adreno_gpu->pm4, adreno_gpu->info->pm4fw, drm->dev); - if (ret) { - dev_err(drm->dev, "failed to load %s PM4 firmware: %d\n", - adreno_gpu->info->pm4fw, ret); - return ret; - } - - ret = request_firmware(&adreno_gpu->pfp, adreno_gpu->info->pfpfw, drm->dev); - if (ret) { - dev_err(drm->dev, "failed to load %s PFP firmware: %d\n", - adreno_gpu->info->pfpfw, ret); - return ret; - } - - adreno_gpu->memptrs = msm_gem_kernel_new(drm, - sizeof(*adreno_gpu->memptrs), MSM_BO_UNCACHED, gpu->aspace, - &adreno_gpu->memptrs_bo, &adreno_gpu->memptrs_iova); - - if (IS_ERR(adreno_gpu->memptrs)) { - ret = PTR_ERR(adreno_gpu->memptrs); - adreno_gpu->memptrs = NULL; - dev_err(drm->dev, "could not allocate memptrs: %d\n", ret); - } - - return ret; } void adreno_gpu_cleanup(struct adreno_gpu *adreno_gpu) { - struct msm_gpu *gpu = &adreno_gpu->base; - - if (adreno_gpu->memptrs_bo) { - if (adreno_gpu->memptrs) - msm_gem_put_vaddr(adreno_gpu->memptrs_bo); - - if (adreno_gpu->memptrs_iova) - msm_gem_put_iova(adreno_gpu->memptrs_bo, gpu->aspace); - - drm_gem_object_unreference_unlocked(adreno_gpu->memptrs_bo); - } release_firmware(adreno_gpu->pm4); release_firmware(adreno_gpu->pfp); - msm_gpu_cleanup(gpu); + msm_gpu_cleanup(&adreno_gpu->base); } diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 4d9165f29f43..28e3de6e5f94 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -2,7 +2,7 @@ * Copyright (C) 2013 Red Hat * Author: Rob Clark <robdclark@gmail.com> * - * Copyright (c) 2014 The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2017 The Linux Foundation. All rights reserved.
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by @@ -82,14 +82,6 @@ struct adreno_info { const struct adreno_info *adreno_info(struct adreno_rev rev); -#define rbmemptr(adreno_gpu, member) \ - ((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member)) - -struct adreno_rbmemptrs { - volatile uint32_t rptr; - volatile uint32_t fence; -}; - struct adreno_gpu { struct msm_gpu base; struct adreno_rev rev; @@ -101,16 +93,30 @@ struct adreno_gpu { /* interesting register offsets to dump: */ const unsigned int *registers; + /* + * Are we loading fw from legacy path? Prior to addition + * of gpu firmware to linux-firmware, the fw files were + * placed in toplevel firmware directory, following qcom's + * android kernel. But linux-firmware preferred they be + * placed in a 'qcom' subdirectory. + * + * For backwards compatibility, we try first to load from + * the new path, using request_firmware_direct() to avoid + * any potential timeout waiting for usermode helper, then + * fall back to the old path (with direct load). And + * finally fall back to request_firmware() with the new + * path to allow the usermode helper. + */ + enum { + FW_LOCATION_UNKNOWN = 0, + FW_LOCATION_NEW, /* /lib/firmware/qcom/$fwfile */ + FW_LOCATION_LEGACY, /* /lib/firmware/$fwfile */ + FW_LOCATION_HELPER, + } fwloc; + /* firmware: */ const struct firmware *pm4, *pfp; - /* ringbuffer rptr/wptr: */ - // TODO should this be in msm_ringbuffer? I think it would be - // different for z180.. - struct adreno_rbmemptrs *memptrs; - struct drm_gem_object *memptrs_bo; - uint64_t memptrs_iova; - /* * Register offsets are different between some GPUs. * GPU specific offsets will be exported by GPU specific @@ -196,22 +202,25 @@ static inline int adreno_is_a530(struct adreno_gpu *gpu) } int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value); +const struct firmware *adreno_request_fw(struct adreno_gpu *adreno_gpu, + const char *fwname); int adreno_hw_init(struct msm_gpu *gpu); -uint32_t adreno_last_fence(struct msm_gpu *gpu); void adreno_recover(struct msm_gpu *gpu); void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx); -void adreno_flush(struct msm_gpu *gpu); -bool adreno_idle(struct msm_gpu *gpu); +void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); #ifdef CONFIG_DEBUG_FS void adreno_show(struct msm_gpu *gpu, struct seq_file *m); #endif void adreno_dump_info(struct msm_gpu *gpu); void adreno_dump(struct msm_gpu *gpu); -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords); +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords); +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu); int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs); + struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs, + int nr_rings); void adreno_gpu_cleanup(struct adreno_gpu *gpu); @@ -220,7 +229,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu); static inline void OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF)); } @@ -228,14 +237,14 @@ OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) static 
inline void OUT_PKT2(struct msm_ringbuffer *ring) { - adreno_wait_ring(ring->gpu, 1); + adreno_wait_ring(ring, 1); OUT_RING(ring, CP_TYPE2_PKT); } static inline void OUT_PKT3(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8)); } @@ -257,14 +266,14 @@ static inline u32 PM4_PARITY(u32 val) static inline void OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, PKT4(regindx, cnt)); } static inline void OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | ((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23)); } @@ -323,6 +332,11 @@ static inline void adreno_gpu_write64(struct adreno_gpu *gpu, adreno_gpu_write(gpu, hi, upper_32_bits(data)); } +static inline uint32_t get_wptr(struct msm_ringbuffer *ring) +{ + return (ring->cur - ring->start) % (MSM_GPU_RINGBUFFER_SZ >> 2); +} + /* * Given a register and a count, return a value to program into * REG_CP_PROTECT_REG(n) - this will block both reads and writes for _len diff --git a/drivers/gpu/drm/msm/dsi/dsi_cfg.c b/drivers/gpu/drm/msm/dsi/dsi_cfg.c index a5d75c9b3a73..65c1dfbbe019 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_cfg.c +++ b/drivers/gpu/drm/msm/dsi/dsi_cfg.c @@ -14,7 +14,7 @@ #include "dsi_cfg.h" static const char * const dsi_v2_bus_clk_names[] = { - "core_mmss_clk", "iface_clk", "bus_clk", + "core_mmss", "iface", "bus", }; static const struct msm_dsi_config apq8064_dsi_cfg = { @@ -34,7 +34,7 @@ static const struct msm_dsi_config apq8064_dsi_cfg = { }; static const char * const dsi_6g_bus_clk_names[] = { - "mdp_core_clk", "iface_clk", "bus_clk", "core_mmss_clk", + "mdp_core", "iface", "bus", "core_mmss", }; static const struct msm_dsi_config msm8974_apq8084_dsi_cfg = { @@ -55,7 +55,7 @@ static const struct msm_dsi_config msm8974_apq8084_dsi_cfg = { }; static const char * const dsi_8916_bus_clk_names[] = { - "mdp_core_clk", "iface_clk", "bus_clk", + "mdp_core", "iface", "bus", }; static const struct msm_dsi_config msm8916_dsi_cfg = { @@ -99,7 +99,7 @@ static const struct msm_dsi_config msm8994_dsi_cfg = { * without it too. 
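The hunks below drop the redundant _clk suffix from the clock names the driver requests. Old device trees presumably keep working because the msm_clk_get() helper these call sites switch to (not shown in this diff) is expected to try the new name first and then fall back to the legacy <name>_clk spelling; a toy version of that two-step lookup against an invented in-memory device tree:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Pretend DT: one node still uses the legacy suffix, one the new name */
static const char *dt_clock_names[] = { "iface_clk", "bus" };

static int dt_has_clock(const char *name)
{
        for (size_t i = 0; i < sizeof(dt_clock_names) / sizeof(*dt_clock_names); i++)
                if (!strcmp(dt_clock_names[i], name))
                        return 1;
        return 0;
}

/* Try the new-style name first, then "<name>_clk" for old device trees */
static void get_clock(const char *name)
{
        char legacy[64];

        if (dt_has_clock(name)) {
                printf("got '%s'\n", name);
                return;
        }
        snprintf(legacy, sizeof(legacy), "%s_clk", name);
        if (dt_has_clock(legacy))
                printf("got '%s' (legacy name; DT should be updated)\n", legacy);
        else
                printf("no clock '%s'\n", name);
}

int main(void)
{
        get_clock("iface"); /* found via legacy "iface_clk" */
        get_clock("bus");   /* found directly */
        return 0;
}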
Figure out why it doesn't enable and uncomment below */ static const char * const dsi_8996_bus_clk_names[] = { - "mdp_core_clk", "iface_clk", "bus_clk", /* "core_mmss_clk", */ + "mdp_core", "iface", "bus", /* "core_mmss", */ }; static const struct msm_dsi_config msm8996_dsi_cfg = { diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c b/drivers/gpu/drm/msm/dsi/dsi_host.c index dbb31a014419..0f7324a686ca 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_host.c +++ b/drivers/gpu/drm/msm/dsi/dsi_host.c @@ -248,7 +248,7 @@ disable_clks: clk_disable_unprepare(ahb_clk); disable_gdsc: regulator_disable(gdsc_reg); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); put_clk: clk_put(ahb_clk); put_gdsc: @@ -334,46 +334,46 @@ static int dsi_regulator_init(struct msm_dsi_host *msm_host) static int dsi_clk_init(struct msm_dsi_host *msm_host) { - struct device *dev = &msm_host->pdev->dev; + struct platform_device *pdev = msm_host->pdev; const struct msm_dsi_cfg_handler *cfg_hnd = msm_host->cfg_hnd; const struct msm_dsi_config *cfg = cfg_hnd->cfg; int i, ret = 0; /* get bus clocks */ for (i = 0; i < cfg->num_bus_clks; i++) { - msm_host->bus_clks[i] = devm_clk_get(dev, + msm_host->bus_clks[i] = msm_clk_get(pdev, cfg->bus_clk_names[i]); if (IS_ERR(msm_host->bus_clks[i])) { ret = PTR_ERR(msm_host->bus_clks[i]); - pr_err("%s: Unable to get %s, ret = %d\n", + pr_err("%s: Unable to get %s clock, ret = %d\n", __func__, cfg->bus_clk_names[i], ret); goto exit; } } /* get link and source clocks */ - msm_host->byte_clk = devm_clk_get(dev, "byte_clk"); + msm_host->byte_clk = msm_clk_get(pdev, "byte"); if (IS_ERR(msm_host->byte_clk)) { ret = PTR_ERR(msm_host->byte_clk); - pr_err("%s: can't find dsi_byte_clk. ret=%d\n", + pr_err("%s: can't find dsi_byte clock. ret=%d\n", __func__, ret); msm_host->byte_clk = NULL; goto exit; } - msm_host->pixel_clk = devm_clk_get(dev, "pixel_clk"); + msm_host->pixel_clk = msm_clk_get(pdev, "pixel"); if (IS_ERR(msm_host->pixel_clk)) { ret = PTR_ERR(msm_host->pixel_clk); - pr_err("%s: can't find dsi_pixel_clk. ret=%d\n", + pr_err("%s: can't find dsi_pixel clock. ret=%d\n", __func__, ret); msm_host->pixel_clk = NULL; goto exit; } - msm_host->esc_clk = devm_clk_get(dev, "core_clk"); + msm_host->esc_clk = msm_clk_get(pdev, "core"); if (IS_ERR(msm_host->esc_clk)) { ret = PTR_ERR(msm_host->esc_clk); - pr_err("%s: can't find dsi_esc_clk. ret=%d\n", + pr_err("%s: can't find dsi_esc clock. ret=%d\n", __func__, ret); msm_host->esc_clk = NULL; goto exit; @@ -382,22 +382,22 @@ static int dsi_clk_init(struct msm_dsi_host *msm_host) msm_host->byte_clk_src = clk_get_parent(msm_host->byte_clk); if (!msm_host->byte_clk_src) { ret = -ENODEV; - pr_err("%s: can't find byte_clk_src. ret=%d\n", __func__, ret); + pr_err("%s: can't find byte_clk clock. ret=%d\n", __func__, ret); goto exit; } msm_host->pixel_clk_src = clk_get_parent(msm_host->pixel_clk); if (!msm_host->pixel_clk_src) { ret = -ENODEV; - pr_err("%s: can't find pixel_clk_src. ret=%d\n", __func__, ret); + pr_err("%s: can't find pixel_clk clock. ret=%d\n", __func__, ret); goto exit; } if (cfg_hnd->major == MSM_DSI_VER_MAJOR_V2) { - msm_host->src_clk = devm_clk_get(dev, "src_clk"); + msm_host->src_clk = msm_clk_get(pdev, "src"); if (IS_ERR(msm_host->src_clk)) { ret = PTR_ERR(msm_host->src_clk); - pr_err("%s: can't find dsi_src_clk. ret=%d\n", + pr_err("%s: can't find src clock. 
ret=%d\n", __func__, ret); msm_host->src_clk = NULL; goto exit; @@ -406,7 +406,7 @@ static int dsi_clk_init(struct msm_dsi_host *msm_host) msm_host->esc_clk_src = clk_get_parent(msm_host->esc_clk); if (!msm_host->esc_clk_src) { ret = -ENODEV; - pr_err("%s: can't get esc_clk_src. ret=%d\n", + pr_err("%s: can't get esc clock parent. ret=%d\n", __func__, ret); goto exit; } @@ -414,7 +414,7 @@ static int dsi_clk_init(struct msm_dsi_host *msm_host) msm_host->dsi_clk_src = clk_get_parent(msm_host->src_clk); if (!msm_host->dsi_clk_src) { ret = -ENODEV; - pr_err("%s: can't get dsi_clk_src. ret=%d\n", + pr_err("%s: can't get src clock parent. ret=%d\n", __func__, ret); } } diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c index 7c9bf91bc22b..790ca280cbfd 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c @@ -482,7 +482,7 @@ static int dsi_phy_driver_probe(struct platform_device *pdev) goto fail; } - phy->ahb_clk = devm_clk_get(dev, "iface_clk"); + phy->ahb_clk = msm_clk_get(pdev, "iface"); if (IS_ERR(phy->ahb_clk)) { dev_err(dev, "%s: Unable to get ahb clk\n", __func__); ret = PTR_ERR(phy->ahb_clk); diff --git a/drivers/gpu/drm/msm/edp/edp_ctrl.c b/drivers/gpu/drm/msm/edp/edp_ctrl.c index e32a4a4f3797..7c72264101ff 100644 --- a/drivers/gpu/drm/msm/edp/edp_ctrl.c +++ b/drivers/gpu/drm/msm/edp/edp_ctrl.c @@ -150,46 +150,46 @@ static const struct edp_pixel_clk_div clk_divs[2][EDP_PIXEL_CLK_NUM] = { static int edp_clk_init(struct edp_ctrl *ctrl) { - struct device *dev = &ctrl->pdev->dev; + struct platform_device *pdev = ctrl->pdev; int ret; - ctrl->aux_clk = devm_clk_get(dev, "core_clk"); + ctrl->aux_clk = msm_clk_get(pdev, "core"); if (IS_ERR(ctrl->aux_clk)) { ret = PTR_ERR(ctrl->aux_clk); - pr_err("%s: Can't find aux_clk, %d\n", __func__, ret); + pr_err("%s: Can't find core clock, %d\n", __func__, ret); ctrl->aux_clk = NULL; return ret; } - ctrl->pixel_clk = devm_clk_get(dev, "pixel_clk"); + ctrl->pixel_clk = msm_clk_get(pdev, "pixel"); if (IS_ERR(ctrl->pixel_clk)) { ret = PTR_ERR(ctrl->pixel_clk); - pr_err("%s: Can't find pixel_clk, %d\n", __func__, ret); + pr_err("%s: Can't find pixel clock, %d\n", __func__, ret); ctrl->pixel_clk = NULL; return ret; } - ctrl->ahb_clk = devm_clk_get(dev, "iface_clk"); + ctrl->ahb_clk = msm_clk_get(pdev, "iface"); if (IS_ERR(ctrl->ahb_clk)) { ret = PTR_ERR(ctrl->ahb_clk); - pr_err("%s: Can't find ahb_clk, %d\n", __func__, ret); + pr_err("%s: Can't find iface clock, %d\n", __func__, ret); ctrl->ahb_clk = NULL; return ret; } - ctrl->link_clk = devm_clk_get(dev, "link_clk"); + ctrl->link_clk = msm_clk_get(pdev, "link"); if (IS_ERR(ctrl->link_clk)) { ret = PTR_ERR(ctrl->link_clk); - pr_err("%s: Can't find link_clk, %d\n", __func__, ret); + pr_err("%s: Can't find link clock, %d\n", __func__, ret); ctrl->link_clk = NULL; return ret; } /* need mdp core clock to receive irq */ - ctrl->mdp_core_clk = devm_clk_get(dev, "mdp_core_clk"); + ctrl->mdp_core_clk = msm_clk_get(pdev, "mdp_core"); if (IS_ERR(ctrl->mdp_core_clk)) { ret = PTR_ERR(ctrl->mdp_core_clk); - pr_err("%s: Can't find mdp_core_clk, %d\n", __func__, ret); + pr_err("%s: Can't find mdp_core clock, %d\n", __func__, ret); ctrl->mdp_core_clk = NULL; return ret; } diff --git a/drivers/gpu/drm/msm/hdmi/hdmi.c b/drivers/gpu/drm/msm/hdmi/hdmi.c index 17e069a133a4..e63dc0fb55f8 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi.c @@ -208,7 +208,7 @@ static struct hdmi *msm_hdmi_init(struct platform_device *pdev) 
for (i = 0; i < config->hpd_clk_cnt; i++) { struct clk *clk; - clk = devm_clk_get(&pdev->dev, config->hpd_clk_names[i]); + clk = msm_clk_get(pdev, config->hpd_clk_names[i]); if (IS_ERR(clk)) { ret = PTR_ERR(clk); dev_err(&pdev->dev, "failed to get hpd clk: %s (%d)\n", @@ -228,7 +228,7 @@ static struct hdmi *msm_hdmi_init(struct platform_device *pdev) for (i = 0; i < config->pwr_clk_cnt; i++) { struct clk *clk; - clk = devm_clk_get(&pdev->dev, config->pwr_clk_names[i]); + clk = msm_clk_get(pdev, config->pwr_clk_names[i]); if (IS_ERR(clk)) { ret = PTR_ERR(clk); dev_err(&pdev->dev, "failed to get pwr clk: %s (%d)\n", @@ -361,7 +361,7 @@ static const char *hpd_reg_names_none[] = {}; static struct hdmi_platform_config hdmi_tx_8660_config; static const char *hpd_reg_names_8960[] = {"core-vdda", "hdmi-mux"}; -static const char *hpd_clk_names_8960[] = {"core_clk", "master_iface_clk", "slave_iface_clk"}; +static const char *hpd_clk_names_8960[] = {"core", "master_iface", "slave_iface"}; static struct hdmi_platform_config hdmi_tx_8960_config = { HDMI_CFG(hpd_reg, 8960), @@ -370,8 +370,8 @@ static struct hdmi_platform_config hdmi_tx_8960_config = { static const char *pwr_reg_names_8x74[] = {"core-vdda", "core-vcc"}; static const char *hpd_reg_names_8x74[] = {"hpd-gdsc", "hpd-5v"}; -static const char *pwr_clk_names_8x74[] = {"extp_clk", "alt_iface_clk"}; -static const char *hpd_clk_names_8x74[] = {"iface_clk", "core_clk", "mdp_core_clk"}; +static const char *pwr_clk_names_8x74[] = {"extp", "alt_iface"}; +static const char *hpd_clk_names_8x74[] = {"iface", "core", "mdp_core"}; static unsigned long hpd_clk_freq_8x74[] = {0, 19200000, 0}; static struct hdmi_platform_config hdmi_tx_8974_config = { diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_phy.c b/drivers/gpu/drm/msm/hdmi/hdmi_phy.c index 534ce5b49781..5e631392dc85 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi_phy.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi_phy.c @@ -48,7 +48,7 @@ static int msm_hdmi_phy_resource_init(struct hdmi_phy *phy) for (i = 0; i < cfg->num_clks; i++) { struct clk *clk; - clk = devm_clk_get(dev, cfg->clk_names[i]); + clk = msm_clk_get(phy->pdev, cfg->clk_names[i]); if (IS_ERR(clk)) { ret = PTR_ERR(clk); dev_err(dev, "failed to get phy clock: %s (%d)\n", diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8960.c b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8960.c index e6ee6b745ab7..0980da8ec966 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8960.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8960.c @@ -48,7 +48,7 @@ static const char * const hdmi_phy_8960_reg_names[] = { }; static const char * const hdmi_phy_8960_clk_names[] = { - "slave_iface_clk", + "slave_iface", }; const struct hdmi_phy_cfg msm_hdmi_phy_8960_cfg = { diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8996.c b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8996.c index 1fb7645cc721..0df504c61833 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8996.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8996.c @@ -758,9 +758,7 @@ static const char * const hdmi_phy_8996_reg_names[] = { }; static const char * const hdmi_phy_8996_clk_names[] = { - "mmagic_iface_clk", - "iface_clk", - "ref_clk", + "iface", "ref", }; const struct hdmi_phy_cfg msm_hdmi_phy_8996_cfg = { diff --git a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8x74.c b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8x74.c index c4a61e537851..4a8b8468586a 100644 --- a/drivers/gpu/drm/msm/hdmi/hdmi_phy_8x74.c +++ b/drivers/gpu/drm/msm/hdmi/hdmi_phy_8x74.c @@ -41,8 +41,7 @@ static const char * const hdmi_phy_8x74_reg_names[] = { }; static const char * const hdmi_phy_8x74_clk_names[] = { - 
"iface_clk", - "alt_iface_clk" + "iface", "alt_iface" }; const struct hdmi_phy_cfg msm_hdmi_phy_8x74_cfg = { diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c index 47fa2aba1983..14bd3bd3e040 100644 --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_crtc.c @@ -290,6 +290,9 @@ static void mdp4_crtc_atomic_disable(struct drm_crtc *crtc, if (WARN_ON(!mdp4_crtc->enabled)) return; + /* Disable/save vblank irq handling before power is disabled */ + drm_crtc_vblank_off(crtc); + mdp_irq_unregister(&mdp4_kms->base, &mdp4_crtc->err); mdp4_disable(mdp4_kms); @@ -308,6 +311,10 @@ static void mdp4_crtc_atomic_enable(struct drm_crtc *crtc, return; mdp4_enable(mdp4_kms); + + /* Restore vblank irq handling after power is enabled */ + drm_crtc_vblank_on(crtc); + mdp_irq_register(&mdp4_kms->base, &mdp4_crtc->err); crtc_flush(crtc); diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c index c2bdad88447e..824067d2d427 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cfg.c @@ -83,6 +83,8 @@ const struct mdp5_cfg_hw msm8x74v1_config = { .caps = MDP_LM_CAP_WB }, }, .nb_stages = 5, + .max_width = 2048, + .max_height = 0xFFFF, }, .dspp = { .count = 3, diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cmd_encoder.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cmd_encoder.c index 60790df91bfa..1abc7f5c345c 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cmd_encoder.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_cmd_encoder.c @@ -224,7 +224,7 @@ int mdp5_cmd_encoder_set_split_display(struct drm_encoder *encoder, mdp5_write(mdp5_kms, REG_MDP5_SPLIT_DPL_LOWER, MDP5_SPLIT_DPL_LOWER_SMART_PANEL); mdp5_write(mdp5_kms, REG_MDP5_SPLIT_DPL_EN, 1); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); return 0; } diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c index 6fcb58ab718c..e414850dbbda 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_crtc.c @@ -55,18 +55,23 @@ struct mdp5_crtc { struct completion pp_completion; + bool lm_cursor_enabled; + struct { /* protect REG_MDP5_LM_CURSOR* registers and cursor scanout_bo*/ spinlock_t lock; /* current cursor being scanned out: */ struct drm_gem_object *scanout_bo; + uint64_t iova; uint32_t width, height; uint32_t x, y; } cursor; }; #define to_mdp5_crtc(x) container_of(x, struct mdp5_crtc, base) +static void mdp5_crtc_restore_cursor(struct drm_crtc *crtc); + static struct mdp5_kms *get_kms(struct drm_crtc *crtc) { struct msm_drm_private *priv = crtc->dev->dev_private; @@ -114,6 +119,8 @@ static u32 crtc_flush_all(struct drm_crtc *crtc) return 0; drm_atomic_crtc_for_each_plane(plane, crtc) { + if (!plane->state->visible) + continue; flush_mask |= mdp5_plane_get_flush(plane); } @@ -242,6 +249,9 @@ static void blend_setup(struct drm_crtc *crtc) drm_atomic_crtc_for_each_plane(plane, crtc) { enum mdp5_pipe right_pipe; + if (!plane->state->visible) + continue; + pstate = to_mdp5_plane_state(plane->state); pstates[pstate->stage] = pstate; stage[pstate->stage][PIPE_LEFT] = mdp5_plane_pipe(plane); @@ -422,11 +432,14 @@ static void mdp5_crtc_atomic_disable(struct drm_crtc *crtc, if (WARN_ON(!mdp5_crtc->enabled)) return; + /* Disable/save vblank irq handling before power is disabled */ + drm_crtc_vblank_off(crtc); + if (mdp5_cstate->cmd_mode) mdp_irq_unregister(&mdp5_kms->base, &mdp5_crtc->pp_done); mdp_irq_unregister(&mdp5_kms->base, 
&mdp5_crtc->err); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); mdp5_crtc->enabled = false; } @@ -446,6 +459,29 @@ static void mdp5_crtc_atomic_enable(struct drm_crtc *crtc, pm_runtime_get_sync(dev); + if (mdp5_crtc->lm_cursor_enabled) { + /* + * Restore LM cursor state, as it might have been lost + * with suspend: + */ + if (mdp5_crtc->cursor.iova) { + unsigned long flags; + + spin_lock_irqsave(&mdp5_crtc->cursor.lock, flags); + mdp5_crtc_restore_cursor(crtc); + spin_unlock_irqrestore(&mdp5_crtc->cursor.lock, flags); + + mdp5_ctl_set_cursor(mdp5_cstate->ctl, + &mdp5_cstate->pipeline, 0, true); + } else { + mdp5_ctl_set_cursor(mdp5_cstate->ctl, + &mdp5_cstate->pipeline, 0, false); + } + } + + /* Restore vblank irq handling after power is enabled */ + drm_crtc_vblank_on(crtc); + mdp5_crtc_mode_set_nofb(crtc); mdp_irq_register(&mdp5_kms->base, &mdp5_crtc->err); @@ -580,6 +616,9 @@ static int mdp5_crtc_atomic_check(struct drm_crtc *crtc, DBG("%s: check", crtc->name); drm_atomic_crtc_state_for_each_plane_state(plane, pstate, state) { + if (!pstate->visible) + continue; + pstates[cnt].plane = plane; pstates[cnt].state = to_mdp5_plane_state(pstate); @@ -723,6 +762,50 @@ static void get_roi(struct drm_crtc *crtc, uint32_t *roi_w, uint32_t *roi_h) mdp5_crtc->cursor.y); } +static void mdp5_crtc_restore_cursor(struct drm_crtc *crtc) +{ + struct mdp5_crtc_state *mdp5_cstate = to_mdp5_crtc_state(crtc->state); + struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc); + struct mdp5_kms *mdp5_kms = get_kms(crtc); + const enum mdp5_cursor_alpha cur_alpha = CURSOR_ALPHA_PER_PIXEL; + uint32_t blendcfg, stride; + uint32_t x, y, width, height; + uint32_t roi_w, roi_h; + int lm; + + assert_spin_locked(&mdp5_crtc->cursor.lock); + + lm = mdp5_cstate->pipeline.mixer->lm; + + x = mdp5_crtc->cursor.x; + y = mdp5_crtc->cursor.y; + width = mdp5_crtc->cursor.width; + height = mdp5_crtc->cursor.height; + + stride = width * drm_format_plane_cpp(DRM_FORMAT_ARGB8888, 0); + + get_roi(crtc, &roi_w, &roi_h); + + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_STRIDE(lm), stride); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_FORMAT(lm), + MDP5_LM_CURSOR_FORMAT_FORMAT(CURSOR_FMT_ARGB8888)); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_IMG_SIZE(lm), + MDP5_LM_CURSOR_IMG_SIZE_SRC_H(height) | + MDP5_LM_CURSOR_IMG_SIZE_SRC_W(width)); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_SIZE(lm), + MDP5_LM_CURSOR_SIZE_ROI_H(roi_h) | + MDP5_LM_CURSOR_SIZE_ROI_W(roi_w)); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_START_XY(lm), + MDP5_LM_CURSOR_START_XY_Y_START(y) | + MDP5_LM_CURSOR_START_XY_X_START(x)); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_BASE_ADDR(lm), + mdp5_crtc->cursor.iova); + + blendcfg = MDP5_LM_CURSOR_BLEND_CONFIG_BLEND_EN; + blendcfg |= MDP5_LM_CURSOR_BLEND_CONFIG_BLEND_ALPHA_SEL(cur_alpha); + mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_BLEND_CONFIG(lm), blendcfg); +} + static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, struct drm_file *file, uint32_t handle, uint32_t width, uint32_t height) @@ -735,16 +818,18 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, struct platform_device *pdev = mdp5_kms->pdev; struct msm_kms *kms = &mdp5_kms->base.base; struct drm_gem_object *cursor_bo, *old_bo = NULL; - uint32_t blendcfg, stride; - uint64_t cursor_addr; struct mdp5_ctl *ctl; - int ret, lm; - enum mdp5_cursor_alpha cur_alpha = CURSOR_ALPHA_PER_PIXEL; + int ret; uint32_t flush_mask = mdp_ctl_flush_mask_cursor(0); - uint32_t roi_w, roi_h; bool cursor_enable = true; unsigned long flags; + if (!mdp5_crtc->lm_cursor_enabled) { + 
dev_warn(dev->dev, + "cursor_set is deprecated with cursor planes\n"); + return -EINVAL; + } + if ((width > CURSOR_WIDTH) || (height > CURSOR_HEIGHT)) { dev_err(dev->dev, "bad cursor size: %dx%d\n", width, height); return -EINVAL; @@ -761,6 +846,7 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, if (!handle) { DBG("Cursor off"); cursor_enable = false; + mdp5_crtc->cursor.iova = 0; pm_runtime_get_sync(&pdev->dev); goto set_cursor; } @@ -769,13 +855,11 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, if (!cursor_bo) return -ENOENT; - ret = msm_gem_get_iova(cursor_bo, kms->aspace, &cursor_addr); + ret = msm_gem_get_iova(cursor_bo, kms->aspace, + &mdp5_crtc->cursor.iova); if (ret) return -EINVAL; - lm = mdp5_cstate->pipeline.mixer->lm; - stride = width * drm_format_plane_cpp(DRM_FORMAT_ARGB8888, 0); - pm_runtime_get_sync(&pdev->dev); spin_lock_irqsave(&mdp5_crtc->cursor.lock, flags); @@ -785,27 +869,10 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, mdp5_crtc->cursor.width = width; mdp5_crtc->cursor.height = height; - get_roi(crtc, &roi_w, &roi_h); - - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_STRIDE(lm), stride); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_FORMAT(lm), - MDP5_LM_CURSOR_FORMAT_FORMAT(CURSOR_FMT_ARGB8888)); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_IMG_SIZE(lm), - MDP5_LM_CURSOR_IMG_SIZE_SRC_H(height) | - MDP5_LM_CURSOR_IMG_SIZE_SRC_W(width)); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_SIZE(lm), - MDP5_LM_CURSOR_SIZE_ROI_H(roi_h) | - MDP5_LM_CURSOR_SIZE_ROI_W(roi_w)); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_BASE_ADDR(lm), cursor_addr); - - blendcfg = MDP5_LM_CURSOR_BLEND_CONFIG_BLEND_EN; - blendcfg |= MDP5_LM_CURSOR_BLEND_CONFIG_BLEND_ALPHA_SEL(cur_alpha); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_BLEND_CONFIG(lm), blendcfg); + mdp5_crtc_restore_cursor(crtc); spin_unlock_irqrestore(&mdp5_crtc->cursor.lock, flags); - pm_runtime_put_autosuspend(&pdev->dev); - set_cursor: ret = mdp5_ctl_set_cursor(ctl, pipeline, 0, cursor_enable); if (ret) { @@ -817,7 +884,7 @@ set_cursor: crtc_flush(crtc, flush_mask); end: - pm_runtime_put_autosuspend(&pdev->dev); + pm_runtime_put_sync(&pdev->dev); if (old_bo) { drm_flip_work_queue(&mdp5_crtc->unref_cursor_work, old_bo); /* enable vblank to complete cursor work: */ @@ -831,12 +898,18 @@ static int mdp5_crtc_cursor_move(struct drm_crtc *crtc, int x, int y) struct mdp5_kms *mdp5_kms = get_kms(crtc); struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc); struct mdp5_crtc_state *mdp5_cstate = to_mdp5_crtc_state(crtc->state); - uint32_t lm = mdp5_cstate->pipeline.mixer->lm; uint32_t flush_mask = mdp_ctl_flush_mask_cursor(0); + struct drm_device *dev = crtc->dev; uint32_t roi_w; uint32_t roi_h; unsigned long flags; + if (!mdp5_crtc->lm_cursor_enabled) { + dev_warn(dev->dev, + "cursor_move is deprecated with cursor planes\n"); + return -EINVAL; + } + /* don't support LM cursors when we have source split enabled */ if (mdp5_cstate->pipeline.r_mixer) return -EINVAL; @@ -853,17 +926,12 @@ static int mdp5_crtc_cursor_move(struct drm_crtc *crtc, int x, int y) pm_runtime_get_sync(&mdp5_kms->pdev->dev); spin_lock_irqsave(&mdp5_crtc->cursor.lock, flags); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_SIZE(lm), - MDP5_LM_CURSOR_SIZE_ROI_H(roi_h) | - MDP5_LM_CURSOR_SIZE_ROI_W(roi_w)); - mdp5_write(mdp5_kms, REG_MDP5_LM_CURSOR_START_XY(lm), - MDP5_LM_CURSOR_START_XY_Y_START(y) | - MDP5_LM_CURSOR_START_XY_X_START(x)); + mdp5_crtc_restore_cursor(crtc); spin_unlock_irqrestore(&mdp5_crtc->cursor.lock, flags); crtc_flush(crtc, flush_mask); -
pm_runtime_put_autosuspend(&mdp5_kms->pdev->dev); + pm_runtime_put_sync(&mdp5_kms->pdev->dev); return 0; } @@ -943,16 +1011,6 @@ static const struct drm_crtc_funcs mdp5_crtc_funcs = { .atomic_print_state = mdp5_crtc_atomic_print_state, }; -static const struct drm_crtc_funcs mdp5_crtc_no_lm_cursor_funcs = { - .set_config = drm_atomic_helper_set_config, - .destroy = mdp5_crtc_destroy, - .page_flip = drm_atomic_helper_page_flip, - .reset = mdp5_crtc_reset, - .atomic_duplicate_state = mdp5_crtc_duplicate_state, - .atomic_destroy_state = mdp5_crtc_destroy_state, - .atomic_print_state = mdp5_crtc_atomic_print_state, -}; - static const struct drm_crtc_helper_funcs mdp5_crtc_helper_funcs = { .mode_set_nofb = mdp5_crtc_mode_set_nofb, .atomic_check = mdp5_crtc_atomic_check, @@ -1121,12 +1179,10 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev, mdp5_crtc->err.irq = mdp5_crtc_err_irq; mdp5_crtc->pp_done.irq = mdp5_crtc_pp_done_irq; - if (cursor_plane) - drm_crtc_init_with_planes(dev, crtc, plane, cursor_plane, - &mdp5_crtc_no_lm_cursor_funcs, NULL); - else - drm_crtc_init_with_planes(dev, crtc, plane, NULL, - &mdp5_crtc_funcs, NULL); + mdp5_crtc->lm_cursor_enabled = cursor_plane ? false : true; + + drm_crtc_init_with_planes(dev, crtc, plane, cursor_plane, + &mdp5_crtc_funcs, NULL); drm_flip_work_init(&mdp5_crtc->unref_cursor_work, "unref cursor", unref_cursor_worker); diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_encoder.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_encoder.c index 5b851380d3f2..36ad3cbe5f79 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_encoder.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_encoder.c @@ -384,7 +384,7 @@ int mdp5_vid_encoder_set_split_display(struct drm_encoder *encoder, mdp5_ctl_pair(mdp5_encoder->ctl, mdp5_slave_enc->ctl, true); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); return 0; } diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c index bb5deb00c899..280e368bc9bb 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_irq.c @@ -54,7 +54,7 @@ void mdp5_irq_preinstall(struct msm_kms *kms) pm_runtime_get_sync(dev); mdp5_write(mdp5_kms, REG_MDP5_INTR_CLEAR, 0xffffffff); mdp5_write(mdp5_kms, REG_MDP5_INTR_EN, 0x00000000); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); } int mdp5_irq_postinstall(struct msm_kms *kms) @@ -72,7 +72,7 @@ int mdp5_irq_postinstall(struct msm_kms *kms) pm_runtime_get_sync(dev); mdp_irq_register(mdp_kms, error_handler); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); return 0; } @@ -84,7 +84,7 @@ void mdp5_irq_uninstall(struct msm_kms *kms) pm_runtime_get_sync(dev); mdp5_write(mdp5_kms, REG_MDP5_INTR_EN, 0x00000000); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); } irqreturn_t mdp5_irq(struct msm_kms *kms) @@ -119,7 +119,7 @@ int mdp5_enable_vblank(struct msm_kms *kms, struct drm_crtc *crtc) pm_runtime_get_sync(dev); mdp_update_vblank_mask(to_mdp_kms(kms), mdp5_crtc_vblank(crtc), true); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); return 0; } @@ -132,5 +132,5 @@ void mdp5_disable_vblank(struct msm_kms *kms, struct drm_crtc *crtc) pm_runtime_get_sync(dev); mdp_update_vblank_mask(to_mdp_kms(kms), mdp5_crtc_vblank(crtc), false); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); } diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c index 7e829a8d1cb1..3e9bba4d6624 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c +++ 
b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c @@ -125,7 +125,7 @@ static void mdp5_complete_commit(struct msm_kms *kms, struct drm_atomic_state *s if (mdp5_kms->smp) mdp5_smp_complete_commit(mdp5_kms->smp, &mdp5_kms->state->smp); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); } static void mdp5_wait_for_crtc_commit_done(struct msm_kms *kms, @@ -496,12 +496,12 @@ static void read_mdp_hw_revision(struct mdp5_kms *mdp5_kms, pm_runtime_get_sync(dev); version = mdp5_read(mdp5_kms, REG_MDP5_HW_VERSION); - pm_runtime_put_autosuspend(dev); + pm_runtime_put_sync(dev); *major = FIELD(version, MDP5_HW_VERSION_MAJOR); *minor = FIELD(version, MDP5_HW_VERSION_MINOR); - DBG("MDP5 version v%d.%d", *major, *minor); + dev_info(dev, "MDP5 version v%d.%d", *major, *minor); } static int get_clk(struct platform_device *pdev, struct clk **clkp, @@ -683,7 +683,7 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev) aspace = NULL;; } - pm_runtime_put_autosuspend(&pdev->dev); + pm_runtime_put_sync(&pdev->dev); ret = modeset_init(mdp5_kms); if (ret) { diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.c index 2bfac3712685..ff52c49095f9 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.c @@ -17,19 +17,20 @@ #include "mdp5_kms.h" -struct mdp5_hw_pipe *mdp5_pipe_assign(struct drm_atomic_state *s, - struct drm_plane *plane, uint32_t caps, uint32_t blkcfg) +int mdp5_pipe_assign(struct drm_atomic_state *s, struct drm_plane *plane, + uint32_t caps, uint32_t blkcfg, + struct mdp5_hw_pipe **hwpipe, + struct mdp5_hw_pipe **r_hwpipe) { struct msm_drm_private *priv = s->dev->dev_private; struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(priv->kms)); struct mdp5_state *state; struct mdp5_hw_pipe_state *old_state, *new_state; - struct mdp5_hw_pipe *hwpipe = NULL; - int i; + int i, j; state = mdp5_get_state(s); if (IS_ERR(state)) - return ERR_CAST(state); + return PTR_ERR(state); /* grab old_state after mdp5_get_state(), since now we hold lock: */ old_state = &mdp5_kms->state->hwpipe; @@ -64,31 +65,67 @@ struct mdp5_hw_pipe *mdp5_pipe_assign(struct drm_atomic_state *s, /* possible candidate, take the one with the * fewest unneeded caps bits set: */ - if (!hwpipe || (hweight_long(cur->caps & ~caps) < - hweight_long(hwpipe->caps & ~caps))) - hwpipe = cur; + if (!(*hwpipe) || (hweight_long(cur->caps & ~caps) < + hweight_long((*hwpipe)->caps & ~caps))) { + bool r_found = false; + + if (r_hwpipe) { + for (j = i + 1; j < mdp5_kms->num_hwpipes; + j++) { + struct mdp5_hw_pipe *r_cur = + mdp5_kms->hwpipes[j]; + + /* reject different types of hwpipes */ + if (r_cur->caps != cur->caps) + continue; + + /* respect priority, eg. 
VIG0 > VIG1 */ + if (cur->pipe > r_cur->pipe) + continue; + + *r_hwpipe = r_cur; + r_found = true; + break; + } + } + + if (!r_hwpipe || r_found) + *hwpipe = cur; + } } - if (!hwpipe) - return ERR_PTR(-ENOMEM); + if (!(*hwpipe)) + return -ENOMEM; + + if (r_hwpipe && !(*r_hwpipe)) + return -ENOMEM; if (mdp5_kms->smp) { int ret; - DBG("%s: alloc SMP blocks", hwpipe->name); + /* We don't support SMP and 2 hwpipes/plane together */ + WARN_ON(r_hwpipe); + + DBG("%s: alloc SMP blocks", (*hwpipe)->name); ret = mdp5_smp_assign(mdp5_kms->smp, &state->smp, - hwpipe->pipe, blkcfg); + (*hwpipe)->pipe, blkcfg); if (ret) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - hwpipe->blkcfg = blkcfg; + (*hwpipe)->blkcfg = blkcfg; } DBG("%s: assign to plane %s for caps %x", - hwpipe->name, plane->name, caps); - new_state->hwpipe_to_plane[hwpipe->idx] = plane; + (*hwpipe)->name, plane->name, caps); + new_state->hwpipe_to_plane[(*hwpipe)->idx] = plane; - return hwpipe; + if (r_hwpipe) { + DBG("%s: assign to right of plane %s for caps %x", + (*r_hwpipe)->name, plane->name, caps); + new_state->hwpipe_to_plane[(*r_hwpipe)->idx] = plane; + } + + return 0; } void mdp5_pipe_release(struct drm_atomic_state *s, struct mdp5_hw_pipe *hwpipe) diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.h b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.h index 924c3e6f9517..bb2b0ac7aa2b 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.h +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_pipe.h @@ -44,9 +44,10 @@ struct mdp5_hw_pipe_state { struct drm_plane *hwpipe_to_plane[SSPP_MAX]; }; -struct mdp5_hw_pipe *__must_check -mdp5_pipe_assign(struct drm_atomic_state *s, struct drm_plane *plane, - uint32_t caps, uint32_t blkcfg); +int mdp5_pipe_assign(struct drm_atomic_state *s, struct drm_plane *plane, + uint32_t caps, uint32_t blkcfg, + struct mdp5_hw_pipe **hwpipe, + struct mdp5_hw_pipe **r_hwpipe); void mdp5_pipe_release(struct drm_atomic_state *s, struct mdp5_hw_pipe *hwpipe); struct mdp5_hw_pipe *mdp5_pipe_init(enum mdp5_pipe pipe, diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c index 4b22ac3413a1..be50445f9901 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_plane.c @@ -31,15 +31,6 @@ static int mdp5_plane_mode_set(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, struct drm_rect *src, struct drm_rect *dest); -static int mdp5_update_cursor_plane_legacy(struct drm_plane *plane, - struct drm_crtc *crtc, - struct drm_framebuffer *fb, - int crtc_x, int crtc_y, - unsigned int crtc_w, unsigned int crtc_h, - uint32_t src_x, uint32_t src_y, - uint32_t src_w, uint32_t src_h, - struct drm_modeset_acquire_ctx *ctx); - static struct mdp5_kms *get_kms(struct drm_plane *plane) { struct msm_drm_private *priv = plane->dev->dev_private; @@ -254,18 +245,6 @@ static const struct drm_plane_funcs mdp5_plane_funcs = { .atomic_print_state = mdp5_plane_atomic_print_state, }; -static const struct drm_plane_funcs mdp5_cursor_plane_funcs = { - .update_plane = mdp5_update_cursor_plane_legacy, - .disable_plane = drm_atomic_helper_disable_plane, - .destroy = mdp5_plane_destroy, - .atomic_set_property = mdp5_plane_atomic_set_property, - .atomic_get_property = mdp5_plane_atomic_get_property, - .reset = mdp5_plane_reset, - .atomic_duplicate_state = mdp5_plane_duplicate_state, - .atomic_destroy_state = mdp5_plane_destroy_state, - .atomic_print_state = mdp5_plane_atomic_print_state, -}; - static int mdp5_plane_prepare_fb(struct drm_plane *plane, struct 
drm_plane_state *new_state) { @@ -414,31 +393,30 @@ static int mdp5_plane_atomic_check_with_state(struct drm_crtc_state *crtc_state, struct mdp5_hw_pipe *old_hwpipe = mdp5_state->hwpipe; struct mdp5_hw_pipe *old_right_hwpipe = mdp5_state->r_hwpipe; - - mdp5_state->hwpipe = mdp5_pipe_assign(state->state, - plane, caps, blkcfg); - if (IS_ERR(mdp5_state->hwpipe)) { - DBG("%s: failed to assign hwpipe!", plane->name); - return PTR_ERR(mdp5_state->hwpipe); + struct mdp5_hw_pipe *new_hwpipe = NULL; + struct mdp5_hw_pipe *new_right_hwpipe = NULL; + + ret = mdp5_pipe_assign(state->state, plane, caps, + blkcfg, &new_hwpipe, + need_right_hwpipe ? + &new_right_hwpipe : NULL); + if (ret) { + DBG("%s: failed to assign hwpipe(s)!", + plane->name); + return ret; } - if (need_right_hwpipe) { - mdp5_state->r_hwpipe = - mdp5_pipe_assign(state->state, plane, - caps, blkcfg); - if (IS_ERR(mdp5_state->r_hwpipe)) { - DBG("%s: failed to assign right hwpipe", - plane->name); - return PTR_ERR(mdp5_state->r_hwpipe); - } - } else { + mdp5_state->hwpipe = new_hwpipe; + if (need_right_hwpipe) + mdp5_state->r_hwpipe = new_right_hwpipe; + else /* * set it to NULL so that the driver knows we * don't have a right hwpipe when committing a * new state */ mdp5_state->r_hwpipe = NULL; - } + mdp5_pipe_release(state->state, old_hwpipe); mdp5_pipe_release(state->state, old_right_hwpipe); @@ -487,11 +465,98 @@ static void mdp5_plane_atomic_update(struct drm_plane *plane, } } +static int mdp5_plane_atomic_async_check(struct drm_plane *plane, + struct drm_plane_state *state) +{ + struct mdp5_plane_state *mdp5_state = to_mdp5_plane_state(state); + struct drm_crtc_state *crtc_state; + struct drm_rect clip; + int min_scale, max_scale; + int ret; + + crtc_state = drm_atomic_get_existing_crtc_state(state->state, + state->crtc); + if (WARN_ON(!crtc_state)) + return -EINVAL; + + if (!crtc_state->active) + return -EINVAL; + + mdp5_state = to_mdp5_plane_state(state); + + /* don't use fast path if we don't have a hwpipe allocated yet */ + if (!mdp5_state->hwpipe) + return -EINVAL; + + /* only allow changing of position (crtc x/y or src x/y) in fast path */ + if (plane->state->crtc != state->crtc || + plane->state->src_w != state->src_w || + plane->state->src_h != state->src_h || + plane->state->crtc_w != state->crtc_w || + plane->state->crtc_h != state->crtc_h || + !plane->state->fb || + plane->state->fb != state->fb) + return -EINVAL; + + clip.x1 = 0; + clip.y1 = 0; + clip.x2 = crtc_state->adjusted_mode.hdisplay; + clip.y2 = crtc_state->adjusted_mode.vdisplay; + min_scale = FRAC_16_16(1, 8); + max_scale = FRAC_16_16(8, 1); + + ret = drm_plane_helper_check_state(state, &clip, min_scale, + max_scale, true, true); + if (ret) + return ret; + + /* + * if the visibility of the plane changes (i.e., if the cursor is + * clipped out completely), we can't take the async path because + * we need to stage/unstage the plane from the Layer Mixer(s). We + * also assign/unassign the hwpipe(s) tied to the plane. We avoid + * taking the fast path for both these reasons. 
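+ *
+ * (Editorial sketch, not part of the commit, assuming the 4.14-era
+ * atomic helpers: a legacy cursor move enters
+ * drm_atomic_helper_update_plane(); drm_atomic_helper_check() - which
+ * msm now uses directly, see msm_drv.c below - runs this
+ * atomic_async_check() hook, and if it returns 0 the state is
+ * flagged state->async_update, which msm_atomic_commit()
+ * short-circuits into drm_atomic_helper_async_commit() and thus
+ * into mdp5_plane_atomic_async_update() below.)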
+ */ + if (state->visible != plane->state->visible) + return -EINVAL; + + return 0; +} + +static void mdp5_plane_atomic_async_update(struct drm_plane *plane, + struct drm_plane_state *new_state) +{ + plane->state->src_x = new_state->src_x; + plane->state->src_y = new_state->src_y; + plane->state->crtc_x = new_state->crtc_x; + plane->state->crtc_y = new_state->crtc_y; + + if (plane_enabled(new_state)) { + struct mdp5_ctl *ctl; + struct mdp5_pipeline *pipeline = + mdp5_crtc_get_pipeline(plane->crtc); + int ret; + + ret = mdp5_plane_mode_set(plane, new_state->crtc, new_state->fb, + &new_state->src, &new_state->dst); + WARN_ON(ret < 0); + + ctl = mdp5_crtc_get_ctl(new_state->crtc); + + mdp5_ctl_commit(ctl, pipeline, mdp5_plane_get_flush(plane)); + } + + *to_mdp5_plane_state(plane->state) = + *to_mdp5_plane_state(new_state); +} + static const struct drm_plane_helper_funcs mdp5_plane_helper_funcs = { .prepare_fb = mdp5_plane_prepare_fb, .cleanup_fb = mdp5_plane_cleanup_fb, .atomic_check = mdp5_plane_atomic_check, .atomic_update = mdp5_plane_atomic_update, + .atomic_async_check = mdp5_plane_atomic_async_check, + .atomic_async_update = mdp5_plane_atomic_async_update, }; static void set_scanout_locked(struct mdp5_kms *mdp5_kms, @@ -996,84 +1061,6 @@ static int mdp5_plane_mode_set(struct drm_plane *plane, return ret; } -static int mdp5_update_cursor_plane_legacy(struct drm_plane *plane, - struct drm_crtc *crtc, struct drm_framebuffer *fb, - int crtc_x, int crtc_y, - unsigned int crtc_w, unsigned int crtc_h, - uint32_t src_x, uint32_t src_y, - uint32_t src_w, uint32_t src_h, - struct drm_modeset_acquire_ctx *ctx) -{ - struct drm_plane_state *plane_state, *new_plane_state; - struct mdp5_plane_state *mdp5_pstate; - struct drm_crtc_state *crtc_state = crtc->state; - int ret; - - if (!crtc_state->active || drm_atomic_crtc_needs_modeset(crtc_state)) - goto slow; - - plane_state = plane->state; - mdp5_pstate = to_mdp5_plane_state(plane_state); - - /* don't use fast path if we don't have a hwpipe allocated yet */ - if (!mdp5_pstate->hwpipe) - goto slow; - - /* only allow changing of position(crtc x/y or src x/y) in fast path */ - if (plane_state->crtc != crtc || - plane_state->src_w != src_w || - plane_state->src_h != src_h || - plane_state->crtc_w != crtc_w || - plane_state->crtc_h != crtc_h || - !plane_state->fb || - plane_state->fb != fb) - goto slow; - - new_plane_state = mdp5_plane_duplicate_state(plane); - if (!new_plane_state) - return -ENOMEM; - - new_plane_state->src_x = src_x; - new_plane_state->src_y = src_y; - new_plane_state->src_w = src_w; - new_plane_state->src_h = src_h; - new_plane_state->crtc_x = crtc_x; - new_plane_state->crtc_y = crtc_y; - new_plane_state->crtc_w = crtc_w; - new_plane_state->crtc_h = crtc_h; - - ret = mdp5_plane_atomic_check_with_state(crtc_state, new_plane_state); - if (ret) - goto slow_free; - - if (new_plane_state->visible) { - struct mdp5_ctl *ctl; - struct mdp5_pipeline *pipeline = mdp5_crtc_get_pipeline(crtc); - - ret = mdp5_plane_mode_set(plane, crtc, fb, - &new_plane_state->src, - &new_plane_state->dst); - WARN_ON(ret < 0); - - ctl = mdp5_crtc_get_ctl(crtc); - - mdp5_ctl_commit(ctl, pipeline, mdp5_plane_get_flush(plane)); - } - - *to_mdp5_plane_state(plane_state) = - *to_mdp5_plane_state(new_plane_state); - - mdp5_plane_destroy_state(plane, new_plane_state); - - return 0; -slow_free: - mdp5_plane_destroy_state(plane, new_plane_state); -slow: - return drm_atomic_helper_update_plane(plane, crtc, fb, - crtc_x, crtc_y, crtc_w, crtc_h, - src_x, src_y, src_w, src_h, 
ctx); -} - /* * Use this func and the one below only after the atomic state has been * successfully swapped @@ -1133,16 +1120,9 @@ struct drm_plane *mdp5_plane_init(struct drm_device *dev, mdp5_plane->nformats = mdp_get_formats(mdp5_plane->formats, ARRAY_SIZE(mdp5_plane->formats), false); - if (type == DRM_PLANE_TYPE_CURSOR) - ret = drm_universal_plane_init(dev, plane, 0xff, - &mdp5_cursor_plane_funcs, - mdp5_plane->formats, mdp5_plane->nformats, - NULL, type, NULL); - else - ret = drm_universal_plane_init(dev, plane, 0xff, - &mdp5_plane_funcs, - mdp5_plane->formats, mdp5_plane->nformats, - NULL, type, NULL); + ret = drm_universal_plane_init(dev, plane, 0xff, &mdp5_plane_funcs, + mdp5_plane->formats, mdp5_plane->nformats, + NULL, type, NULL); if (ret) goto fail; diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c index 025d454163b0..bf5f8c39f34d 100644 --- a/drivers/gpu/drm/msm/msm_atomic.c +++ b/drivers/gpu/drm/msm/msm_atomic.c @@ -146,35 +146,6 @@ static void commit_worker(struct work_struct *work) complete_commit(container_of(work, struct msm_commit, work), true); } -/* - * this func is identical to the drm_atomic_helper_check, but we keep this - * because we might eventually need to have a more finegrained check - * sequence without using the atomic helpers. - * - * In the past, we first called drm_atomic_helper_check_planes, and then - * drm_atomic_helper_check_modeset. We needed this because the MDP5 plane's - * ->atomic_check could update ->mode_changed for pixel format changes. - * This, however isn't needed now because if there is a pixel format change, - * we just assign a new hwpipe for it with a new SMP allocation. We might - * eventually hit a condition where we would need to do a full modeset if - * we run out of planes. There, we'd probably need to set mode_changed. - */ -int msm_atomic_check(struct drm_device *dev, - struct drm_atomic_state *state) -{ - int ret; - - ret = drm_atomic_helper_check_modeset(dev, state); - if (ret) - return ret; - - ret = drm_atomic_helper_check_planes(dev, state); - if (ret) - return ret; - - return ret; -} - /** * drm_atomic_helper_commit - commit validated state object * @dev: DRM device @@ -202,6 +173,18 @@ int msm_atomic_commit(struct drm_device *dev, if (ret) return ret; + /* + * Note that plane->atomic_async_check() should fail if we need + * to re-assign hwpipe or anything that touches global atomic + * state, so we'll never go down the async update path in those + * cases. + */ + if (state->async_update) { + drm_atomic_helper_async_commit(dev, state); + drm_atomic_helper_cleanup_planes(dev, state); + return 0; + } + c = commit_init(state); if (!c) { ret = -ENOMEM; diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 606df7bea97b..0a3ea3034e39 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -29,9 +29,12 @@ * - 1.0.0 - initial interface * - 1.1.0 - adds madvise, and support for submits with > 4 cmd buffers * - 1.2.0 - adds explicit fence support for submit ioctl + * - 1.3.0 - adds GMEM_BASE + NR_RINGS params, SUBMITQUEUE_NEW + + * SUBMITQUEUE_CLOSE ioctls, and MSM_INFO_IOVA flag for + * MSM_GEM_INFO ioctl. 
*/ #define MSM_VERSION_MAJOR 1 -#define MSM_VERSION_MINOR 2 +#define MSM_VERSION_MINOR 3 #define MSM_VERSION_PATCHLEVEL 0 static void msm_fb_output_poll_changed(struct drm_device *dev) @@ -44,7 +47,7 @@ static void msm_fb_output_poll_changed(struct drm_device *dev) static const struct drm_mode_config_funcs mode_config_funcs = { .fb_create = msm_framebuffer_create, .output_poll_changed = msm_fb_output_poll_changed, - .atomic_check = msm_atomic_check, + .atomic_check = drm_atomic_helper_check, .atomic_commit = msm_atomic_commit, .atomic_state_alloc = msm_atomic_state_alloc, .atomic_state_clear = msm_atomic_state_clear, @@ -211,7 +214,6 @@ static int msm_drm_uninit(struct device *dev) struct drm_device *ddev = platform_get_drvdata(pdev); struct msm_drm_private *priv = ddev->dev_private; struct msm_kms *kms = priv->kms; - struct msm_gpu *gpu = priv->gpu; struct msm_vblank_ctrl *vbl_ctrl = &priv->vblank_ctrl; struct vblank_event *vbl_ev, *tmp; @@ -253,15 +255,6 @@ static int msm_drm_uninit(struct device *dev) if (kms && kms->funcs) kms->funcs->destroy(kms); - if (gpu) { - mutex_lock(&ddev->struct_mutex); - // XXX what do we do here? - //pm_runtime_enable(&pdev->dev); - gpu->funcs->pm_suspend(gpu); - mutex_unlock(&ddev->struct_mutex); - gpu->funcs->destroy(gpu); - } - if (priv->vram.paddr) { unsigned long attrs = DMA_ATTR_NO_KERNEL_MAPPING; drm_mm_takedown(&priv->vram.mm); @@ -514,24 +507,37 @@ static void load_gpu(struct drm_device *dev) mutex_unlock(&init_lock); } -static int msm_open(struct drm_device *dev, struct drm_file *file) +static int context_init(struct drm_device *dev, struct drm_file *file) { struct msm_file_private *ctx; - /* For now, load gpu on open.. to avoid the requirement of having - * firmware in the initrd. - */ - load_gpu(dev); - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; + msm_submitqueue_init(dev, ctx); + file->driver_priv = ctx; return 0; } +static int msm_open(struct drm_device *dev, struct drm_file *file) +{ + /* For now, load gpu on open.. to avoid the requirement of having + * firmware in the initrd. 
+ */ + load_gpu(dev); + + return context_init(dev, file); +} + +static void context_close(struct msm_file_private *ctx) +{ + msm_submitqueue_close(ctx); + kfree(ctx); +} + static void msm_postclose(struct drm_device *dev, struct drm_file *file) { struct msm_drm_private *priv = dev->dev_private; @@ -542,7 +548,7 @@ static void msm_postclose(struct drm_device *dev, struct drm_file *file) priv->lastctx = NULL; mutex_unlock(&dev->struct_mutex); - kfree(ctx); + context_close(ctx); } static void msm_lastclose(struct drm_device *dev) @@ -737,16 +743,27 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, struct msm_drm_private *priv = dev->dev_private; struct drm_msm_wait_fence *args = data; ktime_t timeout = to_ktime(args->timeout); + struct msm_gpu_submitqueue *queue; + struct msm_gpu *gpu = priv->gpu; + int ret; if (args->pad) { DRM_ERROR("invalid pad: %08x\n", args->pad); return -EINVAL; } - if (!priv->gpu) + if (!gpu) return 0; - return msm_wait_fence(priv->gpu->fctx, args->fence, &timeout, true); + queue = msm_submitqueue_get(file->driver_priv, args->queueid); + if (!queue) + return -ENOENT; + + ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout, + true); + + msm_submitqueue_put(queue); + return ret; } static int msm_ioctl_gem_madvise(struct drm_device *dev, void *data, @@ -787,6 +804,28 @@ unlock: return ret; } + +static int msm_ioctl_submitqueue_new(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct drm_msm_submitqueue *args = data; + + if (args->flags & ~MSM_SUBMITQUEUE_FLAGS) + return -EINVAL; + + return msm_submitqueue_create(dev, file->driver_priv, args->prio, + args->flags, &args->id); +} + + +static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data, + struct drm_file *file) +{ + u32 id = *(u32 *) data; + + return msm_submitqueue_remove(file->driver_priv, id); +} + static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_AUTH|DRM_RENDER_ALLOW), @@ -796,6 +835,8 @@ static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_GEM_SUBMIT, msm_ioctl_gem_submit, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_WAIT_FENCE, msm_ioctl_wait_fence, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_GEM_MADVISE, msm_ioctl_gem_madvise, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW, msm_ioctl_submitqueue_new, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct vm_operations_struct vm_ops = { diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 5e8109c07560..c646843d8822 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -56,11 +56,9 @@ struct msm_gem_address_space; struct msm_gem_vma; struct msm_file_private { - /* currently we don't do anything useful with this.. but when - * per-context address spaces are supported we'd keep track of - * the context's page-tables here. 
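(For illustration only - not part of the patch: userspace would drive the two new ioctls roughly as below. struct drm_msm_submitqueue and the ioctl numbers come from the uapi msm_drm.h additions elsewhere in this series, so treat those names as assumed here.

	#include <stdint.h>
	#include <xf86drm.h>
	#include "msm_drm.h"

	static int open_queue(int fd, uint32_t prio, uint32_t *id)
	{
		struct drm_msm_submitqueue req = {
			.flags = 0,
			.prio = prio,	/* selects gpu->rb[prio] */
		};

		if (drmIoctl(fd, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req))
			return -1;

		*id = req.id;	/* id 0 is the pre-created default queue */
		return 0;
	}

	static int close_queue(int fd, uint32_t id)
	{
		return drmIoctl(fd, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &id);
	}

The returned id is what drm_msm_gem_submit and drm_msm_wait_fence carry in their queueid field, matching the msm_submitqueue_get() lookups above.)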
- */ - int dummy; + rwlock_t queuelock; + struct list_head submitqueues; + int queueid; }; enum msm_mdp_plane_property { @@ -76,6 +74,8 @@ struct msm_vblank_ctrl { spinlock_t lock; }; +#define MSM_GPU_MAX_RINGS 4 + struct msm_drm_private { struct drm_device *dev; @@ -108,7 +108,8 @@ struct msm_drm_private { struct drm_fb_helper *fbdev; - struct msm_rd_state *rd; + struct msm_rd_state *rd; /* debugfs to dump all submits */ + struct msm_rd_state *hangrd; /* debugfs to dump hanging submits */ struct msm_perf_state *perf; /* list of GEM objects: */ @@ -154,20 +155,12 @@ struct msm_drm_private { struct shrinker shrinker; struct msm_vblank_ctrl vblank_ctrl; - - /* task holding struct_mutex.. currently only used in submit path - * to detect and reject faults from copy_from_user() for submit - * ioctl. - */ - struct task_struct *struct_mutex_task; }; struct msm_format { uint32_t pixel_format; }; -int msm_atomic_check(struct drm_device *dev, - struct drm_atomic_state *state); int msm_atomic_commit(struct drm_device *dev, struct drm_atomic_state *state, bool nonblock); struct drm_atomic_state *msm_atomic_state_alloc(struct drm_device *dev); @@ -219,6 +212,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, int msm_gem_prime_pin(struct drm_gem_object *obj); void msm_gem_prime_unpin(struct drm_gem_object *obj); void *msm_gem_get_vaddr(struct drm_gem_object *obj); +void *msm_gem_get_vaddr_active(struct drm_gem_object *obj); void msm_gem_put_vaddr(struct drm_gem_object *obj); int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); int msm_gem_sync_object(struct drm_gem_object *obj, @@ -303,7 +297,8 @@ void msm_framebuffer_describe(struct drm_framebuffer *fb, struct seq_file *m); int msm_debugfs_late_init(struct drm_device *dev); int msm_rd_debugfs_init(struct drm_minor *minor); void msm_rd_debugfs_cleanup(struct msm_drm_private *priv); -void msm_rd_dump_submit(struct msm_gem_submit *submit); +void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit, + const char *fmt, ...); int msm_perf_debugfs_init(struct drm_minor *minor); void msm_perf_debugfs_cleanup(struct msm_drm_private *priv); #else @@ -319,6 +314,18 @@ void __iomem *msm_ioremap(struct platform_device *pdev, const char *name, void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); +struct msm_gpu_submitqueue; +int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); +struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, + u32 id); +int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, + u32 prio, u32 flags, u32 *id); +int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id); +void msm_submitqueue_close(struct msm_file_private *ctx); + +void msm_submitqueue_destroy(struct kref *kref); + + #define DBG(fmt, ...) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) #define VERB(fmt, ...) 
if (0) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__) diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index a2f89bac9c16..349c12f670eb 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -31,7 +31,7 @@ msm_fence_context_alloc(struct drm_device *dev, const char *name) return ERR_PTR(-ENOMEM); fctx->dev = dev; - fctx->name = name; + strncpy(fctx->name, name, sizeof(fctx->name)); fctx->context = dma_fence_context_alloc(1); init_waitqueue_head(&fctx->event); spin_lock_init(&fctx->spinlock); diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index 56061aa1959d..1aa6a4c6530c 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/drm/msm/msm_fence.h @@ -22,7 +22,7 @@ struct msm_fence_context { struct drm_device *dev; - const char *name; + char name[32]; unsigned context; /* last_fence == completed_fence --> no pending work */ uint32_t last_fence; /* last assigned fence */ diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f15821a0d900..81fe6d6740ce 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -470,14 +470,16 @@ fail: return ret; } -void *msm_gem_get_vaddr(struct drm_gem_object *obj) +static void *get_vaddr(struct drm_gem_object *obj, unsigned madv) { struct msm_gem_object *msm_obj = to_msm_bo(obj); int ret = 0; mutex_lock(&msm_obj->lock); - if (WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) { + if (WARN_ON(msm_obj->madv > madv)) { + dev_err(obj->dev->dev, "Invalid madv state: %u vs %u\n", + msm_obj->madv, madv); mutex_unlock(&msm_obj->lock); return ERR_PTR(-EBUSY); } @@ -513,6 +515,22 @@ fail: return ERR_PTR(ret); } +void *msm_gem_get_vaddr(struct drm_gem_object *obj) +{ + return get_vaddr(obj, MSM_MADV_WILLNEED); +} + +/* + * Don't use this! It is for the very special case of dumping + * submits from GPU hangs or faults, where the bo may already + * be MSM_MADV_DONTNEED, but we know the buffer is still on the + * active list. + */ +void *msm_gem_get_vaddr_active(struct drm_gem_object *obj) +{ + return get_vaddr(obj, __MSM_MADV_PURGED); +} + void msm_gem_put_vaddr(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj); @@ -610,17 +628,6 @@ int msm_gem_sync_object(struct drm_gem_object *obj, struct dma_fence *fence; int i, ret; - if (!exclusive) { - /* NOTE: _reserve_shared() must happen before _add_shared_fence(), - * which makes this a slightly strange place to call it. OTOH this - * is a convenient can-fail point to hook it in. (And similar to - * how etnaviv and nouveau handle this.) 
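(Aside on the get_vaddr() change above: the madv comparison leans on the madvise states being ordered. Assuming the values from the uapi header and msm_gem.h - MSM_MADV_WILLNEED = 0, MSM_MADV_DONTNEED = 1, kernel-internal __MSM_MADV_PURGED = 2 - the two wrappers behave as:

	msm_gem_get_vaddr(obj);		/* WILLNEED objects only */
	msm_gem_get_vaddr_active(obj);	/* WILLNEED or DONTNEED, i.e.
					 * anything not yet purged; for
					 * the hang-dump path only */

A purged object fails either way, since its backing pages are already gone.)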
- */ - ret = reservation_object_reserve_shared(msm_obj->resv); - if (ret) - return ret; - } - fobj = reservation_object_get_list(msm_obj->resv); if (!fobj || (fobj->shared_count == 0)) { fence = reservation_object_get_excl(msm_obj->resv); @@ -1045,10 +1052,10 @@ static void *_msm_gem_kernel_new(struct drm_device *dev, uint32_t size, } vaddr = msm_gem_get_vaddr(obj); - if (!vaddr) { + if (IS_ERR(vaddr)) { msm_gem_put_iova(obj, aspace); drm_gem_object_unreference(obj); - return ERR_PTR(-ENOMEM); + return ERR_CAST(vaddr); } if (bo) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 91c210d2359c..9320e184b48d 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -138,12 +138,15 @@ void msm_gem_vunmap(struct drm_gem_object *obj, enum msm_gem_lock subclass); struct msm_gem_submit { struct drm_device *dev; struct msm_gpu *gpu; - struct list_head node; /* node in gpu submit_list */ + struct list_head node; /* node in ring submit list */ struct list_head bo_list; struct ww_acquire_ctx ticket; + uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *fence; + struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */ bool valid; /* true if no cmdstream patching needed */ + struct msm_ringbuffer *ring; unsigned int nr_cmds; unsigned int nr_bos; struct { diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 5d0a75d4b249..b8dc8f96caf2 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -31,7 +31,8 @@ #define BO_PINNED 0x2000 static struct msm_gem_submit *submit_create(struct drm_device *dev, - struct msm_gpu *gpu, uint32_t nr_bos, uint32_t nr_cmds) + struct msm_gpu *gpu, struct msm_gpu_submitqueue *queue, + uint32_t nr_bos, uint32_t nr_cmds) { struct msm_gem_submit *submit; uint64_t sz = sizeof(*submit) + ((u64)nr_bos * sizeof(submit->bos[0])) + @@ -49,6 +50,8 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->fence = NULL; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; + submit->queue = queue; + submit->ring = gpu->rb[queue->prio]; /* initially, until copy_from_user() and bo lookup succeeds: */ submit->nr_bos = 0; @@ -66,6 +69,8 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) dma_fence_put(submit->fence); list_del(&submit->node); put_pid(submit->pid); + msm_submitqueue_put(submit->queue); + kfree(submit); } @@ -156,7 +161,8 @@ out: return ret; } -static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) +static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, + int i, bool backoff) { struct msm_gem_object *msm_obj = submit->bos[i].obj; @@ -166,7 +172,7 @@ static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) if (submit->bos[i].flags & BO_LOCKED) ww_mutex_unlock(&msm_obj->resv->lock); - if (!(submit->bos[i].flags & BO_VALID)) + if (backoff && !(submit->bos[i].flags & BO_VALID)) submit->bos[i].iova = 0; submit->bos[i].flags &= ~(BO_LOCKED | BO_PINNED); @@ -201,10 +207,10 @@ retry: fail: for (; i >= 0; i--) - submit_unlock_unpin_bo(submit, i); + submit_unlock_unpin_bo(submit, i, true); if (slow_locked > 0) - submit_unlock_unpin_bo(submit, slow_locked); + submit_unlock_unpin_bo(submit, slow_locked, true); if (ret == -EDEADLK) { struct msm_gem_object *msm_obj = submit->bos[contended].obj; @@ -221,7 +227,7 @@ fail: return ret; } -static int submit_fence_sync(struct msm_gem_submit *submit) +static int 
submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) { int i, ret = 0; @@ -229,7 +235,22 @@ static int submit_fence_sync(struct msm_gem_submit *submit) struct msm_gem_object *msm_obj = submit->bos[i].obj; bool write = submit->bos[i].flags & MSM_SUBMIT_BO_WRITE; - ret = msm_gem_sync_object(&msm_obj->base, submit->gpu->fctx, write); + if (!write) { + /* NOTE: _reserve_shared() must happen before + * _add_shared_fence(), which makes this a slightly + * strange place to call it. OTOH this is a + * convenient can-fail point to hook it in. + */ + ret = reservation_object_reserve_shared(msm_obj->resv); + if (ret) + return ret; + } + + if (no_implicit) + continue; + + ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx, + write); if (ret) break; } @@ -373,7 +394,7 @@ static void submit_cleanup(struct msm_gem_submit *submit) for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; - submit_unlock_unpin_bo(submit, i); + submit_unlock_unpin_bo(submit, i, false); list_del_init(&msm_obj->submit_entry); drm_gem_object_unreference(&msm_obj->base); } @@ -391,6 +412,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_gpu *gpu = priv->gpu; struct dma_fence *in_fence = NULL; struct sync_file *sync_file = NULL; + struct msm_gpu_submitqueue *queue; + struct msm_ringbuffer *ring; int out_fence_fd = -1; unsigned i; int ret; @@ -407,6 +430,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (MSM_PIPE_FLAGS(args->flags) & ~MSM_SUBMIT_FLAGS) return -EINVAL; + queue = msm_submitqueue_get(ctx, args->queueid); + if (!queue) + return -ENOENT; + + ring = gpu->rb[queue->prio]; + if (args->flags & MSM_SUBMIT_FENCE_FD_IN) { in_fence = sync_file_get_fence(args->fence_fd); @@ -417,7 +446,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, * Wait if the fence is from a foreign context, or if the fence * array contains any fence from a foreign context. 
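* (With per-ring fence contexts, "foreign" now also covers a different ring of the same GPU: a fence produced on a prio-1 queue and passed as the in-fence of a prio-0 submit fails dma_fence_match_context() below and is waited on the CPU, since this series does not resolve cross-ring dependencies in hardware.)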
*/ - if (!dma_fence_match_context(in_fence, gpu->fctx->context)) { + if (!dma_fence_match_context(in_fence, ring->fctx->context)) { ret = dma_fence_wait(in_fence, true); if (ret) return ret; @@ -435,9 +464,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, goto out_unlock; } } - priv->struct_mutex_task = current; - submit = submit_create(dev, gpu, args->nr_bos, args->nr_cmds); + submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds); if (!submit) { ret = -ENOMEM; goto out_unlock; @@ -451,11 +479,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (ret) goto out; - if (!(args->flags & MSM_SUBMIT_NO_IMPLICIT)) { - ret = submit_fence_sync(submit); - if (ret) - goto out; - } + ret = submit_fence_sync(submit, !!(args->flags & MSM_SUBMIT_NO_IMPLICIT)); + if (ret) + goto out; ret = submit_pin_objects(submit); if (ret) @@ -522,7 +548,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit->nr_cmds = i; - submit->fence = msm_fence_alloc(gpu->fctx); + submit->fence = msm_fence_alloc(ring->fctx); if (IS_ERR(submit->fence)) { ret = PTR_ERR(submit->fence); submit->fence = NULL; @@ -555,7 +581,6 @@ out: out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - priv->struct_mutex_task = NULL; mutex_unlock(&dev->struct_mutex); return ret; } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index ffbff27600e0..8d4477818ec2 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -20,6 +20,8 @@ #include "msm_mmu.h" #include "msm_fence.h" +#include <linux/string_helpers.h> + /* * Power Management: @@ -221,33 +223,102 @@ int msm_gpu_hw_init(struct msm_gpu *gpu) * Hangcheck detection for locked gpu: */ +static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, + uint32_t fence) +{ + struct msm_gem_submit *submit; + + list_for_each_entry(submit, &ring->submits, node) { + if (submit->seqno > fence) + break; + + msm_update_fence(submit->ring->fctx, + submit->fence->seqno); + } +} + +static struct msm_gem_submit * +find_submit(struct msm_ringbuffer *ring, uint32_t fence) +{ + struct msm_gem_submit *submit; + + WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex)); + + list_for_each_entry(submit, &ring->submits, node) + if (submit->seqno == fence) + return submit; + + return NULL; +} + static void retire_submits(struct msm_gpu *gpu); static void recover_worker(struct work_struct *work) { struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work); struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; struct msm_gem_submit *submit; - uint32_t fence = gpu->funcs->last_fence(gpu); - - msm_update_fence(gpu->fctx, fence + 1); + struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); + int i; mutex_lock(&dev->struct_mutex); dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); - list_for_each_entry(submit, &gpu->submit_list, node) { - if (submit->fence->seqno == (fence + 1)) { - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(submit->pid, PIDTYPE_PID); - if (task) { - dev_err(dev->dev, "%s: offending task: %s\n", - gpu->name, task->comm); - } - rcu_read_unlock(); - break; + + submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1); + if (submit) { + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(submit->pid, PIDTYPE_PID); + if (task) { + char *cmd; + + /* + * So slightly annoying, in other paths like + * mmap'ing gem buffers, mmap_sem is acquired + * before struct_mutex, which 
means we can't + * hold struct_mutex across the call to + * get_cmdline(). But submits are retired + * from the same in-order workqueue, so we can + * safely drop the lock here without worrying + * about the submit going away. + */ + mutex_unlock(&dev->struct_mutex); + cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL); + mutex_lock(&dev->struct_mutex); + + dev_err(dev->dev, "%s: offending task: %s (%s)\n", + gpu->name, task->comm, cmd); + + msm_rd_dump_submit(priv->hangrd, submit, + "offending task: %s (%s)", task->comm, cmd); + } else { + msm_rd_dump_submit(priv->hangrd, submit, NULL); } + rcu_read_unlock(); + } + + + /* + * Update all the rings with the latest and greatest fence.. this + * needs to happen after msm_rd_dump_submit() to ensure that the + * bo's referenced by the offending submit are still around. + */ + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + uint32_t fence = ring->memptrs->fence; + + /* + * For the current (faulting?) ring/submit advance the fence by + * one more to clear the faulting submit + */ + if (ring == cur_ring) + fence++; + + update_fences(gpu, ring, fence); } if (msm_gpu_active(gpu)) { @@ -258,9 +329,15 @@ static void recover_worker(struct work_struct *work) gpu->funcs->recover(gpu); pm_runtime_put_sync(&gpu->pdev->dev); - /* replay the remaining submits after the one that hung: */ - list_for_each_entry(submit, &gpu->submit_list, node) { - gpu->funcs->submit(gpu, submit, NULL); + /* + * Replay all remaining submits starting with highest priority + * ring + */ + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + list_for_each_entry(submit, &ring->submits, node) + gpu->funcs->submit(gpu, submit, NULL); } } @@ -281,25 +358,27 @@ static void hangcheck_handler(unsigned long data) struct msm_gpu *gpu = (struct msm_gpu *)data; struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; - uint32_t fence = gpu->funcs->last_fence(gpu); + struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); + uint32_t fence = ring->memptrs->fence; - if (fence != gpu->hangcheck_fence) { + if (fence != ring->hangcheck_fence) { /* some progress has been made.. ya! */ - gpu->hangcheck_fence = fence; - } else if (fence < gpu->fctx->last_fence) { + ring->hangcheck_fence = fence; + } else if (fence < ring->seqno) { /* no progress and not done.. hung! 
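(that is, ring->memptrs->fence stopped advancing while ring->seqno, the last fence handed out on this ring, is still ahead - the per-ring analogue of the old gpu->fctx->last_fence check)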
*/ - gpu->hangcheck_fence = fence; - dev_err(dev->dev, "%s: hangcheck detected gpu lockup!\n", - gpu->name); + ring->hangcheck_fence = fence; + dev_err(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", + gpu->name, ring->id); dev_err(dev->dev, "%s: completed fence: %u\n", gpu->name, fence); dev_err(dev->dev, "%s: submitted fence: %u\n", - gpu->name, gpu->fctx->last_fence); + gpu->name, ring->seqno); + queue_work(priv->wq, &gpu->recover_work); } /* if still more pending work, reset the hangcheck timer: */ - if (gpu->fctx->last_fence > gpu->hangcheck_fence) + if (ring->seqno > ring->hangcheck_fence) hangcheck_timer_reset(gpu); /* workaround for missing irq: */ @@ -428,19 +507,18 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) static void retire_submits(struct msm_gpu *gpu) { struct drm_device *dev = gpu->dev; + struct msm_gem_submit *submit, *tmp; + int i; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - while (!list_empty(&gpu->submit_list)) { - struct msm_gem_submit *submit; + /* Retire the commits starting with highest priority */ + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; - submit = list_first_entry(&gpu->submit_list, - struct msm_gem_submit, node); - - if (dma_fence_is_signaled(submit->fence)) { - retire_submit(gpu, submit); - } else { - break; + list_for_each_entry_safe(submit, tmp, &ring->submits, node) { + if (dma_fence_is_signaled(submit->fence)) + retire_submit(gpu, submit); } } } @@ -449,9 +527,10 @@ static void retire_worker(struct work_struct *work) { struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); struct drm_device *dev = gpu->dev; - uint32_t fence = gpu->funcs->last_fence(gpu); + int i; - msm_update_fence(gpu->fctx, fence); + for (i = 0; i < gpu->nr_rings; i++) + update_fences(gpu, gpu->rb[i], gpu->rb[i]->memptrs->fence); mutex_lock(&dev->struct_mutex); retire_submits(gpu); @@ -472,6 +551,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; + struct msm_ringbuffer *ring = submit->ring; int i; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); @@ -480,9 +560,11 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, msm_gpu_hw_init(gpu); - list_add_tail(&submit->node, &gpu->submit_list); + submit->seqno = ++ring->seqno; - msm_rd_dump_submit(submit); + list_add_tail(&submit->node, &ring->submits); + + msm_rd_dump_submit(priv->rd, submit, NULL); update_sw_cntrs(gpu); @@ -605,7 +687,9 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, const char *name, struct msm_gpu_config *config) { - int ret; + int i, ret, nr_rings = config->nr_rings; + void *memptrs; + uint64_t memptrs_iova; if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs))) gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs); @@ -613,18 +697,11 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, gpu->dev = drm; gpu->funcs = funcs; gpu->name = name; - gpu->fctx = msm_fence_context_alloc(drm, name); - if (IS_ERR(gpu->fctx)) { - ret = PTR_ERR(gpu->fctx); - gpu->fctx = NULL; - goto fail; - } INIT_LIST_HEAD(&gpu->active_list); INIT_WORK(&gpu->retire_work, retire_worker); INIT_WORK(&gpu->recover_work, recover_worker); - INIT_LIST_HEAD(&gpu->submit_list); setup_timer(&gpu->hangcheck_timer, hangcheck_handler, (unsigned long)gpu); @@ -689,36 +766,79 @@ int msm_gpu_init(struct drm_device *drm, struct 
platform_device *pdev, goto fail; } - /* Create ringbuffer: */ - gpu->rb = msm_ringbuffer_new(gpu, config->ringsz); - if (IS_ERR(gpu->rb)) { - ret = PTR_ERR(gpu->rb); - gpu->rb = NULL; - dev_err(drm->dev, "could not create ringbuffer: %d\n", ret); + memptrs = msm_gem_kernel_new(drm, sizeof(*gpu->memptrs_bo), + MSM_BO_UNCACHED, gpu->aspace, &gpu->memptrs_bo, + &memptrs_iova); + + if (IS_ERR(memptrs)) { + ret = PTR_ERR(memptrs); + dev_err(drm->dev, "could not allocate memptrs: %d\n", ret); goto fail; } + if (nr_rings > ARRAY_SIZE(gpu->rb)) { + DRM_DEV_INFO_ONCE(drm->dev, "Only creating %zu ringbuffers\n", + ARRAY_SIZE(gpu->rb)); + nr_rings = ARRAY_SIZE(gpu->rb); + } + + /* Create ringbuffer(s): */ + for (i = 0; i < nr_rings; i++) { + gpu->rb[i] = msm_ringbuffer_new(gpu, i, memptrs, memptrs_iova); + + if (IS_ERR(gpu->rb[i])) { + ret = PTR_ERR(gpu->rb[i]); + dev_err(drm->dev, + "could not create ringbuffer %d: %d\n", i, ret); + goto fail; + } + + memptrs += sizeof(struct msm_rbmemptrs); + memptrs_iova += sizeof(struct msm_rbmemptrs); + } + + gpu->nr_rings = nr_rings; + return 0; fail: + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + msm_ringbuffer_destroy(gpu->rb[i]); + gpu->rb[i] = NULL; + } + + if (gpu->memptrs_bo) { + msm_gem_put_vaddr(gpu->memptrs_bo); + msm_gem_put_iova(gpu->memptrs_bo, gpu->aspace); + drm_gem_object_unreference_unlocked(gpu->memptrs_bo); + } + platform_set_drvdata(pdev, NULL); return ret; } void msm_gpu_cleanup(struct msm_gpu *gpu) { + int i; + DBG("%s", gpu->name); WARN_ON(!list_empty(&gpu->active_list)); bs_fini(gpu); - if (gpu->rb) { - if (gpu->rb_iova) - msm_gem_put_iova(gpu->rb->bo, gpu->aspace); - msm_ringbuffer_destroy(gpu->rb); + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + msm_ringbuffer_destroy(gpu->rb[i]); + gpu->rb[i] = NULL; } - if (gpu->aspace) { + + if (gpu->memptrs_bo) { + msm_gem_put_vaddr(gpu->memptrs_bo); + msm_gem_put_iova(gpu->memptrs_bo, gpu->aspace); + drm_gem_object_unreference_unlocked(gpu->memptrs_bo); + } + + if (!IS_ERR_OR_NULL(gpu->aspace)) { gpu->aspace->mmu->funcs->detach(gpu->aspace->mmu, NULL, 0); msm_gem_address_space_put(gpu->aspace); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index df4e2771fb85..e113d64574d3 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -33,7 +33,7 @@ struct msm_gpu_config { const char *irqname; uint64_t va_start; uint64_t va_end; - unsigned int ringsz; + unsigned int nr_rings; }; /* So far, with hardware that I've seen to date, we can have: @@ -57,9 +57,9 @@ struct msm_gpu_funcs { int (*pm_resume)(struct msm_gpu *gpu); void (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx); - void (*flush)(struct msm_gpu *gpu); + void (*flush)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); irqreturn_t (*irq)(struct msm_gpu *irq); - uint32_t (*last_fence)(struct msm_gpu *gpu); + struct msm_ringbuffer *(*active_ring)(struct msm_gpu *gpu); void (*recover)(struct msm_gpu *gpu); void (*destroy)(struct msm_gpu *gpu); #ifdef CONFIG_DEBUG_FS @@ -86,16 +86,12 @@ struct msm_gpu { const struct msm_gpu_perfcntr *perfcntrs; uint32_t num_perfcntrs; - /* ringbuffer: */ - struct msm_ringbuffer *rb; - uint64_t rb_iova; + struct msm_ringbuffer *rb[MSM_GPU_MAX_RINGS]; + int nr_rings; /* list of GEM active objects: */ struct list_head active_list; - /* fencing: */ - struct msm_fence_context *fctx; - /* does gpu need hw_init? 
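(Aside on the memptrs allocation in msm_gpu_init() above: a single BO is shared by all rings, each msm_ringbuffer_new() call receiving its memptrs/memptrs_iova slice advanced by sizeof(struct msm_rbmemptrs). Note the size passed is sizeof(*gpu->memptrs_bo), i.e. the size of the gem object struct rather than of the memptrs themselves; that works only because the allocator page-aligns the request. A more self-describing request - an editorial suggestion, not what the patch does - would be:

	memptrs = msm_gem_kernel_new(drm,
			sizeof(struct msm_rbmemptrs) * nr_rings,
			MSM_BO_UNCACHED, gpu->aspace,
			&gpu->memptrs_bo, &memptrs_iova);
)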
*/ bool needs_hw_init; @@ -126,15 +122,31 @@ struct msm_gpu { #define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */ #define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD) struct timer_list hangcheck_timer; - uint32_t hangcheck_fence; struct work_struct recover_work; - struct list_head submit_list; + struct drm_gem_object *memptrs_bo; }; +/* It turns out that all targets use the same ringbuffer size */ +#define MSM_GPU_RINGBUFFER_SZ SZ_32K +#define MSM_GPU_RINGBUFFER_BLKSIZE 32 + +#define MSM_GPU_RB_CNTL_DEFAULT \ + (AXXX_CP_RB_CNTL_BUFSZ(ilog2(MSM_GPU_RINGBUFFER_SZ / 8)) | \ + AXXX_CP_RB_CNTL_BLKSZ(ilog2(MSM_GPU_RINGBUFFER_BLKSIZE / 8))) + static inline bool msm_gpu_active(struct msm_gpu *gpu) { - return gpu->fctx->last_fence > gpu->funcs->last_fence(gpu); + int i; + + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_ringbuffer *ring = gpu->rb[i]; + + if (ring->seqno > ring->memptrs->fence) + return true; + } + + return false; } /* Perf-Counters: @@ -150,6 +162,15 @@ struct msm_gpu_perfcntr { const char *name; }; +struct msm_gpu_submitqueue { + int id; + u32 flags; + u32 prio; + int faults; + struct list_head node; + struct kref ref; +}; + static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data) { msm_writel(data, gpu->mmio + (reg << 2)); @@ -223,4 +244,10 @@ struct msm_gpu *adreno_load_gpu(struct drm_device *dev); void __init adreno_register(void); void __exit adreno_unregister(void); +static inline void msm_submitqueue_put(struct msm_gpu_submitqueue *queue) +{ + if (queue) + kref_put(&queue->ref, msm_submitqueue_destroy); +} + #endif /* __MSM_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c index 0366b8092f97..3aa8a8576abe 100644 --- a/drivers/gpu/drm/msm/msm_rd.c +++ b/drivers/gpu/drm/msm/msm_rd.c @@ -19,11 +19,17 @@ * * tail -f /sys/kernel/debug/dri/<minor>/rd > logfile.rd * - * To log the cmdstream in a format that is understood by freedreno/cffdump + * to log the cmdstream in a format that is understood by freedreno/cffdump * utility. By comparing the last successfully completed fence #, to the * cmdstream for the next fence, you can narrow down which process and submit * caused the gpu crash/lockup. * + * Additionally: + * + * tail -f /sys/kernel/debug/dri/<minor>/hangrd > logfile.rd + * + * will capture just the cmdstream from submits which triggered a GPU hang. + * * This bypasses drm_debugfs_create_files() mainly because we need to use * our own fops for a bit more control. In particular, we don't want to * do anything if userspace doesn't have the debugfs file open. @@ -111,10 +117,14 @@ static void rd_write(struct msm_rd_state *rd, const void *buf, int sz) wait_event(rd->fifo_event, circ_space(&rd->fifo) > 0); + /* Note that smp_load_acquire() is not strictly required + * as CIRC_SPACE_TO_END() does not access the tail more + * than once. + */ n = min(sz, circ_space_to_end(&rd->fifo)); memcpy(fptr, ptr, n); - fifo->head = (fifo->head + n) & (BUF_SZ - 1); + smp_store_release(&fifo->head, (fifo->head + n) & (BUF_SZ - 1)); sz -= n; ptr += n; @@ -145,13 +155,17 @@ static ssize_t rd_read(struct file *file, char __user *buf, if (ret) goto out; + /* Note that smp_load_acquire() is not strictly required + * as CIRC_CNT_TO_END() does not access the head more than + * once. 
+ */ n = min_t(int, sz, circ_count_to_end(&rd->fifo)); if (copy_to_user(buf, fptr, n)) { ret = -EFAULT; goto out; } - fifo->tail = (fifo->tail + n) & (BUF_SZ - 1); + smp_store_release(&fifo->tail, (fifo->tail + n) & (BUF_SZ - 1)); *ppos += n; wake_up_all(&rd->fifo_event); @@ -212,53 +226,89 @@ static const struct file_operations rd_debugfs_fops = { .release = rd_release, }; -int msm_rd_debugfs_init(struct drm_minor *minor) + +static void rd_cleanup(struct msm_rd_state *rd) +{ + if (!rd) + return; + + mutex_destroy(&rd->read_lock); + kfree(rd); +} + +static struct msm_rd_state *rd_init(struct drm_minor *minor, const char *name) { - struct msm_drm_private *priv = minor->dev->dev_private; struct msm_rd_state *rd; struct dentry *ent; - - /* only create on first minor: */ - if (priv->rd) - return 0; + int ret = 0; rd = kzalloc(sizeof(*rd), GFP_KERNEL); if (!rd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); rd->dev = minor->dev; rd->fifo.buf = rd->buf; mutex_init(&rd->read_lock); - priv->rd = rd; init_waitqueue_head(&rd->fifo_event); - ent = debugfs_create_file("rd", S_IFREG | S_IRUGO, + ent = debugfs_create_file(name, S_IFREG | S_IRUGO, minor->debugfs_root, rd, &rd_debugfs_fops); if (!ent) { - DRM_ERROR("Cannot create /sys/kernel/debug/dri/%pd/rd\n", - minor->debugfs_root); + DRM_ERROR("Cannot create /sys/kernel/debug/dri/%pd/%s\n", + minor->debugfs_root, name); + ret = -ENOMEM; + goto fail; + } + + return rd; + +fail: + rd_cleanup(rd); + return ERR_PTR(ret); +} + +int msm_rd_debugfs_init(struct drm_minor *minor) +{ + struct msm_drm_private *priv = minor->dev->dev_private; + struct msm_rd_state *rd; + int ret; + + /* only create on first minor: */ + if (priv->rd) + return 0; + + rd = rd_init(minor, "rd"); + if (IS_ERR(rd)) { + ret = PTR_ERR(rd); goto fail; } + priv->rd = rd; + + rd = rd_init(minor, "hangrd"); + if (IS_ERR(rd)) { + ret = PTR_ERR(rd); + goto fail; + } + + priv->hangrd = rd; + return 0; fail: msm_rd_debugfs_cleanup(priv); - return -1; + return ret; } void msm_rd_debugfs_cleanup(struct msm_drm_private *priv) { - struct msm_rd_state *rd = priv->rd; - - if (!rd) - return; - + rd_cleanup(priv->rd); priv->rd = NULL; - mutex_destroy(&rd->read_lock); - kfree(rd); + + rd_cleanup(priv->hangrd); + priv->hangrd = NULL; } static void snapshot_buf(struct msm_rd_state *rd, @@ -268,10 +318,6 @@ static void snapshot_buf(struct msm_rd_state *rd, struct msm_gem_object *obj = submit->bos[idx].obj; const char *buf; - buf = msm_gem_get_vaddr(&obj->base); - if (IS_ERR(buf)) - return; - if (iova) { buf += iova - submit->bos[idx].iova; } else { @@ -279,20 +325,33 @@ static void snapshot_buf(struct msm_rd_state *rd, size = obj->base.size; } + /* + * Always write the GPUADDR header so can get a complete list of all the + * buffers in the cmd + */ rd_write_section(rd, RD_GPUADDR, (uint32_t[3]){ iova, size, iova >> 32 }, 12); + + /* But only dump the contents of buffers marked READ */ + if (!(submit->bos[idx].flags & MSM_SUBMIT_BO_READ)) + return; + + buf = msm_gem_get_vaddr_active(&obj->base); + if (IS_ERR(buf)) + return; + rd_write_section(rd, RD_BUFFER_CONTENTS, buf, size); msm_gem_put_vaddr(&obj->base); } /* called under struct_mutex */ -void msm_rd_dump_submit(struct msm_gem_submit *submit) +void msm_rd_dump_submit(struct msm_rd_state *rd, struct msm_gem_submit *submit, + const char *fmt, ...) 
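(Aside on the fifo barriers added above, following Documentation/circular-buffers.txt: each side publishes the index it owns with a release store, so the bytes written become visible before the index moves. A minimal single-producer/single-consumer pairing:

	/* producer */
	fifo->buf[head] = byte;
	smp_store_release(&fifo->head, (head + 1) & (BUF_SZ - 1));

	/* consumer */
	head = smp_load_acquire(&fifo->head);
	if (CIRC_CNT(head, tail, BUF_SZ)) {
		byte = fifo->buf[tail];
		smp_store_release(&fifo->tail, (tail + 1) & (BUF_SZ - 1));
	}

rd_write() and rd_read() get away with a plain load of the remote index because, as the added comments note, CIRC_SPACE_TO_END() and CIRC_CNT_TO_END() read it only once.)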
{ struct drm_device *dev = submit->dev; - struct msm_drm_private *priv = dev->dev_private; - struct msm_rd_state *rd = priv->rd; - char msg[128]; + struct task_struct *task; + char msg[256]; int i, n; if (!rd->open) @@ -303,23 +362,32 @@ void msm_rd_dump_submit(struct msm_gem_submit *submit) */ WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - n = snprintf(msg, sizeof(msg), "%.*s/%d: fence=%u", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - submit->fence->seqno); + if (fmt) { + va_list args; - rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4)); + va_start(args, fmt); + n = vsnprintf(msg, sizeof(msg), fmt, args); + va_end(args); - if (rd_full) { - for (i = 0; i < submit->nr_bos; i++) { - /* buffers that are written to probably don't start out - * with anything interesting: - */ - if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) - continue; + rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4)); + } - snapshot_buf(rd, submit, i, 0, 0); - } + rcu_read_lock(); + task = pid_task(submit->pid, PIDTYPE_PID); + if (task) { + n = snprintf(msg, sizeof(msg), "%.*s/%d: fence=%u", + TASK_COMM_LEN, task->comm, + pid_nr(submit->pid), submit->seqno); + } else { + n = snprintf(msg, sizeof(msg), "???/%d: fence=%u", + pid_nr(submit->pid), submit->seqno); } + rcu_read_unlock(); + + rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4)); + + for (i = 0; rd_full && i < submit->nr_bos; i++) + snapshot_buf(rd, submit, i, 0, 0); for (i = 0; i < submit->nr_cmds; i++) { uint64_t iova = submit->cmd[i].iova; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index bf065a540130..6ca98da35f63 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -18,13 +18,15 @@ #include "msm_ringbuffer.h" #include "msm_gpu.h" -struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) +struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, + void *memptrs, uint64_t memptrs_iova) { struct msm_ringbuffer *ring; + char name[32]; int ret; - if (WARN_ON(!is_power_of_2(size))) - return ERR_PTR(-EINVAL); + /* We assume everywhere that MSM_GPU_RINGBUFFER_SZ is a power of 2 */ + BUILD_BUG_ON(!is_power_of_2(MSM_GPU_RINGBUFFER_SZ)); ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) { @@ -33,32 +35,46 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) } ring->gpu = gpu; - + ring->id = id; /* Pass NULL for the iova pointer - we will map it later */ - ring->start = msm_gem_kernel_new(gpu->dev, size, MSM_BO_WC, - gpu->aspace, &ring->bo, NULL); + ring->start = msm_gem_kernel_new(gpu->dev, MSM_GPU_RINGBUFFER_SZ, + MSM_BO_WC, gpu->aspace, &ring->bo, NULL); if (IS_ERR(ring->start)) { ret = PTR_ERR(ring->start); ring->start = 0; goto fail; } - ring->end = ring->start + (size / 4); + ring->end = ring->start + (MSM_GPU_RINGBUFFER_SZ >> 2); + ring->next = ring->start; ring->cur = ring->start; - ring->size = size; + ring->memptrs = memptrs; + ring->memptrs_iova = memptrs_iova; + + INIT_LIST_HEAD(&ring->submits); + spin_lock_init(&ring->lock); + + snprintf(name, sizeof(name), "gpu-ring-%d", ring->id); + + ring->fctx = msm_fence_context_alloc(gpu->dev, name); return ring; fail: - if (ring) - msm_ringbuffer_destroy(ring); + msm_ringbuffer_destroy(ring); return ERR_PTR(ret); } void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) { + if (IS_ERR_OR_NULL(ring)) + return; + + msm_fence_context_free(ring->fctx); + if (ring->bo) { + msm_gem_put_iova(ring->bo, ring->gpu->aspace); msm_gem_put_vaddr(ring->bo); 
drm_gem_object_unreference_unlocked(ring->bo); } diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 6e0e1049fa4f..cffce094aecb 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -20,14 +20,31 @@ #include "msm_drv.h" +#define rbmemptr(ring, member) \ + ((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member)) + +struct msm_rbmemptrs { + volatile uint32_t rptr; + volatile uint32_t fence; +}; + struct msm_ringbuffer { struct msm_gpu *gpu; - int size; + int id; struct drm_gem_object *bo; - uint32_t *start, *end, *cur; + uint32_t *start, *end, *cur, *next; + struct list_head submits; + uint64_t iova; + uint32_t seqno; + uint32_t hangcheck_fence; + struct msm_rbmemptrs *memptrs; + uint64_t memptrs_iova; + struct msm_fence_context *fctx; + spinlock_t lock; }; -struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size); +struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, + void *memptrs, uint64_t memptrs_iova); void msm_ringbuffer_destroy(struct msm_ringbuffer *ring); /* ringbuffer helpers (the parts that are same for a3xx/a2xx/z180..) */ @@ -35,9 +52,13 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring); static inline void OUT_RING(struct msm_ringbuffer *ring, uint32_t data) { - if (ring->cur == ring->end) - ring->cur = ring->start; - *(ring->cur++) = data; + /* + * ring->next points to the current command being written - it won't be + * committed as ring->cur until the flush + */ + if (ring->next == ring->end) + ring->next = ring->start; + *(ring->next++) = data; } #endif /* __MSM_RINGBUFFER_H__ */ diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c new file mode 100644 index 000000000000..5115f75b5b7f --- /dev/null +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -0,0 +1,152 @@ +/* Copyright (c) 2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
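(Aside on the OUT_RING() change above: command words are staged at ring->next and become visible to the GPU only when a flush publishes them. The flush itself lives on the adreno side of this series and is not shown in this hunk, so treat the names below as assumed; it does roughly:

	spin_lock_irqsave(&ring->lock, flags);
	ring->cur = ring->next;		/* commit the staged words */
	wptr = get_wptr(ring);
	spin_unlock_irqrestore(&ring->lock, flags);

	gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);

so a submit that is still being built never exposes a half-written packet to the hardware.)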
+ * + */ + +#include <linux/kref.h> +#include "msm_gpu.h" + +void msm_submitqueue_destroy(struct kref *kref) +{ + struct msm_gpu_submitqueue *queue = container_of(kref, + struct msm_gpu_submitqueue, ref); + + kfree(queue); +} + +struct msm_gpu_submitqueue *msm_submitqueue_get(struct msm_file_private *ctx, + u32 id) +{ + struct msm_gpu_submitqueue *entry; + + if (!ctx) + return NULL; + + read_lock(&ctx->queuelock); + + list_for_each_entry(entry, &ctx->submitqueues, node) { + if (entry->id == id) { + kref_get(&entry->ref); + read_unlock(&ctx->queuelock); + + return entry; + } + } + + read_unlock(&ctx->queuelock); + return NULL; +} + +void msm_submitqueue_close(struct msm_file_private *ctx) +{ + struct msm_gpu_submitqueue *entry, *tmp; + + if (!ctx) + return; + + /* + * No lock needed in close and there won't + * be any more user ioctls coming our way + */ + list_for_each_entry_safe(entry, tmp, &ctx->submitqueues, node) + msm_submitqueue_put(entry); +} + +int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, + u32 prio, u32 flags, u32 *id) +{ + struct msm_drm_private *priv = drm->dev_private; + struct msm_gpu_submitqueue *queue; + + if (!ctx) + return -ENODEV; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + + if (!queue) + return -ENOMEM; + + kref_init(&queue->ref); + queue->flags = flags; + + if (priv->gpu) { + if (prio >= priv->gpu->nr_rings) { + kfree(queue); + return -EINVAL; + } + + queue->prio = prio; + } + + write_lock(&ctx->queuelock); + + queue->id = ctx->queueid++; + + if (id) + *id = queue->id; + + list_add_tail(&queue->node, &ctx->submitqueues); + + write_unlock(&ctx->queuelock); + + return 0; +} + +int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx) +{ + struct msm_drm_private *priv = drm->dev_private; + int default_prio; + + if (!ctx) + return 0; + + /* + * Select priority 2 as the "default priority" unless nr_rings is less + * than 2 and then pick the lowest priority + */ + default_prio = priv->gpu ? 
+ clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0; + + INIT_LIST_HEAD(&ctx->submitqueues); + + rwlock_init(&ctx->queuelock); + + return msm_submitqueue_create(drm, ctx, default_prio, 0, NULL); +} + +int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id) +{ + struct msm_gpu_submitqueue *entry; + + if (!ctx) + return 0; + + /* + * id 0 is the "default" queue and can't be destroyed + * by the user + */ + if (!id) + return -ENOENT; + + write_lock(&ctx->queuelock); + + list_for_each_entry(entry, &ctx->submitqueues, node) { + if (entry->id == id) { + list_del(&entry->node); + write_unlock(&ctx->queuelock); + + msm_submitqueue_put(entry); + return 0; + } + } + + write_unlock(&ctx->queuelock); + return -ENOENT; +} + diff --git a/drivers/gpu/drm/nouveau/Kbuild b/drivers/gpu/drm/nouveau/Kbuild index 2e9ce53ae3a8..9c0c650655e9 100644 --- a/drivers/gpu/drm/nouveau/Kbuild +++ b/drivers/gpu/drm/nouveau/Kbuild @@ -30,9 +30,11 @@ nouveau-y += nouveau_vga.o # DRM - memory management nouveau-y += nouveau_bo.o nouveau-y += nouveau_gem.o +nouveau-y += nouveau_mem.o nouveau-y += nouveau_prime.o nouveau-y += nouveau_sgdma.o nouveau-y += nouveau_ttm.o +nouveau-y += nouveau_vmm.o # DRM - modesetting nouveau-$(CONFIG_DRM_NOUVEAU_BACKLIGHT) += nouveau_backlight.o diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index c02a13406a81..4b75ad40dd80 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -56,6 +56,13 @@ config NOUVEAU_DEBUG_DEFAULT help Selects the default debug level +config NOUVEAU_DEBUG_MMU + bool "Enable additional MMU debugging" + depends on DRM_NOUVEAU + default n + help + Say Y here if you want to enable verbose MMU debug output. + config DRM_NOUVEAU_BACKLIGHT bool "Support for backlight control" depends on DRM_NOUVEAU diff --git a/drivers/gpu/drm/nouveau/dispnv04/disp.c b/drivers/gpu/drm/nouveau/dispnv04/disp.c index 5b9d549aa791..501d2d290e9c 100644 --- a/drivers/gpu/drm/nouveau/dispnv04/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv04/disp.c @@ -48,7 +48,7 @@ nv04_display_create(struct drm_device *dev) if (!disp) return -ENOMEM; - nvif_object_map(&drm->client.device.object); + nvif_object_map(&drm->client.device.object, NULL, 0); nouveau_display(dev)->priv = disp; nouveau_display(dev)->dtor = nv04_display_destroy; diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl506e.h b/drivers/gpu/drm/nouveau/include/nvif/cl506e.h index aa94b8cf9679..f50866011002 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl506e.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl506e.h @@ -5,7 +5,7 @@ struct nv50_channel_dma_v0 { __u8 version; __u8 chid; __u8 pad02[6]; - __u64 vm; + __u64 vmm; __u64 pushbuf; __u64 offset; }; diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl506f.h b/drivers/gpu/drm/nouveau/include/nvif/cl506f.h index 3b7101966de4..0e5bbb553158 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl506f.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl506f.h @@ -8,6 +8,6 @@ struct nv50_channel_gpfifo_v0 { __u32 ilength; __u64 ioffset; __u64 pushbuf; - __u64 vm; + __u64 vmm; }; #endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl826e.h b/drivers/gpu/drm/nouveau/include/nvif/cl826e.h index 91e33db21a2f..7f6a8ce5a418 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl826e.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl826e.h @@ -5,7 +5,7 @@ struct g82_channel_dma_v0 { __u8 version; __u8 chid; __u8 pad02[6]; - __u64 vm; + __u64 vmm; __u64 pushbuf; __u64 offset; }; diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl826f.h 
b/drivers/gpu/drm/nouveau/include/nvif/cl826f.h index e34efd4ec537..c4d35522331a 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl826f.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl826f.h @@ -8,7 +8,7 @@ struct g82_channel_gpfifo_v0 { __u32 ilength; __u64 ioffset; __u64 pushbuf; - __u64 vm; + __u64 vmm; }; #define NV826F_V0_NTFY_NON_STALL_INTERRUPT 0x00 diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl906f.h b/drivers/gpu/drm/nouveau/include/nvif/cl906f.h index a2d5410a491b..169161c1587f 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl906f.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl906f.h @@ -7,7 +7,7 @@ struct fermi_channel_gpfifo_v0 { __u8 pad02[2]; __u32 ilength; __u64 ioffset; - __u64 vm; + __u64 vmm; }; #define NV906F_V0_NTFY_NON_STALL_INTERRUPT 0x00 diff --git a/drivers/gpu/drm/nouveau/include/nvif/cla06f.h b/drivers/gpu/drm/nouveau/include/nvif/cla06f.h index 2efa3d048bb9..3e57089526e3 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cla06f.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cla06f.h @@ -22,7 +22,7 @@ struct kepler_channel_gpfifo_a_v0 { __u32 engines; __u32 ilength; __u64 ioffset; - __u64 vm; + __u64 vmm; }; #define NVA06F_V0_NTFY_NON_STALL_INTERRUPT 0x00 diff --git a/drivers/gpu/drm/nouveau/include/nvif/class.h b/drivers/gpu/drm/nouveau/include/nvif/class.h index d08da82ba7ed..56aade45067d 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/class.h +++ b/drivers/gpu/drm/nouveau/include/nvif/class.h @@ -14,6 +14,23 @@ #define NVIF_CLASS_SW_NV50 /* if0005.h */ -0x00000006 #define NVIF_CLASS_SW_GF100 /* if0005.h */ -0x00000007 +#define NVIF_CLASS_MMU /* if0008.h */ 0x80000008 +#define NVIF_CLASS_MMU_NV04 /* if0008.h */ 0x80000009 +#define NVIF_CLASS_MMU_NV50 /* if0008.h */ 0x80005009 +#define NVIF_CLASS_MMU_GF100 /* if0008.h */ 0x80009009 + +#define NVIF_CLASS_MEM /* if000a.h */ 0x8000000a +#define NVIF_CLASS_MEM_NV04 /* if000b.h */ 0x8000000b +#define NVIF_CLASS_MEM_NV50 /* if500b.h */ 0x8000500b +#define NVIF_CLASS_MEM_GF100 /* if900b.h */ 0x8000900b + +#define NVIF_CLASS_VMM /* if000c.h */ 0x8000000c +#define NVIF_CLASS_VMM_NV04 /* if000d.h */ 0x8000000d +#define NVIF_CLASS_VMM_NV50 /* if500d.h */ 0x8000500d +#define NVIF_CLASS_VMM_GF100 /* if900d.h */ 0x8000900d +#define NVIF_CLASS_VMM_GM200 /* ifb00d.h */ 0x8000b00d +#define NVIF_CLASS_VMM_GP100 /* ifc00d.h */ 0x8000c00d + /* the below match nvidia-assigned (either in hw, or sw) class numbers */ #define NV_NULL_CLASS 0x00000030 diff --git a/drivers/gpu/drm/nouveau/include/nvif/device.h b/drivers/gpu/drm/nouveau/include/nvif/device.h index bcb981711617..b579633b80c0 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/device.h +++ b/drivers/gpu/drm/nouveau/include/nvif/device.h @@ -38,7 +38,6 @@ u64 nvif_device_time(struct nvif_device *); /*XXX*/ #include <subdev/bios.h> #include <subdev/fb.h> -#include <subdev/mmu.h> #include <subdev/bar.h> #include <subdev/gpio.h> #include <subdev/clk.h> @@ -57,8 +56,6 @@ u64 nvif_device_time(struct nvif_device *); }) #define nvxx_bios(a) nvxx_device(a)->bios #define nvxx_fb(a) nvxx_device(a)->fb -#define nvxx_mmu(a) nvxx_device(a)->mmu -#define nvxx_bar(a) nvxx_device(a)->bar #define nvxx_gpio(a) nvxx_device(a)->gpio #define nvxx_clk(a) nvxx_device(a)->clk #define nvxx_i2c(a) nvxx_device(a)->i2c @@ -66,10 +63,8 @@ u64 nvif_device_time(struct nvif_device *); #define nvxx_therm(a) nvxx_device(a)->therm #define nvxx_volt(a) nvxx_device(a)->volt -#include <core/device.h> #include <engine/fifo.h> #include <engine/gr.h> -#include <engine/sw.h> #define nvxx_fifo(a) 
nvxx_device(a)->fifo #define nvxx_gr(a) nvxx_device(a)->gr diff --git a/drivers/gpu/drm/nouveau/include/nvif/if0008.h b/drivers/gpu/drm/nouveau/include/nvif/if0008.h new file mode 100644 index 000000000000..8450127420f5 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if0008.h @@ -0,0 +1,42 @@ +#ifndef __NVIF_IF0008_H__ +#define __NVIF_IF0008_H__ +struct nvif_mmu_v0 { + __u8 version; + __u8 dmabits; + __u8 heap_nr; + __u8 type_nr; + __u16 kind_nr; +}; + +#define NVIF_MMU_V0_HEAP 0x00 +#define NVIF_MMU_V0_TYPE 0x01 +#define NVIF_MMU_V0_KIND 0x02 + +struct nvif_mmu_heap_v0 { + __u8 version; + __u8 index; + __u8 pad02[6]; + __u64 size; +}; + +struct nvif_mmu_type_v0 { + __u8 version; + __u8 index; + __u8 heap; + __u8 vram; + __u8 host; + __u8 comp; + __u8 disp; + __u8 kind; + __u8 mappable; + __u8 coherent; + __u8 uncached; +}; + +struct nvif_mmu_kind_v0 { + __u8 version; + __u8 pad01[1]; + __u16 count; + __u8 data[]; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000a.h b/drivers/gpu/drm/nouveau/include/nvif/if000a.h new file mode 100644 index 000000000000..88d0938fbd5a --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if000a.h @@ -0,0 +1,22 @@ +#ifndef __NVIF_IF000A_H__ +#define __NVIF_IF000A_H__ +struct nvif_mem_v0 { + __u8 version; + __u8 type; + __u8 page; + __u8 pad03[5]; + __u64 size; + __u64 addr; + __u8 data[]; +}; + +struct nvif_mem_ram_vn { +}; + +struct nvif_mem_ram_v0 { + __u8 version; + __u8 pad01[7]; + dma_addr_t *dma; + struct scatterlist *sgl; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000b.h b/drivers/gpu/drm/nouveau/include/nvif/if000b.h new file mode 100644 index 000000000000..c677fb0293da --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if000b.h @@ -0,0 +1,11 @@ +#ifndef __NVIF_IF000B_H__ +#define __NVIF_IF000B_H__ +#include "if000a.h" + +struct nv04_mem_vn { + /* nvkm_mem_vX ... 
*/ +}; + +struct nv04_mem_map_vn { +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000c.h b/drivers/gpu/drm/nouveau/include/nvif/if000c.h new file mode 100644 index 000000000000..2928ecd989ad --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if000c.h @@ -0,0 +1,64 @@ +#ifndef __NVIF_IF000C_H__ +#define __NVIF_IF000C_H__ +struct nvif_vmm_v0 { + __u8 version; + __u8 page_nr; + __u8 pad02[6]; + __u64 addr; + __u64 size; + __u8 data[]; +}; + +#define NVIF_VMM_V0_PAGE 0x00 +#define NVIF_VMM_V0_GET 0x01 +#define NVIF_VMM_V0_PUT 0x02 +#define NVIF_VMM_V0_MAP 0x03 +#define NVIF_VMM_V0_UNMAP 0x04 + +struct nvif_vmm_page_v0 { + __u8 version; + __u8 index; + __u8 shift; + __u8 sparse; + __u8 vram; + __u8 host; + __u8 comp; + __u8 pad07[1]; +}; + +struct nvif_vmm_get_v0 { + __u8 version; +#define NVIF_VMM_GET_V0_ADDR 0x00 +#define NVIF_VMM_GET_V0_PTES 0x01 +#define NVIF_VMM_GET_V0_LAZY 0x02 + __u8 type; + __u8 sparse; + __u8 page; + __u8 align; + __u8 pad05[3]; + __u64 size; + __u64 addr; +}; + +struct nvif_vmm_put_v0 { + __u8 version; + __u8 pad01[7]; + __u64 addr; +}; + +struct nvif_vmm_map_v0 { + __u8 version; + __u8 pad01[7]; + __u64 addr; + __u64 size; + __u64 memory; + __u64 offset; + __u8 data[]; +}; + +struct nvif_vmm_unmap_v0 { + __u8 version; + __u8 pad01[7]; + __u64 addr; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000d.h b/drivers/gpu/drm/nouveau/include/nvif/if000d.h new file mode 100644 index 000000000000..516ec9401401 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if000d.h @@ -0,0 +1,12 @@ +#ifndef __NVIF_IF000D_H__ +#define __NVIF_IF000D_H__ +#include "if000c.h" + +struct nv04_vmm_vn { + /* nvif_vmm_vX ... */ +}; + +struct nv04_vmm_map_vn { + /* nvif_vmm_map_vX ... */ +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if500b.h b/drivers/gpu/drm/nouveau/include/nvif/if500b.h new file mode 100644 index 000000000000..c7c8431fb2ce --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if500b.h @@ -0,0 +1,25 @@ +#ifndef __NVIF_IF500B_H__ +#define __NVIF_IF500B_H__ +#include "if000a.h" + +struct nv50_mem_vn { + /* nvif_mem_vX ... */ +}; + +struct nv50_mem_v0 { + /* nvif_mem_vX ... */ + __u8 version; + __u8 bankswz; + __u8 contig; +}; + +struct nv50_mem_map_vn { +}; + +struct nv50_mem_map_v0 { + __u8 version; + __u8 ro; + __u8 kind; + __u8 comp; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if500d.h b/drivers/gpu/drm/nouveau/include/nvif/if500d.h new file mode 100644 index 000000000000..c29a7822b363 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if500d.h @@ -0,0 +1,21 @@ +#ifndef __NVIF_IF500D_H__ +#define __NVIF_IF500D_H__ +#include "if000c.h" + +struct nv50_vmm_vn { + /* nvif_vmm_vX ... */ +}; + +struct nv50_vmm_map_vn { + /* nvif_vmm_map_vX ... */ +}; + +struct nv50_vmm_map_v0 { + /* nvif_vmm_map_vX ... */ + __u8 version; + __u8 ro; + __u8 priv; + __u8 kind; + __u8 comp; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if900b.h b/drivers/gpu/drm/nouveau/include/nvif/if900b.h new file mode 100644 index 000000000000..9b164548eea8 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if900b.h @@ -0,0 +1,23 @@ +#ifndef __NVIF_IF900B_H__ +#define __NVIF_IF900B_H__ +#include "if000a.h" + +struct gf100_mem_vn { + /* nvif_mem_vX ... */ +}; + +struct gf100_mem_v0 { + /* nvif_mem_vX ... 
*/ + __u8 version; + __u8 contig; +}; + +struct gf100_mem_map_vn { +}; + +struct gf100_mem_map_v0 { + __u8 version; + __u8 ro; + __u8 kind; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/if900d.h b/drivers/gpu/drm/nouveau/include/nvif/if900d.h new file mode 100644 index 000000000000..49aa50583c3d --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/if900d.h @@ -0,0 +1,21 @@ +#ifndef __NVIF_IF900D_H__ +#define __NVIF_IF900D_H__ +#include "if000c.h" + +struct gf100_vmm_vn { + /* nvif_vmm_vX ... */ +}; + +struct gf100_vmm_map_vn { + /* nvif_vmm_map_vX ... */ +}; + +struct gf100_vmm_map_v0 { + /* nvif_vmm_map_vX ... */ + __u8 version; + __u8 vol; + __u8 ro; + __u8 priv; + __u8 kind; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/ifb00d.h b/drivers/gpu/drm/nouveau/include/nvif/ifb00d.h new file mode 100644 index 000000000000..a0e419830595 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/ifb00d.h @@ -0,0 +1,27 @@ +#ifndef __NVIF_IFB00D_H__ +#define __NVIF_IFB00D_H__ +#include "if000c.h" + +struct gm200_vmm_vn { + /* nvif_vmm_vX ... */ +}; + +struct gm200_vmm_v0 { + /* nvif_vmm_vX ... */ + __u8 version; + __u8 bigpage; +}; + +struct gm200_vmm_map_vn { + /* nvif_vmm_map_vX ... */ +}; + +struct gm200_vmm_map_v0 { + /* nvif_vmm_map_vX ... */ + __u8 version; + __u8 vol; + __u8 ro; + __u8 priv; + __u8 kind; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/ifc00d.h b/drivers/gpu/drm/nouveau/include/nvif/ifc00d.h new file mode 100644 index 000000000000..1d9c637859f3 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/ifc00d.h @@ -0,0 +1,21 @@ +#ifndef __NVIF_IFC00D_H__ +#define __NVIF_IFC00D_H__ +#include "if000c.h" + +struct gp100_vmm_vn { + /* nvif_vmm_vX ... */ +}; + +struct gp100_vmm_map_vn { + /* nvif_vmm_map_vX ... */ +}; + +struct gp100_vmm_map_v0 { + /* nvif_vmm_map_vX ... */ + __u8 version; + __u8 vol; + __u8 ro; + __u8 priv; + __u8 kind; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/ioctl.h b/drivers/gpu/drm/nouveau/include/nvif/ioctl.h index c5f5eb83a594..1886366457f1 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/ioctl.h +++ b/drivers/gpu/drm/nouveau/include/nvif/ioctl.h @@ -1,7 +1,7 @@ #ifndef __NVIF_IOCTL_H__ #define __NVIF_IOCTL_H__ -#define NVIF_VERSION_LATEST 0x0000000000000000ULL +#define NVIF_VERSION_LATEST 0x0000000000000100ULL struct nvif_ioctl_v0 { __u8 version; @@ -83,9 +83,13 @@ struct nvif_ioctl_wr_v0 { struct nvif_ioctl_map_v0 { /* nvif_ioctl ... 
*/ __u8 version; - __u8 pad01[3]; - __u32 length; +#define NVIF_IOCTL_MAP_V0_IO 0x00 +#define NVIF_IOCTL_MAP_V0_VA 0x01 + __u8 type; + __u8 pad02[6]; __u64 handle; + __u64 length; + __u8 data[]; }; struct nvif_ioctl_unmap { diff --git a/drivers/gpu/drm/nouveau/include/nvif/mem.h b/drivers/gpu/drm/nouveau/include/nvif/mem.h new file mode 100644 index 000000000000..b542fe38398e --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/mem.h @@ -0,0 +1,18 @@ +#ifndef __NVIF_MEM_H__ +#define __NVIF_MEM_H__ +#include "mmu.h" + +struct nvif_mem { + struct nvif_object object; + u8 type; + u8 page; + u64 addr; + u64 size; +}; + +int nvif_mem_init_type(struct nvif_mmu *mmu, s32 oclass, int type, u8 page, + u64 size, void *argv, u32 argc, struct nvif_mem *); +int nvif_mem_init(struct nvif_mmu *mmu, s32 oclass, u8 type, u8 page, + u64 size, void *argv, u32 argc, struct nvif_mem *); +void nvif_mem_fini(struct nvif_mem *); +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/mmu.h b/drivers/gpu/drm/nouveau/include/nvif/mmu.h new file mode 100644 index 000000000000..c8cd5b5b0688 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/mmu.h @@ -0,0 +1,56 @@ +#ifndef __NVIF_MMU_H__ +#define __NVIF_MMU_H__ +#include <nvif/object.h> + +struct nvif_mmu { + struct nvif_object object; + u8 dmabits; + u8 heap_nr; + u8 type_nr; + u16 kind_nr; + + struct { + u64 size; + } *heap; + + struct { +#define NVIF_MEM_VRAM 0x01 +#define NVIF_MEM_HOST 0x02 +#define NVIF_MEM_COMP 0x04 +#define NVIF_MEM_DISP 0x08 +#define NVIF_MEM_KIND 0x10 +#define NVIF_MEM_MAPPABLE 0x20 +#define NVIF_MEM_COHERENT 0x40 +#define NVIF_MEM_UNCACHED 0x80 + u8 type; + u8 heap; + } *type; + + u8 *kind; +}; + +int nvif_mmu_init(struct nvif_object *, s32 oclass, struct nvif_mmu *); +void nvif_mmu_fini(struct nvif_mmu *); + +static inline bool +nvif_mmu_kind_valid(struct nvif_mmu *mmu, u8 kind) +{ + const u8 invalid = mmu->kind_nr - 1; + if (kind) { + if (kind >= mmu->kind_nr || mmu->kind[kind] == invalid) + return false; + } + return true; +} + +static inline int +nvif_mmu_type(struct nvif_mmu *mmu, u8 mask) +{ + int i; + for (i = 0; i < mmu->type_nr; i++) { + if ((mmu->type[i].type & mask) == mask) + return i; + } + return -EINVAL; +} +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/object.h b/drivers/gpu/drm/nouveau/include/nvif/object.h index 9e58b305b020..0b54261bdefe 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/object.h +++ b/drivers/gpu/drm/nouveau/include/nvif/object.h @@ -16,7 +16,7 @@ struct nvif_object { void *priv; /*XXX: hack */ struct { void __iomem *ptr; - u32 size; + u64 size; } map; }; @@ -29,7 +29,10 @@ void nvif_object_sclass_put(struct nvif_sclass **); u32 nvif_object_rd(struct nvif_object *, int, u64); void nvif_object_wr(struct nvif_object *, int, u64, u32); int nvif_object_mthd(struct nvif_object *, u32, void *, u32); -int nvif_object_map(struct nvif_object *); +int nvif_object_map_handle(struct nvif_object *, void *, u32, + u64 *handle, u64 *length); +void nvif_object_unmap_handle(struct nvif_object *); +int nvif_object_map(struct nvif_object *, void *, u32); void nvif_object_unmap(struct nvif_object *); #define nvif_handle(a) (unsigned long)(void *)(a) diff --git a/drivers/gpu/drm/nouveau/include/nvif/os.h b/drivers/gpu/drm/nouveau/include/nvif/os.h index 9fcab67c8557..5efdf80d5abc 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/os.h +++ b/drivers/gpu/drm/nouveau/include/nvif/os.h @@ -33,18 +33,4 @@ #include <soc/tegra/fuse.h> #include <soc/tegra/pmc.h> - -#ifndef ioread32_native -#ifdef __BIG_ENDIAN 
-#define ioread16_native ioread16be -#define iowrite16_native iowrite16be -#define ioread32_native ioread32be -#define iowrite32_native iowrite32be -#else /* def __BIG_ENDIAN */ -#define ioread16_native ioread16 -#define iowrite16_native iowrite16 -#define ioread32_native ioread32 -#define iowrite32_native iowrite32 -#endif /* def __BIG_ENDIAN else */ -#endif /* !ioread32_native */ #endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/vmm.h b/drivers/gpu/drm/nouveau/include/nvif/vmm.h new file mode 100644 index 000000000000..c5db8a2e82df --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/vmm.h @@ -0,0 +1,42 @@ +#ifndef __NVIF_VMM_H__ +#define __NVIF_VMM_H__ +#include <nvif/object.h> +struct nvif_mem; +struct nvif_mmu; + +enum nvif_vmm_get { + ADDR, + PTES, + LAZY +}; + +struct nvif_vma { + u64 addr; + u64 size; +}; + +struct nvif_vmm { + struct nvif_object object; + u64 start; + u64 limit; + + struct { + u8 shift; + bool sparse:1; + bool vram:1; + bool host:1; + bool comp:1; + } *page; + int page_nr; +}; + +int nvif_vmm_init(struct nvif_mmu *, s32 oclass, u64 addr, u64 size, + void *argv, u32 argc, struct nvif_vmm *); +void nvif_vmm_fini(struct nvif_vmm *); +int nvif_vmm_get(struct nvif_vmm *, enum nvif_vmm_get, bool sparse, + u8 page, u8 align, u64 size, struct nvif_vma *); +void nvif_vmm_put(struct nvif_vmm *, struct nvif_vma *); +int nvif_vmm_map(struct nvif_vmm *, u64 addr, u64 size, void *argv, u32 argc, + struct nvif_mem *, u64 offset); +int nvif_vmm_unmap(struct nvif_vmm *, u64); +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/client.h b/drivers/gpu/drm/nouveau/include/nvkm/core/client.h index e876634da10a..79624f6d0a2b 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/client.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/client.h @@ -16,7 +16,8 @@ struct nvkm_client { void *data; int (*ntfy)(const void *, u32, const void *, u32); - struct nvkm_vm *vm; + struct list_head umem; + spinlock_t lock; }; int nvkm_client_new(const char *name, u64 device, const char *cfg, diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h index bb4c214f1046..5046e1db99ac 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h @@ -1,7 +1,7 @@ #ifndef __NVKM_DEVICE_H__ #define __NVKM_DEVICE_H__ +#include <core/oclass.h> #include <core/event.h> -#include <core/object.h> enum nvkm_devidx { NVKM_SUBDEV_PCI, diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/engine.h b/drivers/gpu/drm/nouveau/include/nvkm/core/engine.h index d4cd2fbfde88..7730499bfd95 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/engine.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/engine.h @@ -15,6 +15,7 @@ struct nvkm_engine { struct nvkm_engine_func { void *(*dtor)(struct nvkm_engine *); + void (*preinit)(struct nvkm_engine *); int (*oneinit)(struct nvkm_engine *); int (*init)(struct nvkm_engine *); int (*fini)(struct nvkm_engine *, bool suspend); diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/gpuobj.h b/drivers/gpu/drm/nouveau/include/nvkm/core/gpuobj.h index c23da4f05929..51691667b813 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/gpuobj.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/gpuobj.h @@ -1,17 +1,16 @@ #ifndef __NVKM_GPUOBJ_H__ #define __NVKM_GPUOBJ_H__ -#include <core/object.h> #include <core/memory.h> #include <core/mm.h> -struct nvkm_vma; -struct nvkm_vm; #define NVOBJ_FLAG_ZERO_ALLOC 0x00000001 #define NVOBJ_FLAG_HEAP 0x00000004 
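[Editor's aside: a minimal, hypothetical sketch of how the new NVIF interfaces introduced above (nvif/mmu.h, nvif/mem.h, nvif/vmm.h) compose. It uses only the signatures shown in this patch; the function name, the choice of NVIF_CLASS_MEM_GF100, the 4KiB page, and the NULL map arguments (real callers would pass a per-family struct such as gf100_vmm_map_v0) are illustrative assumptions, not part of the patch.]

static int
example_map_host_buffer(struct nvif_mmu *mmu, struct nvif_vmm *vmm, u64 size)
{
	struct nvif_mem mem;
	struct nvif_vma vma;
	int type, ret;

	/* Pick a host-visible, CPU-coherent memory type from the
	 * types the MMU object exposes.
	 */
	type = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_COHERENT);
	if (type < 0)
		return type; /* -EINVAL: no such type */

	/* 'page' is assumed to be a log2 page size; 12 requests 4KiB. */
	ret = nvif_mem_init_type(mmu, NVIF_CLASS_MEM_GF100, type, 12, size,
				 NULL, 0, &mem);
	if (ret)
		return ret;

	/* Allocate virtual address space; LAZY is one of the three
	 * request types defined in nvif/vmm.h above, then back it
	 * with the memory object.
	 */
	ret = nvif_vmm_get(vmm, LAZY, false, mem.page, 0, mem.size, &vma);
	if (ret == 0) {
		ret = nvif_vmm_map(vmm, vma.addr, vma.size, NULL, 0, &mem, 0);
		if (ret)
			nvif_vmm_put(vmm, &vma);
	}
	if (ret)
		nvif_mem_fini(&mem);
	/* On success the caller owns mem/vma and unwinds them in
	 * the reverse order: unmap, put, fini.
	 */
	return ret;
}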
struct nvkm_gpuobj { - struct nvkm_object object; - const struct nvkm_gpuobj_func *func; + union { + const struct nvkm_gpuobj_func *func; + const struct nvkm_gpuobj_func *ptrs; + }; struct nvkm_gpuobj *parent; struct nvkm_memory *memory; struct nvkm_mm_node *node; @@ -28,15 +27,14 @@ struct nvkm_gpuobj_func { void (*release)(struct nvkm_gpuobj *); u32 (*rd32)(struct nvkm_gpuobj *, u32 offset); void (*wr32)(struct nvkm_gpuobj *, u32 offset, u32 data); + int (*map)(struct nvkm_gpuobj *, u64 offset, struct nvkm_vmm *, + struct nvkm_vma *, void *argv, u32 argc); }; int nvkm_gpuobj_new(struct nvkm_device *, u32 size, int align, bool zero, struct nvkm_gpuobj *parent, struct nvkm_gpuobj **); void nvkm_gpuobj_del(struct nvkm_gpuobj **); int nvkm_gpuobj_wrap(struct nvkm_memory *, struct nvkm_gpuobj **); -int nvkm_gpuobj_map(struct nvkm_gpuobj *, struct nvkm_vm *, u32 access, - struct nvkm_vma *); -void nvkm_gpuobj_unmap(struct nvkm_vma *); void nvkm_gpuobj_memcpy_to(struct nvkm_gpuobj *dst, u32 dstoffset, void *src, u32 length); void nvkm_gpuobj_memcpy_from(void *dst, struct nvkm_gpuobj *src, u32 srcoffset, diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h b/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h index 33ca6769266a..13ebf4da2b96 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h @@ -3,7 +3,12 @@ #include <core/os.h> struct nvkm_device; struct nvkm_vma; -struct nvkm_vm; +struct nvkm_vmm; + +struct nvkm_tags { + struct nvkm_mm_node *mn; + refcount_t refcount; +}; enum nvkm_memory_target { NVKM_MEM_TARGET_INST, /* instance memory */ @@ -14,41 +19,84 @@ enum nvkm_memory_target { struct nvkm_memory { const struct nvkm_memory_func *func; + const struct nvkm_memory_ptrs *ptrs; + struct kref kref; + struct nvkm_tags *tags; }; struct nvkm_memory_func { void *(*dtor)(struct nvkm_memory *); enum nvkm_memory_target (*target)(struct nvkm_memory *); + u8 (*page)(struct nvkm_memory *); u64 (*addr)(struct nvkm_memory *); u64 (*size)(struct nvkm_memory *); - void (*boot)(struct nvkm_memory *, struct nvkm_vm *); + void (*boot)(struct nvkm_memory *, struct nvkm_vmm *); void __iomem *(*acquire)(struct nvkm_memory *); void (*release)(struct nvkm_memory *); + int (*map)(struct nvkm_memory *, u64 offset, struct nvkm_vmm *, + struct nvkm_vma *, void *argv, u32 argc); +}; + +struct nvkm_memory_ptrs { u32 (*rd32)(struct nvkm_memory *, u64 offset); void (*wr32)(struct nvkm_memory *, u64 offset, u32 data); - void (*map)(struct nvkm_memory *, struct nvkm_vma *, u64 offset); }; void nvkm_memory_ctor(const struct nvkm_memory_func *, struct nvkm_memory *); int nvkm_memory_new(struct nvkm_device *, enum nvkm_memory_target, u64 size, u32 align, bool zero, struct nvkm_memory **); -void nvkm_memory_del(struct nvkm_memory **); +struct nvkm_memory *nvkm_memory_ref(struct nvkm_memory *); +void nvkm_memory_unref(struct nvkm_memory **); +int nvkm_memory_tags_get(struct nvkm_memory *, struct nvkm_device *, u32 tags, + void (*clear)(struct nvkm_device *, u32, u32), + struct nvkm_tags **); +void nvkm_memory_tags_put(struct nvkm_memory *, struct nvkm_device *, + struct nvkm_tags **); + #define nvkm_memory_target(p) (p)->func->target(p) +#define nvkm_memory_page(p) (p)->func->page(p) #define nvkm_memory_addr(p) (p)->func->addr(p) #define nvkm_memory_size(p) (p)->func->size(p) #define nvkm_memory_boot(p,v) (p)->func->boot((p),(v)) -#define nvkm_memory_map(p,v,o) (p)->func->map((p),(v),(o)) +#define nvkm_memory_map(p,o,vm,va,av,ac) \ + 
(p)->func->map((p),(o),(vm),(va),(av),(ac)) /* accessor macros - kmap()/done() must bracket use of the other accessor * macros to guarantee correct behaviour across all chipsets */ #define nvkm_kmap(o) (o)->func->acquire(o) -#define nvkm_ro32(o,a) (o)->func->rd32((o), (a)) -#define nvkm_wo32(o,a,d) (o)->func->wr32((o), (a), (d)) +#define nvkm_done(o) (o)->func->release(o) + +#define nvkm_ro32(o,a) (o)->ptrs->rd32((o), (a)) +#define nvkm_wo32(o,a,d) (o)->ptrs->wr32((o), (a), (d)) #define nvkm_mo32(o,a,m,d) ({ \ u32 _addr = (a), _data = nvkm_ro32((o), _addr); \ nvkm_wo32((o), _addr, (_data & ~(m)) | (d)); \ _data; \ }) -#define nvkm_done(o) (o)->func->release(o) + +#define nvkm_wo64(o,a,d) do { \ + u64 __a = (a), __d = (d); \ + nvkm_wo32((o), __a + 0, lower_32_bits(__d)); \ + nvkm_wo32((o), __a + 4, upper_32_bits(__d)); \ +} while(0) + +#define nvkm_fill(t,s,o,a,d,c) do { \ + u64 _a = (a), _c = (c), _d = (d), _o = _a >> s, _s = _c << s; \ + u##t __iomem *_m = nvkm_kmap(o); \ + if (likely(_m)) { \ + if (_d) { \ + while (_c--) \ + iowrite##t##_native(_d, &_m[_o++]); \ + } else { \ + memset_io(&_m[_o], _d, _s); \ + } \ + } else { \ + for (; _c; _c--, _a += BIT(s)) \ + nvkm_wo##t((o), _a, _d); \ + } \ + nvkm_done(o); \ +} while(0) +#define nvkm_fo32(o,a,d,c) nvkm_fill(32, 2, (o), (a), (d), (c)) +#define nvkm_fo64(o,a,d,c) nvkm_fill(64, 3, (o), (a), (d), (c)) #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/mm.h b/drivers/gpu/drm/nouveau/include/nvkm/core/mm.h index 7bd4897a8a2a..5c1261351138 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/mm.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/mm.h @@ -30,7 +30,7 @@ nvkm_mm_initialised(struct nvkm_mm *mm) return mm->heap_nodes; } -int nvkm_mm_init(struct nvkm_mm *, u32 offset, u32 length, u32 block); +int nvkm_mm_init(struct nvkm_mm *, u8 heap, u32 offset, u32 length, u32 block); int nvkm_mm_fini(struct nvkm_mm *); int nvkm_mm_head(struct nvkm_mm *, u8 heap, u8 type, u32 size_max, u32 size_min, u32 align, struct nvkm_mm_node **); @@ -39,9 +39,39 @@ int nvkm_mm_tail(struct nvkm_mm *, u8 heap, u8 type, u32 size_max, void nvkm_mm_free(struct nvkm_mm *, struct nvkm_mm_node **); void nvkm_mm_dump(struct nvkm_mm *, const char *); +static inline u32 +nvkm_mm_heap_size(struct nvkm_mm *mm, u8 heap) +{ + struct nvkm_mm_node *node; + u32 size = 0; + list_for_each_entry(node, &mm->nodes, nl_entry) { + if (node->heap == heap) + size += node->length; + } + return size; +} + static inline bool nvkm_mm_contiguous(struct nvkm_mm_node *node) { return !node->next; } + +static inline u32 +nvkm_mm_addr(struct nvkm_mm_node *node) +{ + if (WARN_ON(!nvkm_mm_contiguous(node))) + return 0; + return node->offset; +} + +static inline u32 +nvkm_mm_size(struct nvkm_mm_node *node) +{ + u32 size = 0; + do { + size += node->length; + } while ((node = node->next)); + return size; +} #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/object.h b/drivers/gpu/drm/nouveau/include/nvkm/core/object.h index 96dda350ada3..916a4b76d430 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/object.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/object.h @@ -1,10 +1,8 @@ #ifndef __NVKM_OBJECT_H__ #define __NVKM_OBJECT_H__ -#include <core/os.h> -#include <core/debug.h> +#include <core/oclass.h> struct nvkm_event; struct nvkm_gpuobj; -struct nvkm_oclass; struct nvkm_object { const struct nvkm_object_func *func; @@ -21,13 +19,20 @@ struct nvkm_object { struct rb_node node; }; +enum nvkm_object_map { + NVKM_OBJECT_MAP_IO, + NVKM_OBJECT_MAP_VA +}; + struct 
nvkm_object_func { void *(*dtor)(struct nvkm_object *); int (*init)(struct nvkm_object *); int (*fini)(struct nvkm_object *, bool suspend); int (*mthd)(struct nvkm_object *, u32 mthd, void *data, u32 size); int (*ntfy)(struct nvkm_object *, u32 mthd, struct nvkm_event **); - int (*map)(struct nvkm_object *, u64 *addr, u32 *size); + int (*map)(struct nvkm_object *, void *argv, u32 argc, + enum nvkm_object_map *, u64 *addr, u64 *size); + int (*unmap)(struct nvkm_object *); int (*rd08)(struct nvkm_object *, u64 addr, u8 *data); int (*rd16)(struct nvkm_object *, u64 addr, u16 *data); int (*rd32)(struct nvkm_object *, u64 addr, u32 *data); @@ -52,7 +57,9 @@ int nvkm_object_init(struct nvkm_object *); int nvkm_object_fini(struct nvkm_object *, bool suspend); int nvkm_object_mthd(struct nvkm_object *, u32 mthd, void *data, u32 size); int nvkm_object_ntfy(struct nvkm_object *, u32 mthd, struct nvkm_event **); -int nvkm_object_map(struct nvkm_object *, u64 *addr, u32 *size); +int nvkm_object_map(struct nvkm_object *, void *argv, u32 argc, + enum nvkm_object_map *, u64 *addr, u64 *size); +int nvkm_object_unmap(struct nvkm_object *); int nvkm_object_rd08(struct nvkm_object *, u64 addr, u8 *data); int nvkm_object_rd16(struct nvkm_object *, u64 addr, u16 *data); int nvkm_object_rd32(struct nvkm_object *, u64 addr, u32 *data); @@ -66,28 +73,4 @@ bool nvkm_object_insert(struct nvkm_object *); void nvkm_object_remove(struct nvkm_object *); struct nvkm_object *nvkm_object_search(struct nvkm_client *, u64 object, const struct nvkm_object_func *); - -struct nvkm_sclass { - int minver; - int maxver; - s32 oclass; - const struct nvkm_object_func *func; - int (*ctor)(const struct nvkm_oclass *, void *data, u32 size, - struct nvkm_object **); -}; - -struct nvkm_oclass { - int (*ctor)(const struct nvkm_oclass *, void *data, u32 size, - struct nvkm_object **); - struct nvkm_sclass base; - const void *priv; - const void *engn; - u32 handle; - u8 route; - u64 token; - u64 object; - struct nvkm_client *client; - struct nvkm_object *parent; - struct nvkm_engine *engine; -}; #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/oclass.h b/drivers/gpu/drm/nouveau/include/nvkm/core/oclass.h new file mode 100644 index 000000000000..8e1b945d38f3 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/oclass.h @@ -0,0 +1,31 @@ +#ifndef __NVKM_OCLASS_H__ +#define __NVKM_OCLASS_H__ +#include <core/os.h> +#include <core/debug.h> +struct nvkm_oclass; +struct nvkm_object; + +struct nvkm_sclass { + int minver; + int maxver; + s32 oclass; + const struct nvkm_object_func *func; + int (*ctor)(const struct nvkm_oclass *, void *data, u32 size, + struct nvkm_object **); +}; + +struct nvkm_oclass { + int (*ctor)(const struct nvkm_oclass *, void *data, u32 size, + struct nvkm_object **); + struct nvkm_sclass base; + const void *priv; + const void *engn; + u32 handle; + u8 route; + u64 token; + u64 object; + struct nvkm_client *client; + struct nvkm_object *parent; + struct nvkm_engine *engine; +}; +#endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/os.h b/drivers/gpu/drm/nouveau/include/nvkm/core/os.h index cd57e238ddd3..1f0108fdd24a 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/os.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/os.h @@ -1,4 +1,23 @@ #ifndef __NVKM_OS_H__ #define __NVKM_OS_H__ #include <nvif/os.h> + +#ifdef __BIG_ENDIAN +#define ioread16_native ioread16be +#define iowrite16_native iowrite16be +#define ioread32_native ioread32be +#define iowrite32_native iowrite32be +#else +#define 
ioread16_native ioread16 +#define iowrite16_native iowrite16 +#define ioread32_native ioread32 +#define iowrite32_native iowrite32 +#endif + +#define iowrite64_native(v,p) do { \ + u32 __iomem *_p = (u32 __iomem *)(p); \ + u64 _v = (v); \ + iowrite32_native(lower_32_bits(_v), &_p[0]); \ + iowrite32_native(upper_32_bits(_v), &_p[1]); \ +} while(0) #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/ramht.h b/drivers/gpu/drm/nouveau/include/nvkm/core/ramht.h index 5ee6298991e2..8a48ca67f60d 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/ramht.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/ramht.h @@ -1,6 +1,7 @@ #ifndef __NVKM_RAMHT_H__ #define __NVKM_RAMHT_H__ #include <core/gpuobj.h> +struct nvkm_object; struct nvkm_ramht_data { struct nvkm_gpuobj *inst; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h index ca9ed3d68f44..a6c21be7537f 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h @@ -33,7 +33,7 @@ void nvkm_subdev_intr(struct nvkm_subdev *); /* subdev logging */ #define nvkm_printk_(s,l,p,f,a...) do { \ const struct nvkm_subdev *_subdev = (s); \ - if (_subdev->debug >= (l)) { \ + if (CONFIG_NOUVEAU_DEBUG >= (l) && _subdev->debug >= (l)) { \ dev_##p(_subdev->device->dev, "%s: "f, \ nvkm_subdev_name[_subdev->index], ##a); \ } \ diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/dma.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/dma.h index d2a6532ce3b9..b672a3b07f55 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/dma.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/dma.h @@ -1,6 +1,7 @@ #ifndef __NVKM_DMA_H__ #define __NVKM_DMA_H__ #include <core/engine.h> +#include <core/object.h> struct nvkm_client; struct nvkm_dmaobj { diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/falcon.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/falcon.h index e1a854e2ade1..f0024fb5a5af 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/falcon.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/falcon.h @@ -3,6 +3,7 @@ #define nvkm_falcon(p) container_of((p), struct nvkm_falcon, engine) #include <core/engine.h> struct nvkm_fifo_chan; +struct nvkm_gpuobj; enum nvkm_falcon_dmaidx { FALCON_DMAIDX_UCODE = 0, @@ -77,7 +78,7 @@ struct nvkm_falcon_func { void (*load_imem)(struct nvkm_falcon *, void *, u32, u32, u16, u8, bool); void (*load_dmem)(struct nvkm_falcon *, void *, u32, u32, u8); void (*read_dmem)(struct nvkm_falcon *, u32, u32, u8, void *); - void (*bind_context)(struct nvkm_falcon *, struct nvkm_gpuobj *); + void (*bind_context)(struct nvkm_falcon *, struct nvkm_memory *); int (*wait_for_halt)(struct nvkm_falcon *, u32); int (*clear_interrupt)(struct nvkm_falcon *, u32); void (*set_start_addr)(struct nvkm_falcon *, u32 start_addr); @@ -112,7 +113,7 @@ void nvkm_falcon_load_imem(struct nvkm_falcon *, void *, u32, u32, u16, u8, bool); void nvkm_falcon_load_dmem(struct nvkm_falcon *, void *, u32, u32, u8); void nvkm_falcon_read_dmem(struct nvkm_falcon *, u32, u32, u8, void *); -void nvkm_falcon_bind_context(struct nvkm_falcon *, struct nvkm_gpuobj *); +void nvkm_falcon_bind_context(struct nvkm_falcon *, struct nvkm_memory *); void nvkm_falcon_set_start_addr(struct nvkm_falcon *, u32); void nvkm_falcon_start(struct nvkm_falcon *); int nvkm_falcon_wait_for_halt(struct nvkm_falcon *, u32); diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h index 
f00527b36acc..e42d686fbd8b 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fifo.h @@ -1,6 +1,7 @@ #ifndef __NVKM_FIFO_H__ #define __NVKM_FIFO_H__ #include <core/engine.h> +#include <core/object.h> #include <core/event.h> #define NVKM_FIFO_CHID_NR 4096 @@ -21,7 +22,7 @@ struct nvkm_fifo_chan { u16 chid; struct nvkm_gpuobj *inst; struct nvkm_gpuobj *push; - struct nvkm_vm *vm; + struct nvkm_vmm *vmm; void __iomem *user; u64 addr; u32 size; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/bar.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/bar.h index d3071b5a4f98..ffa963939e15 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/bar.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/bar.h @@ -8,17 +8,22 @@ struct nvkm_bar { struct nvkm_subdev subdev; spinlock_t lock; + bool bar2; /* whether the BAR supports to be ioremapped WC or should be uncached */ bool iomap_uncached; }; +struct nvkm_vmm *nvkm_bar_bar1_vmm(struct nvkm_device *); +void nvkm_bar_bar2_init(struct nvkm_device *); +void nvkm_bar_bar2_fini(struct nvkm_device *); +struct nvkm_vmm *nvkm_bar_bar2_vmm(struct nvkm_device *); void nvkm_bar_flush(struct nvkm_bar *); -struct nvkm_vm *nvkm_bar_kmap(struct nvkm_bar *); -int nvkm_bar_umap(struct nvkm_bar *, u64 size, int type, struct nvkm_vma *); int nv50_bar_new(struct nvkm_device *, int, struct nvkm_bar **); int g84_bar_new(struct nvkm_device *, int, struct nvkm_bar **); int gf100_bar_new(struct nvkm_device *, int, struct nvkm_bar **); int gk20a_bar_new(struct nvkm_device *, int, struct nvkm_bar **); +int gm107_bar_new(struct nvkm_device *, int, struct nvkm_bar **); +int gm20b_bar_new(struct nvkm_device *, int, struct nvkm_bar **); #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h index 28d513fbf44c..a00fd2e59215 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h @@ -1,8 +1,7 @@ #ifndef __NVKM_FB_H__ #define __NVKM_FB_H__ #include <core/subdev.h> - -#include <subdev/mmu.h> +#include <core/mm.h> /* memory type/access flags, do not match hardware values */ #define NV_MEM_ACCESS_RO 1 @@ -21,22 +20,6 @@ #define NVKM_RAM_TYPE_VM 0x7f #define NV_MEM_COMP_VM 0x03 -struct nvkm_mem { - struct drm_device *dev; - - struct nvkm_vma bar_vma; - struct nvkm_vma vma[2]; - u8 page_shift; - - struct nvkm_mm_node *tag; - struct nvkm_mm_node *mem; - dma_addr_t *pages; - u32 memtype; - u64 offset; - u64 size; - struct sg_table *sg; -}; - struct nvkm_fb_tile { struct nvkm_mm_node *tag; u32 addr; @@ -50,6 +33,7 @@ struct nvkm_fb { struct nvkm_subdev subdev; struct nvkm_ram *ram; + struct nvkm_mm tags; struct { struct nvkm_fb_tile region[16]; @@ -62,7 +46,6 @@ struct nvkm_fb { struct nvkm_memory *mmu_wr; }; -bool nvkm_fb_memtype_valid(struct nvkm_fb *, u32 memtype); void nvkm_fb_tile_init(struct nvkm_fb *, int region, u32 addr, u32 size, u32 pitch, u32 flags, struct nvkm_fb_tile *); void nvkm_fb_tile_fini(struct nvkm_fb *, int region, struct nvkm_fb_tile *); @@ -129,8 +112,11 @@ struct nvkm_ram { u64 size; #define NVKM_RAM_MM_SHIFT 12 +#define NVKM_RAM_MM_ANY (NVKM_MM_HEAP_ANY + 0) +#define NVKM_RAM_MM_NORMAL (NVKM_MM_HEAP_ANY + 1) +#define NVKM_RAM_MM_NOMAP (NVKM_MM_HEAP_ANY + 2) +#define NVKM_RAM_MM_MIXED (NVKM_MM_HEAP_ANY + 3) struct nvkm_mm vram; - struct nvkm_mm tags; u64 stolen; int ranks; @@ -147,6 +133,10 @@ struct nvkm_ram { struct nvkm_ram_data target; }; +int +nvkm_ram_get(struct 
nvkm_device *, u8 heap, u8 type, u8 page, u64 size, + bool contig, bool back, struct nvkm_memory **); + struct nvkm_ram_func { u64 upper; u32 (*probe_fbp)(const struct nvkm_ram_func *, struct nvkm_device *, @@ -157,14 +147,8 @@ struct nvkm_ram_func { void *(*dtor)(struct nvkm_ram *); int (*init)(struct nvkm_ram *); - int (*get)(struct nvkm_ram *, u64 size, u32 align, u32 size_nc, - u32 type, struct nvkm_mem **); - void (*put)(struct nvkm_ram *, struct nvkm_mem **); - int (*calc)(struct nvkm_ram *, u32 freq); int (*prog)(struct nvkm_ram *); void (*tidy)(struct nvkm_ram *); }; - -extern const u8 gf100_pte_storage_type_map[256]; #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/instmem.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/instmem.h index 40f845e31272..8111c0c3c5ec 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/instmem.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/instmem.h @@ -9,6 +9,7 @@ struct nvkm_instmem { spinlock_t lock; struct list_head list; + struct list_head boot; u32 reserved; struct nvkm_memory *vbios; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/ltc.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/ltc.h index cd755baf9cab..4a224fd22e48 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/ltc.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/ltc.h @@ -14,8 +14,7 @@ struct nvkm_ltc { u32 num_tags; u32 tag_base; - struct nvkm_mm tags; - struct nvkm_mm_node *tag_ram; + struct nvkm_memory *tag_ram; int zbc_min; int zbc_max; @@ -23,9 +22,7 @@ struct nvkm_ltc { u32 zbc_depth[NVKM_LTC_MAX_ZBC_CNT]; }; -int nvkm_ltc_tags_alloc(struct nvkm_ltc *, u32 count, struct nvkm_mm_node **); -void nvkm_ltc_tags_free(struct nvkm_ltc *, struct nvkm_mm_node **); -void nvkm_ltc_tags_clear(struct nvkm_ltc *, u32 first, u32 count); +void nvkm_ltc_tags_clear(struct nvkm_device *, u32 first, u32 count); int nvkm_ltc_zbc_color_get(struct nvkm_ltc *, int index, const u32[4]); int nvkm_ltc_zbc_depth_get(struct nvkm_ltc *, int index, const u32); diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h index dcd3deff27a4..975c42f620a0 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h @@ -1,68 +1,130 @@ #ifndef __NVKM_MMU_H__ #define __NVKM_MMU_H__ #include <core/subdev.h> -#include <core/mm.h> -struct nvkm_device; -struct nvkm_mem; - -struct nvkm_vm_pgt { - struct nvkm_memory *mem[2]; - u32 refcount[2]; -}; - -struct nvkm_vm_pgd { - struct list_head head; - struct nvkm_gpuobj *obj; -}; struct nvkm_vma { struct list_head head; - int refcount; - struct nvkm_vm *vm; - struct nvkm_mm_node *node; - u64 offset; - u32 access; + struct rb_node tree; + u64 addr; + u64 size:50; + bool mapref:1; /* PTs (de)referenced on (un)map (vs pre-allocated). */ + bool sparse:1; /* Unmapped PDEs/PTEs will not trigger MMU faults. */ +#define NVKM_VMA_PAGE_NONE 7 + u8 page:3; /* Requested page type (index, or NONE for automatic). */ + u8 refd:3; /* Current page type (index, or NONE for unreferenced). */ + bool used:1; /* Region allocated. */ + bool part:1; /* Region was split from an allocated region by map(). */ + bool user:1; /* Region user-allocated. */ + bool busy:1; /* Region busy (for temporarily preventing user access). */ + struct nvkm_memory *memory; /* Memory currently mapped into VMA. */ + struct nvkm_tags *tags; /* Compression tag reference. 
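[Editor's aside: per the declarations earlier in this patch, this is a refcounted struct nvkm_tags obtained via nvkm_memory_tags_get() and dropped via nvkm_memory_tags_put(); the backing tag heap appears to move from nvkm_ram into nvkm_fb.tags.]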
*/ }; -struct nvkm_vm { +struct nvkm_vmm { + const struct nvkm_vmm_func *func; struct nvkm_mmu *mmu; - + const char *name; + u32 debug; + struct kref kref; struct mutex mutex; - struct nvkm_mm mm; - struct kref refcount; - struct list_head pgd_list; + u64 start; + u64 limit; + + struct nvkm_vmm_pt *pd; + struct list_head join; + + struct list_head list; + struct rb_root free; + struct rb_root root; + + bool bootstrapped; atomic_t engref[NVKM_SUBDEV_NR]; - struct nvkm_vm_pgt *pgt; - u32 fpde; - u32 lpde; + dma_addr_t null; + void *nullp; }; -int nvkm_vm_new(struct nvkm_device *, u64 offset, u64 length, u64 mm_offset, - struct lock_class_key *, struct nvkm_vm **); -int nvkm_vm_ref(struct nvkm_vm *, struct nvkm_vm **, struct nvkm_gpuobj *pgd); -int nvkm_vm_boot(struct nvkm_vm *, u64 size); -int nvkm_vm_get(struct nvkm_vm *, u64 size, u32 page_shift, u32 access, - struct nvkm_vma *); -void nvkm_vm_put(struct nvkm_vma *); -void nvkm_vm_map(struct nvkm_vma *, struct nvkm_mem *); -void nvkm_vm_map_at(struct nvkm_vma *, u64 offset, struct nvkm_mem *); -void nvkm_vm_unmap(struct nvkm_vma *); -void nvkm_vm_unmap_at(struct nvkm_vma *, u64 offset, u64 length); +int nvkm_vmm_new(struct nvkm_device *, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *, const char *name, struct nvkm_vmm **); +struct nvkm_vmm *nvkm_vmm_ref(struct nvkm_vmm *); +void nvkm_vmm_unref(struct nvkm_vmm **); +int nvkm_vmm_boot(struct nvkm_vmm *); +int nvkm_vmm_join(struct nvkm_vmm *, struct nvkm_memory *inst); +void nvkm_vmm_part(struct nvkm_vmm *, struct nvkm_memory *inst); +int nvkm_vmm_get(struct nvkm_vmm *, u8 page, u64 size, struct nvkm_vma **); +void nvkm_vmm_put(struct nvkm_vmm *, struct nvkm_vma **); + +struct nvkm_vmm_map { + struct nvkm_memory *memory; + u64 offset; + + struct nvkm_mm_node *mem; + struct scatterlist *sgl; + dma_addr_t *dma; + u64 off; + + const struct nvkm_vmm_page *page; + + struct nvkm_tags *tags; + u64 next; + u64 type; + u64 ctag; +}; + +int nvkm_vmm_map(struct nvkm_vmm *, struct nvkm_vma *, void *argv, u32 argc, + struct nvkm_vmm_map *); +void nvkm_vmm_unmap(struct nvkm_vmm *, struct nvkm_vma *); + +struct nvkm_memory *nvkm_umem_search(struct nvkm_client *, u64); +struct nvkm_vmm *nvkm_uvmm_search(struct nvkm_client *, u64 handle); struct nvkm_mmu { const struct nvkm_mmu_func *func; struct nvkm_subdev subdev; - u64 limit; u8 dma_bits; - u8 lpg_shift; + + int heap_nr; + struct { +#define NVKM_MEM_VRAM 0x01 +#define NVKM_MEM_HOST 0x02 +#define NVKM_MEM_COMP 0x04 +#define NVKM_MEM_DISP 0x08 + u8 type; + u64 size; + } heap[4]; + + int type_nr; + struct { +#define NVKM_MEM_KIND 0x10 +#define NVKM_MEM_MAPPABLE 0x20 +#define NVKM_MEM_COHERENT 0x40 +#define NVKM_MEM_UNCACHED 0x80 + u8 type; + u8 heap; + } type[16]; + + struct nvkm_vmm *vmm; + + struct { + struct mutex mutex; + struct list_head list; + } ptc, ptp; + + struct nvkm_device_oclass user; }; int nv04_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int nv41_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int nv44_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int nv50_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int g84_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); int gf100_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int gk104_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int gk20a_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int gm200_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int gm20b_mmu_new(struct nvkm_device *, int, struct 
nvkm_mmu **); +int gp100_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); +int gp10b_mmu_new(struct nvkm_device *, int, struct nvkm_mmu **); #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h index 1bfd93b85575..9841f076da2e 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/therm.h @@ -97,4 +97,5 @@ int gt215_therm_new(struct nvkm_device *, int, struct nvkm_therm **); int gf119_therm_new(struct nvkm_device *, int, struct nvkm_therm **); int gm107_therm_new(struct nvkm_device *, int, struct nvkm_therm **); int gm200_therm_new(struct nvkm_device *, int, struct nvkm_therm **); +int gp100_therm_new(struct nvkm_device *, int, struct nvkm_therm **); #endif diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c index f98f800cc011..ece650a0c5f9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c @@ -34,6 +34,7 @@ #include "nouveau_gem.h" #include "nouveau_chan.h" #include "nouveau_abi16.h" +#include "nouveau_vmm.h" static struct nouveau_abi16 * nouveau_abi16(struct drm_file *file_priv) @@ -134,7 +135,7 @@ nouveau_abi16_chan_fini(struct nouveau_abi16 *abi16, } if (chan->ntfy) { - nouveau_bo_vma_del(chan->ntfy, &chan->ntfy_vma); + nouveau_vma_del(&chan->ntfy_vma); nouveau_bo_unpin(chan->ntfy); drm_gem_object_unreference_unlocked(&chan->ntfy->gem); } @@ -184,29 +185,34 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS) getparam->value = device->info.chipset; break; case NOUVEAU_GETPARAM_PCI_VENDOR: - if (nvxx_device(device)->func->pci) + if (device->info.platform != NV_DEVICE_INFO_V0_SOC) getparam->value = dev->pdev->vendor; else getparam->value = 0; break; case NOUVEAU_GETPARAM_PCI_DEVICE: - if (nvxx_device(device)->func->pci) + if (device->info.platform != NV_DEVICE_INFO_V0_SOC) getparam->value = dev->pdev->device; else getparam->value = 0; break; case NOUVEAU_GETPARAM_BUS_TYPE: - if (!nvxx_device(device)->func->pci) - getparam->value = 3; - else - if (pci_find_capability(dev->pdev, PCI_CAP_ID_AGP)) - getparam->value = 0; - else - if (!pci_is_pcie(dev->pdev)) - getparam->value = 1; - else - getparam->value = 2; - break; + switch (device->info.platform) { + case NV_DEVICE_INFO_V0_AGP : getparam->value = 0; break; + case NV_DEVICE_INFO_V0_PCI : getparam->value = 1; break; + case NV_DEVICE_INFO_V0_PCIE: getparam->value = 2; break; + case NV_DEVICE_INFO_V0_SOC : getparam->value = 3; break; + case NV_DEVICE_INFO_V0_IGP : + if (!pci_is_pcie(dev->pdev)) + getparam->value = 1; + else + getparam->value = 2; + break; + default: + WARN_ON(1); + break; + } + break; case NOUVEAU_GETPARAM_FB_SIZE: getparam->value = drm->gem.vram_available; break; @@ -329,8 +334,7 @@ nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS) goto done; if (device->info.family >= NV_DEVICE_INFO_V0_TESLA) { - ret = nouveau_bo_vma_add(chan->ntfy, cli->vm, - &chan->ntfy_vma); + ret = nouveau_vma_new(chan->ntfy, &cli->vmm, &chan->ntfy_vma); if (ret) goto done; } @@ -340,7 +344,7 @@ nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS) if (ret) goto done; - ret = nvkm_mm_init(&chan->heap, 0, PAGE_SIZE, 1); + ret = nvkm_mm_init(&chan->heap, 0, 0, PAGE_SIZE, 1); done: if (ret) nouveau_abi16_chan_fini(abi16, chan); @@ -548,8 +552,8 @@ nouveau_abi16_ioctl_notifierobj_alloc(ABI16_IOCTL_ARGS) if (device->info.family >= NV_DEVICE_INFO_V0_TESLA) { args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_VM; - args.start +=
chan->ntfy_vma.offset; - args.limit += chan->ntfy_vma.offset; + args.start += chan->ntfy_vma->addr; + args.limit += chan->ntfy_vma->addr; } else if (drm->agp.bridge) { args.target = NV_DMA_V0_TARGET_AGP; diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h b/drivers/gpu/drm/nouveau/nouveau_abi16.h index 841cc556fad8..327747680324 100644 --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h @@ -23,7 +23,7 @@ struct nouveau_abi16_chan { struct nouveau_channel *chan; struct list_head notifiers; struct nouveau_bo *ntfy; - struct nvkm_vma ntfy_vma; + struct nouveau_vma *ntfy_vma; struct nvkm_mm heap; }; diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c index dd6fba55ad5d..66bf2aff4a3e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bios.c +++ b/drivers/gpu/drm/nouveau/nouveau_bios.c @@ -1478,9 +1478,13 @@ parse_dcb20_entry(struct drm_device *dev, struct dcb_table *dcb, case 1: entry->dpconf.link_bw = 270000; break; - default: + case 2: entry->dpconf.link_bw = 540000; break; + case 3: + default: + entry->dpconf.link_bw = 810000; + break; } switch ((conf & 0x0f000000) >> 24) { case 0xf: @@ -1964,7 +1968,7 @@ static int load_nv17_hw_sequencer_ucode(struct drm_device *dev, * The microcode entries are found by the "HWSQ" signature. */ - const uint8_t hwsq_signature[] = { 'H', 'W', 'S', 'Q' }; + static const uint8_t hwsq_signature[] = { 'H', 'W', 'S', 'Q' }; const int sz = sizeof(hwsq_signature); int hwsq_offset; @@ -1980,7 +1984,7 @@ uint8_t *nouveau_bios_embedded_edid(struct drm_device *dev) { struct nouveau_drm *drm = nouveau_drm(dev); struct nvbios *bios = &drm->vbios; - const uint8_t edid_sig[] = { + static const uint8_t edid_sig[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }; uint16_t offset = 0; uint16_t newoffset; diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index e427f80344c4..2615912430cc 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -37,6 +37,12 @@ #include "nouveau_bo.h" #include "nouveau_ttm.h" #include "nouveau_gem.h" +#include "nouveau_mem.h" +#include "nouveau_vmm.h" + +#include <nvif/class.h> +#include <nvif/if500b.h> +#include <nvif/if900b.h> /* * NV10-NV40 tiling helpers @@ -48,8 +54,7 @@ nv10_bo_update_tile_region(struct drm_device *dev, struct nouveau_drm_tile *reg, { struct nouveau_drm *drm = nouveau_drm(dev); int i = reg - drm->tile.reg; - struct nvkm_device *device = nvxx_device(&drm->client.device); - struct nvkm_fb *fb = device->fb; + struct nvkm_fb *fb = nvxx_fb(&drm->client.device); struct nvkm_fb_tile *tile = &fb->tile.region[i]; nouveau_fence_unref(®->fence); @@ -97,7 +102,7 @@ nv10_bo_put_tile_region(struct drm_device *dev, struct nouveau_drm_tile *tile, static struct nouveau_drm_tile * nv10_bo_set_tiling(struct drm_device *dev, u32 addr, - u32 size, u32 pitch, u32 flags) + u32 size, u32 pitch, u32 zeta) { struct nouveau_drm *drm = nouveau_drm(dev); struct nvkm_fb *fb = nvxx_fb(&drm->client.device); @@ -120,8 +125,7 @@ nv10_bo_set_tiling(struct drm_device *dev, u32 addr, } if (found) - nv10_bo_update_tile_region(dev, found, addr, size, - pitch, flags); + nv10_bo_update_tile_region(dev, found, addr, size, pitch, zeta); return found; } @@ -155,27 +159,27 @@ nouveau_bo_fixup_align(struct nouveau_bo *nvbo, u32 flags, struct nvif_device *device = &drm->client.device; if (device->info.family < NV_DEVICE_INFO_V0_TESLA) { - if (nvbo->tile_mode) { + if (nvbo->mode) { if (device->info.chipset >= 0x40) { 
*align = 65536; - *size = roundup_64(*size, 64 * nvbo->tile_mode); + *size = roundup_64(*size, 64 * nvbo->mode); } else if (device->info.chipset >= 0x30) { *align = 32768; - *size = roundup_64(*size, 64 * nvbo->tile_mode); + *size = roundup_64(*size, 64 * nvbo->mode); } else if (device->info.chipset >= 0x20) { *align = 16384; - *size = roundup_64(*size, 64 * nvbo->tile_mode); + *size = roundup_64(*size, 64 * nvbo->mode); } else if (device->info.chipset >= 0x10) { *align = 16384; - *size = roundup_64(*size, 32 * nvbo->tile_mode); + *size = roundup_64(*size, 32 * nvbo->mode); } } } else { - *size = roundup_64(*size, (1 << nvbo->page_shift)); - *align = max((1 << nvbo->page_shift), *align); + *size = roundup_64(*size, (1 << nvbo->page)); + *align = max((1 << nvbo->page), *align); } *size = roundup_64(*size, PAGE_SIZE); @@ -187,11 +191,13 @@ nouveau_bo_new(struct nouveau_cli *cli, u64 size, int align, struct sg_table *sg, struct reservation_object *robj, struct nouveau_bo **pnvbo) { - struct nouveau_drm *drm = nouveau_drm(cli->dev); + struct nouveau_drm *drm = cli->drm; struct nouveau_bo *nvbo; + struct nvif_mmu *mmu = &cli->mmu; + struct nvif_vmm *vmm = &cli->vmm.vmm; size_t acc_size; - int ret; int type = ttm_bo_type_device; + int ret, i, pi = -1; if (!size) { NV_WARN(drm, "skipped size %016llx\n", size); @@ -207,19 +213,80 @@ nouveau_bo_new(struct nouveau_cli *cli, u64 size, int align, INIT_LIST_HEAD(&nvbo->head); INIT_LIST_HEAD(&nvbo->entry); INIT_LIST_HEAD(&nvbo->vma_list); - nvbo->tile_mode = tile_mode; - nvbo->tile_flags = tile_flags; nvbo->bo.bdev = &drm->ttm.bdev; nvbo->cli = cli; - if (!nvxx_device(&drm->client.device)->func->cpu_coherent) - nvbo->force_coherent = flags & TTM_PL_FLAG_UNCACHED; + /* This is confusing, and doesn't actually mean we want an uncached + * mapping, but is what NOUVEAU_GEM_DOMAIN_COHERENT gets translated + * into in nouveau_gem_new(). + */ + if (flags & TTM_PL_FLAG_UNCACHED) { + /* Determine if we can get a cache-coherent map, forcing + * uncached mapping if we can't. + */ + if (mmu->type[drm->ttm.type_host].type & NVIF_MEM_UNCACHED) + nvbo->force_coherent = true; + } + + if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) { + nvbo->kind = (tile_flags & 0x0000ff00) >> 8; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return -EINVAL; + } + + nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind; + } else + if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) { + nvbo->kind = (tile_flags & 0x00007f00) >> 8; + nvbo->comp = (tile_flags & 0x00030000) >> 16; + if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) { + kfree(nvbo); + return -EINVAL; + } + } else { + nvbo->zeta = (tile_flags & 0x00000007); + } + nvbo->mode = tile_mode; + nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG); + + /* Determine the desirable target GPU page size for the buffer. */ + for (i = 0; i < vmm->page_nr; i++) { + /* Because we cannot currently allow VMM maps to fail + * during buffer migration, we need to determine page + * size for the buffer up-front, and pre-allocate its + * page tables. + * + * Skip page sizes that can't support needed domains. + */ + if (cli->device.info.family > NV_DEVICE_INFO_V0_CURIE && + (flags & TTM_PL_FLAG_VRAM) && !vmm->page[i].vram) + continue; + if ((flags & TTM_PL_FLAG_TT ) && !vmm->page[i].host) + continue; + + /* Select this page size if it's the first that supports + * the potential memory domains, or when it's compatible + * with the requested compression settings. 
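[Editor's aside: vmm->page[i].comp marks page sizes that support compression; when no compressible size fits, the code just below clears nvbo->comp and, on GF100-class MMUs, demotes the kind through the mmu->kind[] table.]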
+ */ + if (pi < 0 || !nvbo->comp || vmm->page[i].comp) + pi = i; + + /* Stop once the buffer is larger than the current page size. */ + if (size >= 1ULL << vmm->page[i].shift) + break; + } + + if (WARN_ON(pi < 0)) { + kfree(nvbo); + return -EINVAL; + } - nvbo->page_shift = 12; - if (drm->client.vm) { - if (!(flags & TTM_PL_FLAG_TT) && size > 256 * 1024) - nvbo->page_shift = drm->client.vm->mmu->lpg_shift; + /* Disable compression if suitable settings couldn't be found. */ + if (nvbo->comp && !vmm->page[pi].comp) { + if (mmu->object.oclass >= NVIF_CLASS_MMU_GF100) + nvbo->kind = mmu->kind[nvbo->kind]; + nvbo->comp = 0; } + nvbo->page = vmm->page[pi].shift; nouveau_bo_fixup_align(nvbo, flags, &align, &size); nvbo->bo.mem.num_pages = size >> PAGE_SHIFT; @@ -262,7 +329,7 @@ set_placement_range(struct nouveau_bo *nvbo, uint32_t type) unsigned i, fpfn, lpfn; if (drm->client.device.info.family == NV_DEVICE_INFO_V0_CELSIUS && - nvbo->tile_mode && (type & TTM_PL_FLAG_VRAM) && + nvbo->mode && (type & TTM_PL_FLAG_VRAM) && nvbo->bo.mem.num_pages < vram_pages / 4) { /* * Make sure that the color and depth buffers are handled @@ -270,7 +337,7 @@ set_placement_range(struct nouveau_bo *nvbo, uint32_t type) * speed up when alpha-blending and depth-test are enabled * at the same time. */ - if (nvbo->tile_flags & NOUVEAU_GEM_TILE_ZETA) { + if (nvbo->zeta) { fpfn = vram_pages / 2; lpfn = ~0; } else { @@ -321,14 +388,10 @@ nouveau_bo_pin(struct nouveau_bo *nvbo, uint32_t memtype, bool contig) if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA && memtype == TTM_PL_FLAG_VRAM && contig) { - if (nvbo->tile_flags & NOUVEAU_GEM_TILE_NONCONTIG) { - if (bo->mem.mem_type == TTM_PL_VRAM) { - struct nvkm_mem *mem = bo->mem.mm_node; - if (!nvkm_mm_contiguous(mem->mem)) - evict = true; - } - nvbo->tile_flags &= ~NOUVEAU_GEM_TILE_NONCONTIG; + if (!nvbo->contig) { + nvbo->contig = true; force = true; + evict = true; } } @@ -376,7 +439,7 @@ nouveau_bo_pin(struct nouveau_bo *nvbo, uint32_t memtype, bool contig) out: if (force && ret) - nvbo->tile_flags |= NOUVEAU_GEM_TILE_NONCONTIG; + nvbo->contig = false; ttm_bo_unreserve(bo); return ret; } @@ -446,7 +509,6 @@ void nouveau_bo_sync_for_device(struct nouveau_bo *nvbo) { struct nouveau_drm *drm = nouveau_bdev(nvbo->bo.bdev); - struct nvkm_device *device = nvxx_device(&drm->client.device); struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; int i; @@ -458,7 +520,8 @@ nouveau_bo_sync_for_device(struct nouveau_bo *nvbo) return; for (i = 0; i < ttm_dma->ttm.num_pages; i++) - dma_sync_single_for_device(device->dev, ttm_dma->dma_address[i], + dma_sync_single_for_device(drm->dev->dev, + ttm_dma->dma_address[i], PAGE_SIZE, DMA_TO_DEVICE); } @@ -466,7 +529,6 @@ void nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo) { struct nouveau_drm *drm = nouveau_bdev(nvbo->bo.bdev); - struct nvkm_device *device = nvxx_device(&drm->client.device); struct ttm_dma_tt *ttm_dma = (struct ttm_dma_tt *)nvbo->bo.ttm; int i; @@ -478,7 +540,7 @@ nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo) return; for (i = 0; i < ttm_dma->ttm.num_pages; i++) - dma_sync_single_for_cpu(device->dev, ttm_dma->dma_address[i], + dma_sync_single_for_cpu(drm->dev->dev, ttm_dma->dma_address[i], PAGE_SIZE, DMA_FROM_DEVICE); } @@ -568,6 +630,7 @@ nouveau_bo_init_mem_type(struct ttm_bo_device *bdev, uint32_t type, struct ttm_mem_type_manager *man) { struct nouveau_drm *drm = nouveau_bdev(bdev); + struct nvif_mmu *mmu = &drm->client.mmu; switch (type) { case TTM_PL_SYSTEM: @@ -584,7 +647,8 @@ nouveau_bo_init_mem_type(struct
ttm_bo_device *bdev, uint32_t type, if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) { /* Some BARs do not support being ioremapped WC */ - if (nvxx_bar(&drm->client.device)->iomap_uncached) { + const u8 type = mmu->type[drm->ttm.type_vram].type; + if (type & NVIF_MEM_UNCACHED) { man->available_caching = TTM_PL_FLAG_UNCACHED; man->default_caching = TTM_PL_FLAG_UNCACHED; } @@ -659,14 +723,14 @@ static int nve0_bo_move_copy(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; + struct nouveau_mem *mem = nouveau_mem(old_reg); int ret = RING_SPACE(chan, 10); if (ret == 0) { BEGIN_NVC0(chan, NvSubCopy, 0x0400, 8); - OUT_RING (chan, upper_32_bits(mem->vma[0].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[0].offset)); - OUT_RING (chan, upper_32_bits(mem->vma[1].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[1].offset)); + OUT_RING (chan, upper_32_bits(mem->vma[0].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[0].addr)); + OUT_RING (chan, upper_32_bits(mem->vma[1].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[1].addr)); OUT_RING (chan, PAGE_SIZE); OUT_RING (chan, PAGE_SIZE); OUT_RING (chan, PAGE_SIZE); @@ -691,9 +755,9 @@ static int nvc0_bo_move_copy(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; - u64 src_offset = mem->vma[0].offset; - u64 dst_offset = mem->vma[1].offset; + struct nouveau_mem *mem = nouveau_mem(old_reg); + u64 src_offset = mem->vma[0].addr; + u64 dst_offset = mem->vma[1].addr; u32 page_count = new_reg->num_pages; int ret; @@ -729,9 +793,9 @@ static int nvc0_bo_move_m2mf(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; - u64 src_offset = mem->vma[0].offset; - u64 dst_offset = mem->vma[1].offset; + struct nouveau_mem *mem = nouveau_mem(old_reg); + u64 src_offset = mem->vma[0].addr; + u64 dst_offset = mem->vma[1].addr; u32 page_count = new_reg->num_pages; int ret; @@ -768,9 +832,9 @@ static int nva3_bo_move_copy(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; - u64 src_offset = mem->vma[0].offset; - u64 dst_offset = mem->vma[1].offset; + struct nouveau_mem *mem = nouveau_mem(old_reg); + u64 src_offset = mem->vma[0].addr; + u64 dst_offset = mem->vma[1].addr; u32 page_count = new_reg->num_pages; int ret; @@ -806,14 +870,14 @@ static int nv98_bo_move_exec(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; + struct nouveau_mem *mem = nouveau_mem(old_reg); int ret = RING_SPACE(chan, 7); if (ret == 0) { BEGIN_NV04(chan, NvSubCopy, 0x0320, 6); - OUT_RING (chan, upper_32_bits(mem->vma[0].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[0].offset)); - OUT_RING (chan, upper_32_bits(mem->vma[1].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[1].offset)); + OUT_RING (chan, upper_32_bits(mem->vma[0].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[0].addr)); + OUT_RING (chan, upper_32_bits(mem->vma[1].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[1].addr)); OUT_RING (chan, 0x00000000 /* COPY */); OUT_RING (chan, new_reg->num_pages << PAGE_SHIFT); } @@ -824,15 +888,15 @@ static int nv84_bo_move_exec(struct 
nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; + struct nouveau_mem *mem = nouveau_mem(old_reg); int ret = RING_SPACE(chan, 7); if (ret == 0) { BEGIN_NV04(chan, NvSubCopy, 0x0304, 6); OUT_RING (chan, new_reg->num_pages << PAGE_SHIFT); - OUT_RING (chan, upper_32_bits(mem->vma[0].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[0].offset)); - OUT_RING (chan, upper_32_bits(mem->vma[1].offset)); - OUT_RING (chan, lower_32_bits(mem->vma[1].offset)); + OUT_RING (chan, upper_32_bits(mem->vma[0].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[0].addr)); + OUT_RING (chan, upper_32_bits(mem->vma[1].addr)); + OUT_RING (chan, lower_32_bits(mem->vma[1].addr)); OUT_RING (chan, 0x00000000 /* MODE_COPY, QUERY_NONE */); } return ret; @@ -858,12 +922,12 @@ static int nv50_bo_move_m2mf(struct nouveau_channel *chan, struct ttm_buffer_object *bo, struct ttm_mem_reg *old_reg, struct ttm_mem_reg *new_reg) { - struct nvkm_mem *mem = old_reg->mm_node; + struct nouveau_mem *mem = nouveau_mem(old_reg); u64 length = (new_reg->num_pages << PAGE_SHIFT); - u64 src_offset = mem->vma[0].offset; - u64 dst_offset = mem->vma[1].offset; - int src_tiled = !!mem->memtype; - int dst_tiled = !!((struct nvkm_mem *)new_reg->mm_node)->memtype; + u64 src_offset = mem->vma[0].addr; + u64 dst_offset = mem->vma[1].addr; + int src_tiled = !!mem->kind; + int dst_tiled = !!nouveau_mem(new_reg)->kind; int ret; while (length) { @@ -1000,25 +1064,31 @@ static int nouveau_bo_move_prep(struct nouveau_drm *drm, struct ttm_buffer_object *bo, struct ttm_mem_reg *reg) { - struct nvkm_mem *old_mem = bo->mem.mm_node; - struct nvkm_mem *new_mem = reg->mm_node; - u64 size = (u64)reg->num_pages << PAGE_SHIFT; + struct nouveau_mem *old_mem = nouveau_mem(&bo->mem); + struct nouveau_mem *new_mem = nouveau_mem(reg); + struct nvif_vmm *vmm = &drm->client.vmm.vmm; int ret; - ret = nvkm_vm_get(drm->client.vm, size, old_mem->page_shift, - NV_MEM_ACCESS_RW, &old_mem->vma[0]); + ret = nvif_vmm_get(vmm, LAZY, false, old_mem->mem.page, 0, + old_mem->mem.size, &old_mem->vma[0]); if (ret) return ret; - ret = nvkm_vm_get(drm->client.vm, size, new_mem->page_shift, - NV_MEM_ACCESS_RW, &old_mem->vma[1]); + ret = nvif_vmm_get(vmm, LAZY, false, new_mem->mem.page, 0, + new_mem->mem.size, &old_mem->vma[1]); + if (ret) + goto done; + + ret = nouveau_mem_map(old_mem, vmm, &old_mem->vma[0]); + if (ret) + goto done; + + ret = nouveau_mem_map(new_mem, vmm, &old_mem->vma[1]); +done: if (ret) { - nvkm_vm_put(&old_mem->vma[0]); - return ret; + nvif_vmm_put(vmm, &old_mem->vma[1]); + nvif_vmm_put(vmm, &old_mem->vma[0]); } - - nvkm_vm_map(&old_mem->vma[0], old_mem); - nvkm_vm_map(&old_mem->vma[1], new_mem); return 0; } @@ -1200,21 +1270,23 @@ static void nouveau_bo_move_ntfy(struct ttm_buffer_object *bo, bool evict, struct ttm_mem_reg *new_reg) { + struct nouveau_mem *mem = new_reg ? nouveau_mem(new_reg) : NULL; struct nouveau_bo *nvbo = nouveau_bo(bo); - struct nvkm_vma *vma; + struct nouveau_vma *vma; /* ttm can now (stupidly) pass the driver bos it didn't create... 
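nouveau_bo_move_prep above now has to acquire and map two VMAs through the new nvif interfaces, and unwinds both through a single done: label so callers never see a half-mapped pair. The same flow restated without the diff markers, as a sketch; map_pair is an illustrative name, and the nvif_vmm_get/put, nouveau_mem_map calls and the LAZY paging mode are the ones used in the hunk:

```c
static int map_pair(struct nvif_vmm *vmm, struct nouveau_mem *old_mem,
		    struct nouveau_mem *new_mem)
{
	int ret;

	ret = nvif_vmm_get(vmm, LAZY, false, old_mem->mem.page, 0,
			   old_mem->mem.size, &old_mem->vma[0]);
	if (ret)
		return ret;	/* nothing to undo yet */

	ret = nvif_vmm_get(vmm, LAZY, false, new_mem->mem.page, 0,
			   new_mem->mem.size, &old_mem->vma[1]);
	if (ret)
		goto done;

	ret = nouveau_mem_map(old_mem, vmm, &old_mem->vma[0]);
	if (ret)
		goto done;

	ret = nouveau_mem_map(new_mem, vmm, &old_mem->vma[1]);
done:
	if (ret) {	/* release both; putting an unused vma is harmless */
		nvif_vmm_put(vmm, &old_mem->vma[1]);
		nvif_vmm_put(vmm, &old_mem->vma[0]);
	}
	return ret;
}
```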
*/ if (bo->destroy != nouveau_bo_del_ttm) return; - list_for_each_entry(vma, &nvbo->vma_list, head) { - if (new_reg && new_reg->mem_type != TTM_PL_SYSTEM && - (new_reg->mem_type == TTM_PL_VRAM || - nvbo->page_shift != vma->vm->mmu->lpg_shift)) { - nvkm_vm_map(vma, new_reg->mm_node); - } else { + if (mem && new_reg->mem_type != TTM_PL_SYSTEM && + mem->mem.page == nvbo->page) { + list_for_each_entry(vma, &nvbo->vma_list, head) { + nouveau_vma_map(vma, mem); + } + } else { + list_for_each_entry(vma, &nvbo->vma_list, head) { WARN_ON(ttm_bo_wait(bo, false, false)); - nvkm_vm_unmap(vma); + nouveau_vma_unmap(vma); } } } @@ -1234,8 +1306,7 @@ nouveau_bo_vm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *new_reg, if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_CELSIUS) { *new_tile = nv10_bo_set_tiling(dev, offset, new_reg->size, - nvbo->tile_mode, - nvbo->tile_flags); + nvbo->mode, nvbo->zeta); } return 0; @@ -1331,8 +1402,7 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) struct ttm_mem_type_manager *man = &bdev->man[reg->mem_type]; struct nouveau_drm *drm = nouveau_bdev(bdev); struct nvkm_device *device = nvxx_device(&drm->client.device); - struct nvkm_mem *mem = reg->mm_node; - int ret; + struct nouveau_mem *mem = nouveau_mem(reg); reg->bus.addr = NULL; reg->bus.offset = 0; @@ -1353,7 +1423,7 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) reg->bus.is_iomem = !drm->agp.cma; } #endif - if (drm->client.device.info.family < NV_DEVICE_INFO_V0_TESLA || !mem->memtype) + if (drm->client.mem->oclass < NVIF_CLASS_MEM_NV50 || !mem->kind) /* untiled */ break; /* fallthrough, tiled memory */ @@ -1361,19 +1431,40 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) reg->bus.offset = reg->start << PAGE_SHIFT; reg->bus.base = device->func->resource_addr(device, 1); reg->bus.is_iomem = true; - if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) { - struct nvkm_bar *bar = nvxx_bar(&drm->client.device); - int page_shift = 12; - if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_FERMI) - page_shift = mem->page_shift; + if (drm->client.mem->oclass >= NVIF_CLASS_MEM_NV50) { + union { + struct nv50_mem_map_v0 nv50; + struct gf100_mem_map_v0 gf100; + } args; + u64 handle, length; + u32 argc = 0; + int ret; + + switch (mem->mem.object.oclass) { + case NVIF_CLASS_MEM_NV50: + args.nv50.version = 0; + args.nv50.ro = 0; + args.nv50.kind = mem->kind; + args.nv50.comp = mem->comp; + break; + case NVIF_CLASS_MEM_GF100: + args.gf100.version = 0; + args.gf100.ro = 0; + args.gf100.kind = mem->kind; + break; + default: + WARN_ON(1); + break; + } - ret = nvkm_bar_umap(bar, mem->size << 12, page_shift, - &mem->bar_vma); - if (ret) - return ret; + ret = nvif_object_map_handle(&mem->mem.object, + &argc, argc, + &handle, &length); + if (ret != 1) + return ret ? 
ret : -EINVAL; - nvkm_vm_map(&mem->bar_vma, mem); - reg->bus.offset = mem->bar_vma.offset; + reg->bus.base = 0; + reg->bus.offset = handle; } break; default: @@ -1385,13 +1476,22 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) static void nouveau_ttm_io_mem_free(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg) { - struct nvkm_mem *mem = reg->mm_node; - - if (!mem->bar_vma.node) - return; + struct nouveau_drm *drm = nouveau_bdev(bdev); + struct nouveau_mem *mem = nouveau_mem(reg); - nvkm_vm_unmap(&mem->bar_vma); - nvkm_vm_put(&mem->bar_vma); + if (drm->client.mem->oclass >= NVIF_CLASS_MEM_NV50) { + switch (reg->mem_type) { + case TTM_PL_TT: + if (mem->kind) + nvif_object_unmap_handle(&mem->mem.object); + break; + case TTM_PL_VRAM: + nvif_object_unmap_handle(&mem->mem.object); + break; + default: + break; + } + } } static int @@ -1408,7 +1508,7 @@ nouveau_ttm_fault_reserve_notify(struct ttm_buffer_object *bo) */ if (bo->mem.mem_type != TTM_PL_VRAM) { if (drm->client.device.info.family < NV_DEVICE_INFO_V0_TESLA || - !nouveau_bo_tile_layout(nvbo)) + !nvbo->kind) return 0; if (bo->mem.mem_type == TTM_PL_SYSTEM) { @@ -1445,9 +1545,7 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm) { struct ttm_dma_tt *ttm_dma = (void *)ttm; struct nouveau_drm *drm; - struct nvkm_device *device; - struct drm_device *dev; - struct device *pdev; + struct device *dev; unsigned i; int r; bool slave = !!(ttm->page_flags & TTM_PAGE_FLAG_SG); @@ -1464,9 +1562,7 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm) } drm = nouveau_bdev(ttm->bdev); - device = nvxx_device(&drm->client.device); - dev = drm->dev; - pdev = device->dev; + dev = drm->dev->dev; #if IS_ENABLED(CONFIG_AGP) if (drm->agp.bridge) { @@ -1476,7 +1572,7 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm) #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86) if (swiotlb_nr_tbl()) { - return ttm_dma_populate((void *)ttm, dev->dev); + return ttm_dma_populate((void *)ttm, dev); } #endif @@ -1488,12 +1584,12 @@ nouveau_ttm_tt_populate(struct ttm_tt *ttm) for (i = 0; i < ttm->num_pages; i++) { dma_addr_t addr; - addr = dma_map_page(pdev, ttm->pages[i], 0, PAGE_SIZE, + addr = dma_map_page(dev, ttm->pages[i], 0, PAGE_SIZE, DMA_BIDIRECTIONAL); - if (dma_mapping_error(pdev, addr)) { + if (dma_mapping_error(dev, addr)) { while (i--) { - dma_unmap_page(pdev, ttm_dma->dma_address[i], + dma_unmap_page(dev, ttm_dma->dma_address[i], PAGE_SIZE, DMA_BIDIRECTIONAL); ttm_dma->dma_address[i] = 0; } @@ -1511,9 +1607,7 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm) { struct ttm_dma_tt *ttm_dma = (void *)ttm; struct nouveau_drm *drm; - struct nvkm_device *device; - struct drm_device *dev; - struct device *pdev; + struct device *dev; unsigned i; bool slave = !!(ttm->page_flags & TTM_PAGE_FLAG_SG); @@ -1521,9 +1615,7 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm) return; drm = nouveau_bdev(ttm->bdev); - device = nvxx_device(&drm->client.device); - dev = drm->dev; - pdev = device->dev; + dev = drm->dev->dev; #if IS_ENABLED(CONFIG_AGP) if (drm->agp.bridge) { @@ -1534,14 +1626,14 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm) #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86) if (swiotlb_nr_tbl()) { - ttm_dma_unpopulate((void *)ttm, dev->dev); + ttm_dma_unpopulate((void *)ttm, dev); return; } #endif for (i = 0; i < ttm->num_pages; i++) { if (ttm_dma->dma_address[i]) { - dma_unmap_page(pdev, ttm_dma->dma_address[i], PAGE_SIZE, + dma_unmap_page(dev, ttm_dma->dma_address[i], PAGE_SIZE, DMA_BIDIRECTIONAL); } } @@ -1576,48 +1668,3 @@ struct 
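The populate path above keeps its rollback on dma_map_page() failure while switching to drm->dev->dev as the mapping device: if page i fails to map, pages 0..i-1 are unmapped again before the error is returned. The loop as a standalone kernel-style sketch; map_pages is a hypothetical helper name and the parameters mirror the ttm_dma_tt fields used in the hunk:

```c
#include <linux/dma-mapping.h>
#include <linux/mm.h>

static int map_pages(struct device *dev, struct page **pages,
		     dma_addr_t *dma_address, unsigned long num_pages)
{
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		dma_addr_t addr = dma_map_page(dev, pages[i], 0, PAGE_SIZE,
					       DMA_BIDIRECTIONAL);

		if (dma_mapping_error(dev, addr)) {
			while (i--) {	/* undo what already succeeded */
				dma_unmap_page(dev, dma_address[i], PAGE_SIZE,
					       DMA_BIDIRECTIONAL);
				dma_address[i] = 0;
			}
			return -EFAULT;
		}
		dma_address[i] = addr;
	}
	return 0;
}
```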
ttm_bo_driver nouveau_bo_driver = { .io_mem_free = &nouveau_ttm_io_mem_free, .io_mem_pfn = ttm_bo_default_io_mem_pfn, }; - -struct nvkm_vma * -nouveau_bo_vma_find(struct nouveau_bo *nvbo, struct nvkm_vm *vm) -{ - struct nvkm_vma *vma; - list_for_each_entry(vma, &nvbo->vma_list, head) { - if (vma->vm == vm) - return vma; - } - - return NULL; -} - -int -nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nvkm_vm *vm, - struct nvkm_vma *vma) -{ - const u32 size = nvbo->bo.mem.num_pages << PAGE_SHIFT; - int ret; - - ret = nvkm_vm_get(vm, size, nvbo->page_shift, - NV_MEM_ACCESS_RW, vma); - if (ret) - return ret; - - if ( nvbo->bo.mem.mem_type != TTM_PL_SYSTEM && - (nvbo->bo.mem.mem_type == TTM_PL_VRAM || - nvbo->page_shift != vma->vm->mmu->lpg_shift)) - nvkm_vm_map(vma, nvbo->bo.mem.mm_node); - - list_add_tail(&vma->head, &nvbo->vma_list); - vma->refcount = 1; - return 0; -} - -void -nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nvkm_vma *vma) -{ - if (vma->node) { - if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) - nvkm_vm_unmap(vma); - nvkm_vm_put(vma); - list_del(&vma->head); - } -} diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.h b/drivers/gpu/drm/nouveau/nouveau_bo.h index b06a5385d6dd..23002bdd94a8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.h +++ b/drivers/gpu/drm/nouveau/nouveau_bo.h @@ -24,12 +24,16 @@ struct nouveau_bo { bool validate_mapped; struct list_head vma_list; - unsigned page_shift; struct nouveau_cli *cli; - u32 tile_mode; - u32 tile_flags; + unsigned contig:1; + unsigned page:5; + unsigned kind:8; + unsigned comp:3; + unsigned zeta:3; + unsigned mode; + struct nouveau_drm_tile *tile; /* Only valid if allocated via nouveau_gem_new() and iff you hold a @@ -89,13 +93,6 @@ int nouveau_bo_validate(struct nouveau_bo *, bool interruptible, void nouveau_bo_sync_for_device(struct nouveau_bo *nvbo); void nouveau_bo_sync_for_cpu(struct nouveau_bo *nvbo); -struct nvkm_vma * -nouveau_bo_vma_find(struct nouveau_bo *, struct nvkm_vm *); - -int nouveau_bo_vma_add(struct nouveau_bo *, struct nvkm_vm *, - struct nvkm_vma *); -void nouveau_bo_vma_del(struct nouveau_bo *, struct nvkm_vma *); - /* TODO: submit equivalent to TTM generic API upstream? 
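The nouveau_bo.h hunk above repacks the decoded tiling state into bitfields beside the retained full-width mode word. A compilable sketch of that layout; tiling_state is an illustrative name and the assert merely documents the packing, it is not part of the patch:

```c
#include <stdint.h>

struct tiling_state {
	unsigned contig:1;	/* allocation must be contiguous    */
	unsigned page:5;	/* GPU page shift                   */
	unsigned kind:8;	/* PTE kind (memory layout)         */
	unsigned comp:3;	/* compression tag selection        */
	unsigned zeta:3;	/* pre-NV50 depth-buffer hint       */
	unsigned mode;		/* raw tile mode, still a full word */
};

_Static_assert(sizeof(struct tiling_state) <= 2 * sizeof(unsigned),
	       "20 bits of decoded state pack beside the mode word");
```

Packing the decoded state this way keeps the per-BO footprint at the same two words the old tile_mode/tile_flags pair occupied, while making each consumer read the field it actually means.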
*/ static inline void __iomem * nvbo_kmap_obj_iovirtual(struct nouveau_bo *nvbo) diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index dbc41fa86ee8..af1116655910 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -40,6 +40,7 @@ #include "nouveau_chan.h" #include "nouveau_fence.h" #include "nouveau_abi16.h" +#include "nouveau_vmm.h" MODULE_PARM_DESC(vram_pushbuf, "Create DMA push buffers in VRAM"); int nouveau_vram_pushbuf; @@ -83,6 +84,14 @@ nouveau_channel_del(struct nouveau_channel **pchan) { struct nouveau_channel *chan = *pchan; if (chan) { + struct nouveau_cli *cli = (void *)chan->user.client; + bool super; + + if (cli) { + super = cli->base.super; + cli->base.super = true; + } + if (chan->fence) nouveau_fence(chan->drm)->context_del(chan); nvif_object_fini(&chan->nvsw); @@ -91,12 +100,15 @@ nouveau_channel_del(struct nouveau_channel **pchan) nvif_notify_fini(&chan->kill); nvif_object_fini(&chan->user); nvif_object_fini(&chan->push.ctxdma); - nouveau_bo_vma_del(chan->push.buffer, &chan->push.vma); + nouveau_vma_del(&chan->push.vma); nouveau_bo_unmap(chan->push.buffer); if (chan->push.buffer && chan->push.buffer->pin_refcnt) nouveau_bo_unpin(chan->push.buffer); nouveau_bo_ref(NULL, &chan->push.buffer); kfree(chan); + + if (cli) + cli->base.super = super; } *pchan = NULL; } @@ -106,7 +118,6 @@ nouveau_channel_prep(struct nouveau_drm *drm, struct nvif_device *device, u32 size, struct nouveau_channel **pchan) { struct nouveau_cli *cli = (void *)device->object.client; - struct nvkm_mmu *mmu = nvxx_mmu(device); struct nv_dma_v0 args = {}; struct nouveau_channel *chan; u32 target; @@ -142,11 +153,11 @@ nouveau_channel_prep(struct nouveau_drm *drm, struct nvif_device *device, * pushbuf lives in, this is because the GEM code requires that * we be able to call out to other (indirect) push buffers */ - chan->push.vma.offset = chan->push.buffer->bo.offset; + chan->push.addr = chan->push.buffer->bo.offset; if (device->info.family >= NV_DEVICE_INFO_V0_TESLA) { - ret = nouveau_bo_vma_add(chan->push.buffer, cli->vm, - &chan->push.vma); + ret = nouveau_vma_new(chan->push.buffer, &cli->vmm, + &chan->push.vma); if (ret) { nouveau_channel_del(pchan); return ret; @@ -155,7 +166,9 @@ nouveau_channel_prep(struct nouveau_drm *drm, struct nvif_device *device, args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_VM; args.start = 0; - args.limit = cli->vm->mmu->limit - 1; + args.limit = cli->vmm.vmm.limit - 1; + + chan->push.addr = chan->push.vma->addr; } else if (chan->push.buffer->bo.mem.mem_type == TTM_PL_VRAM) { if (device->info.family == NV_DEVICE_INFO_V0_TNT) { @@ -185,7 +198,7 @@ nouveau_channel_prep(struct nouveau_drm *drm, struct nvif_device *device, args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_RDWR; args.start = 0; - args.limit = mmu->limit - 1; + args.limit = cli->vmm.vmm.limit - 1; } } @@ -203,6 +216,7 @@ static int nouveau_channel_ind(struct nouveau_drm *drm, struct nvif_device *device, u32 engine, struct nouveau_channel **pchan) { + struct nouveau_cli *cli = (void *)device->object.client; static const u16 oclasses[] = { PASCAL_CHANNEL_GPFIFO_A, MAXWELL_CHANNEL_GPFIFO_A, KEPLER_CHANNEL_GPFIFO_B, @@ -233,22 +247,22 @@ nouveau_channel_ind(struct nouveau_drm *drm, struct nvif_device *device, args.kepler.version = 0; args.kepler.engines = engine; args.kepler.ilength = 0x02000; - args.kepler.ioffset = 0x10000 + chan->push.vma.offset; - args.kepler.vm = 0; + args.kepler.ioffset 
= 0x10000 + chan->push.addr; + args.kepler.vmm = nvif_handle(&cli->vmm.vmm.object); size = sizeof(args.kepler); } else if (oclass[0] >= FERMI_CHANNEL_GPFIFO) { args.fermi.version = 0; args.fermi.ilength = 0x02000; - args.fermi.ioffset = 0x10000 + chan->push.vma.offset; - args.fermi.vm = 0; + args.fermi.ioffset = 0x10000 + chan->push.addr; + args.fermi.vmm = nvif_handle(&cli->vmm.vmm.object); size = sizeof(args.fermi); } else { args.nv50.version = 0; args.nv50.ilength = 0x02000; - args.nv50.ioffset = 0x10000 + chan->push.vma.offset; + args.nv50.ioffset = 0x10000 + chan->push.addr; args.nv50.pushbuf = nvif_handle(&chan->push.ctxdma); - args.nv50.vm = 0; + args.nv50.vmm = nvif_handle(&cli->vmm.vmm.object); size = sizeof(args.nv50); } @@ -293,7 +307,7 @@ nouveau_channel_dma(struct nouveau_drm *drm, struct nvif_device *device, /* create channel object */ args.version = 0; args.pushbuf = nvif_handle(&chan->push.ctxdma); - args.offset = chan->push.vma.offset; + args.offset = chan->push.addr; do { ret = nvif_object_init(&device->object, 0, *oclass++, @@ -314,11 +328,10 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) struct nvif_device *device = chan->device; struct nouveau_cli *cli = (void *)chan->user.client; struct nouveau_drm *drm = chan->drm; - struct nvkm_mmu *mmu = nvxx_mmu(device); struct nv_dma_v0 args = {}; int ret, i; - nvif_object_map(&chan->user); + nvif_object_map(&chan->user, NULL, 0); if (chan->user.oclass >= FERMI_CHANNEL_GPFIFO) { ret = nvif_notify_init(&chan->user, nouveau_channel_killed, @@ -339,7 +352,7 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_VM; args.start = 0; - args.limit = cli->vm->mmu->limit - 1; + args.limit = cli->vmm.vmm.limit - 1; } else { args.target = NV_DMA_V0_TARGET_VRAM; args.access = NV_DMA_V0_ACCESS_RDWR; @@ -356,7 +369,7 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_VM; args.start = 0; - args.limit = cli->vm->mmu->limit - 1; + args.limit = cli->vmm.vmm.limit - 1; } else if (chan->drm->agp.bridge) { args.target = NV_DMA_V0_TARGET_AGP; @@ -368,7 +381,7 @@ nouveau_channel_init(struct nouveau_channel *chan, u32 vram, u32 gart) args.target = NV_DMA_V0_TARGET_VM; args.access = NV_DMA_V0_ACCESS_RDWR; args.start = 0; - args.limit = mmu->limit - 1; + args.limit = cli->vmm.vmm.limit - 1; } ret = nvif_object_init(&chan->user, gart, NV_DMA_IN_MEMORY, diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.h b/drivers/gpu/drm/nouveau/nouveau_chan.h index 46b947ba1cf4..f29d3a72c48c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.h +++ b/drivers/gpu/drm/nouveau/nouveau_chan.h @@ -16,8 +16,9 @@ struct nouveau_channel { struct { struct nouveau_bo *buffer; - struct nvkm_vma vma; + struct nouveau_vma *vma; struct nvif_object ctxdma; + u64 addr; } push; /* TODO: this will be reworked in the near future */ diff --git a/drivers/gpu/drm/nouveau/nouveau_display.h b/drivers/gpu/drm/nouveau/nouveau_display.h index 201aec2ea5b8..1411bf05b89d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.h +++ b/drivers/gpu/drm/nouveau/nouveau_display.h @@ -1,14 +1,11 @@ #ifndef __NOUVEAU_DISPLAY_H__ #define __NOUVEAU_DISPLAY_H__ - -#include <subdev/mmu.h> - #include "nouveau_drv.h" struct nouveau_framebuffer { struct drm_framebuffer base; struct nouveau_bo *nvbo; - struct nvkm_vma vma; + struct nouveau_vma *vma; u32 r_handle; u32 r_format; u32 r_pitch; diff --git 
a/drivers/gpu/drm/nouveau/nouveau_dma.c b/drivers/gpu/drm/nouveau/nouveau_dma.c index 2634a1a79888..10e84f6ca2b7 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dma.c +++ b/drivers/gpu/drm/nouveau/nouveau_dma.c @@ -26,6 +26,7 @@ #include "nouveau_drv.h" #include "nouveau_dma.h" +#include "nouveau_vmm.h" void OUT_RINGp(struct nouveau_channel *chan, const void *data, unsigned nr_dwords) @@ -71,11 +72,11 @@ READ_GET(struct nouveau_channel *chan, uint64_t *prev_get, int *timeout) return -EBUSY; } - if (val < chan->push.vma.offset || - val > chan->push.vma.offset + (chan->dma.max << 2)) + if (val < chan->push.addr || + val > chan->push.addr + (chan->dma.max << 2)) return -EINVAL; - return (val - chan->push.vma.offset) >> 2; + return (val - chan->push.addr) >> 2; } void @@ -84,13 +85,13 @@ nv50_dma_push(struct nouveau_channel *chan, struct nouveau_bo *bo, { struct nouveau_cli *cli = (void *)chan->user.client; struct nouveau_bo *pb = chan->push.buffer; - struct nvkm_vma *vma; + struct nouveau_vma *vma; int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base; u64 offset; - vma = nouveau_bo_vma_find(bo, cli->vm); + vma = nouveau_vma_find(bo, &cli->vmm); BUG_ON(!vma); - offset = vma->offset + delta; + offset = vma->addr + delta; BUG_ON(chan->dma.ib_free < 1); @@ -224,7 +225,7 @@ nouveau_dma_wait(struct nouveau_channel *chan, int slots, int size) * instruct the GPU to jump back to the start right * after processing the currently pending commands. */ - OUT_RING(chan, chan->push.vma.offset | 0x20000000); + OUT_RING(chan, chan->push.addr | 0x20000000); /* wait for GET to depart from the skips area. * prevents writing GET==PUT and causing a race diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h index aff3a9d0a1fc..74e10b14a7da 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dma.h +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h @@ -140,7 +140,7 @@ BEGIN_IMC0(struct nouveau_channel *chan, int subc, int mthd, u16 data) #define WRITE_PUT(val) do { \ mb(); \ nouveau_bo_rd32(chan->push.buffer, 0); \ - nvif_wr32(&chan->user, chan->user_put, ((val) << 2) + chan->push.vma.offset); \ + nvif_wr32(&chan->user, chan->user_put, ((val) << 2) + chan->push.addr);\ } while (0) static inline void diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 595630d1fb9e..8d4a5be3b913 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -111,33 +111,119 @@ nouveau_name(struct drm_device *dev) return nouveau_platform_name(to_platform_device(dev->dev)); } +static inline bool +nouveau_cli_work_ready(struct dma_fence *fence, bool wait) +{ + if (!dma_fence_is_signaled(fence)) { + if (!wait) + return false; + WARN_ON(dma_fence_wait_timeout(fence, false, 2 * HZ) <= 0); + } + dma_fence_put(fence); + return true; +} + +static void +nouveau_cli_work_flush(struct nouveau_cli *cli, bool wait) +{ + struct nouveau_cli_work *work, *wtmp; + mutex_lock(&cli->lock); + list_for_each_entry_safe(work, wtmp, &cli->worker, head) { + if (!work->fence || nouveau_cli_work_ready(work->fence, wait)) { + list_del(&work->head); + work->func(work); + } + } + mutex_unlock(&cli->lock); +} + +static void +nouveau_cli_work_fence(struct dma_fence *fence, struct dma_fence_cb *cb) +{ + struct nouveau_cli_work *work = container_of(cb, typeof(*work), cb); + schedule_work(&work->cli->work); +} + +void +nouveau_cli_work_queue(struct nouveau_cli *cli, struct dma_fence *fence, + struct nouveau_cli_work *work) +{ + work->fence = dma_fence_get(fence); + work->cli 
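READ_GET above now validates against chan->push.addr instead of the per-VM offset, but the arithmetic is unchanged: a raw GET pointer is range-checked against the push buffer window and converted to a dword index. A standalone sketch, with push_addr and dma_max standing in for the channel fields and -1 standing in for -EINVAL:

```c
#include <stdint.h>

static int get_to_index(uint64_t val, uint64_t push_addr, int dma_max)
{
	if (val < push_addr || val > push_addr + ((uint64_t)dma_max << 2))
		return -1;			/* outside the pushbuf */
	return (int)((val - push_addr) >> 2);	/* byte offset -> dword */
}
```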
= cli; + mutex_lock(&cli->lock); + list_add_tail(&work->head, &cli->worker); + mutex_unlock(&cli->lock); + if (dma_fence_add_callback(fence, &work->cb, nouveau_cli_work_fence)) + nouveau_cli_work_fence(fence, &work->cb); +} + +static void +nouveau_cli_work(struct work_struct *w) +{ + struct nouveau_cli *cli = container_of(w, typeof(*cli), work); + nouveau_cli_work_flush(cli, false); +} + static void nouveau_cli_fini(struct nouveau_cli *cli) { - nvkm_vm_ref(NULL, &nvxx_client(&cli->base)->vm, NULL); + nouveau_cli_work_flush(cli, true); usif_client_fini(cli); + nouveau_vmm_fini(&cli->vmm); + nvif_mmu_fini(&cli->mmu); nvif_device_fini(&cli->device); + mutex_lock(&cli->drm->master.lock); nvif_client_fini(&cli->base); + mutex_unlock(&cli->drm->master.lock); } static int nouveau_cli_init(struct nouveau_drm *drm, const char *sname, struct nouveau_cli *cli) { + static const struct nvif_mclass + mems[] = { + { NVIF_CLASS_MEM_GF100, -1 }, + { NVIF_CLASS_MEM_NV50 , -1 }, + { NVIF_CLASS_MEM_NV04 , -1 }, + {} + }; + static const struct nvif_mclass + mmus[] = { + { NVIF_CLASS_MMU_GF100, -1 }, + { NVIF_CLASS_MMU_NV50 , -1 }, + { NVIF_CLASS_MMU_NV04 , -1 }, + {} + }; + static const struct nvif_mclass + vmms[] = { + { NVIF_CLASS_VMM_GP100, -1 }, + { NVIF_CLASS_VMM_GM200, -1 }, + { NVIF_CLASS_VMM_GF100, -1 }, + { NVIF_CLASS_VMM_NV50 , -1 }, + { NVIF_CLASS_VMM_NV04 , -1 }, + {} + }; u64 device = nouveau_name(drm->dev); int ret; snprintf(cli->name, sizeof(cli->name), "%s", sname); - cli->dev = drm->dev; + cli->drm = drm; mutex_init(&cli->mutex); usif_client_init(cli); - if (cli == &drm->client) { + INIT_WORK(&cli->work, nouveau_cli_work); + INIT_LIST_HEAD(&cli->worker); + mutex_init(&cli->lock); + + if (cli == &drm->master) { ret = nvif_driver_init(NULL, nouveau_config, nouveau_debug, cli->name, device, &cli->base); } else { - ret = nvif_client_init(&drm->client.base, cli->name, device, + mutex_lock(&drm->master.lock); + ret = nvif_client_init(&drm->master.base, cli->name, device, &cli->base); + mutex_unlock(&drm->master.lock); } if (ret) { NV_ERROR(drm, "Client allocation failed: %d\n", ret); @@ -154,6 +240,38 @@ nouveau_cli_init(struct nouveau_drm *drm, const char *sname, goto done; } + ret = nvif_mclass(&cli->device.object, mmus); + if (ret < 0) { + NV_ERROR(drm, "No supported MMU class\n"); + goto done; + } + + ret = nvif_mmu_init(&cli->device.object, mmus[ret].oclass, &cli->mmu); + if (ret) { + NV_ERROR(drm, "MMU allocation failed: %d\n", ret); + goto done; + } + + ret = nvif_mclass(&cli->mmu.object, vmms); + if (ret < 0) { + NV_ERROR(drm, "No supported VMM class\n"); + goto done; + } + + ret = nouveau_vmm_init(cli, vmms[ret].oclass, &cli->vmm); + if (ret) { + NV_ERROR(drm, "VMM allocation failed: %d\n", ret); + goto done; + } + + ret = nvif_mclass(&cli->mmu.object, mems); + if (ret < 0) { + NV_ERROR(drm, "No supported MEM class\n"); + goto done; + } + + cli->mem = &mems[ret]; + return 0; done: if (ret) nouveau_cli_fini(cli); @@ -433,6 +551,10 @@ nouveau_drm_load(struct drm_device *dev, unsigned long flags) dev->dev_private = drm; drm->dev = dev; + ret = nouveau_cli_init(drm, "DRM-master", &drm->master); + if (ret) + return ret; + ret = nouveau_cli_init(drm, "DRM", &drm->client); if (ret) return ret; @@ -456,21 +578,6 @@ nouveau_drm_load(struct drm_device *dev, unsigned long flags) nouveau_vga_init(drm); - if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) { - if (!nvxx_device(&drm->client.device)->mmu) { - ret = -ENOSYS; - goto fail_device; - } - - ret = 
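nouveau_cli_init above probes the mems/mmus/vmms tables newest-first and keeps the index of the first class the device actually exposes. A self-contained model of that probing; supported() fakes the sclass query that the real nvif_mclass() performs against the object, and the hex class values are arbitrary:

```c
#include <stdio.h>

struct mclass { int oclass; };

static int supported(int oclass)	/* pretend only older classes exist */
{
	return oclass <= 0x50;
}

static int pick_class(const struct mclass *list)
{
	int i;

	for (i = 0; list[i].oclass; i++)	/* table ends with {} */
		if (supported(list[i].oclass))
			return i;	/* index of newest supported class */
	return -1;			/* nothing supported */
}

int main(void)
{
	const struct mclass vmms[] = { { 0x90 }, { 0x50 }, { 0x04 }, { 0 } };
	printf("picked index %d\n", pick_class(vmms));	/* -> 1 (0x50) */
	return 0;
}
```

Ordering the tables newest-first means the driver automatically takes the most capable interface the hardware offers and degrades gracefully on older chips.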
nvkm_vm_new(nvxx_device(&drm->client.device), - 0, (1ULL << 40), 0x1000, NULL, - &drm->client.vm); - if (ret) - goto fail_device; - - nvxx_client(&drm->client.base)->vm = drm->client.vm; - } - ret = nouveau_ttm_init(drm); if (ret) goto fail_ttm; @@ -516,8 +623,8 @@ fail_bios: nouveau_ttm_fini(drm); fail_ttm: nouveau_vga_fini(drm); -fail_device: nouveau_cli_fini(&drm->client); + nouveau_cli_fini(&drm->master); kfree(drm); return ret; } @@ -550,6 +657,7 @@ nouveau_drm_unload(struct drm_device *dev) if (drm->hdmi_device) pci_dev_put(drm->hdmi_device); nouveau_cli_fini(&drm->client); + nouveau_cli_fini(&drm->master); kfree(drm); } @@ -618,7 +726,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) } NV_DEBUG(drm, "suspending object tree...\n"); - ret = nvif_client_suspend(&drm->client.base); + ret = nvif_client_suspend(&drm->master.base); if (ret) goto fail_client; @@ -642,7 +750,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) struct nouveau_drm *drm = nouveau_drm(dev); NV_DEBUG(drm, "resuming object tree...\n"); - nvif_client_resume(&drm->client.base); + nvif_client_resume(&drm->master.base); NV_DEBUG(drm, "resuming fence...\n"); if (drm->fence && nouveau_fence(drm)->resume) @@ -850,15 +958,6 @@ nouveau_drm_open(struct drm_device *dev, struct drm_file *fpriv) cli->base.super = false; - if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) { - ret = nvkm_vm_new(nvxx_device(&drm->client.device), 0, - (1ULL << 40), 0x1000, NULL, &cli->vm); - if (ret) - goto done; - - nvxx_client(&cli->base)->vm = cli->vm; - } - fpriv->driver_priv = cli; mutex_lock(&drm->client.mutex); diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 822fe1d4d35e..e86b8220a4bb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -5,7 +5,7 @@ #define DRIVER_EMAIL "nouveau@lists.freedesktop.org" #define DRIVER_NAME "nouveau" -#define DRIVER_DESC "nVidia Riva/TNT/GeForce/Quadro/Tesla" +#define DRIVER_DESC "nVidia Riva/TNT/GeForce/Quadro/Tesla/Tegra K1+" #define DRIVER_DATE "20120801" #define DRIVER_MAJOR 1 @@ -42,6 +42,8 @@ #include <nvif/client.h> #include <nvif/device.h> #include <nvif/ioctl.h> +#include <nvif/mmu.h> +#include <nvif/vmm.h> #include <drm/drmP.h> @@ -61,6 +63,7 @@ struct platform_device; #include "nouveau_fence.h" #include "nouveau_bios.h" +#include "nouveau_vmm.h" struct nouveau_drm_tile { struct nouveau_fence *fence; @@ -86,19 +89,37 @@ enum nouveau_drm_handle { struct nouveau_cli { struct nvif_client base; - struct drm_device *dev; + struct nouveau_drm *drm; struct mutex mutex; struct nvif_device device; + struct nvif_mmu mmu; + struct nouveau_vmm vmm; + const struct nvif_mclass *mem; - struct nvkm_vm *vm; /*XXX*/ struct list_head head; void *abi16; struct list_head objects; struct list_head notifys; char name[32]; + + struct work_struct work; + struct list_head worker; + struct mutex lock; }; +struct nouveau_cli_work { + void (*func)(struct nouveau_cli_work *); + struct nouveau_cli *cli; + struct list_head head; + + struct dma_fence *fence; + struct dma_fence_cb cb; +}; + +void nouveau_cli_work_queue(struct nouveau_cli *, struct dma_fence *, + struct nouveau_cli_work *); + static inline struct nouveau_cli * nouveau_cli(struct drm_file *fpriv) { @@ -109,6 +130,7 @@ nouveau_cli(struct drm_file *fpriv) #include <nvif/device.h> struct nouveau_drm { + struct nouveau_cli master; struct nouveau_cli client; struct drm_device *dev; @@ -133,6 +155,9 @@ struct nouveau_drm { struct nouveau_channel 
*chan; struct nvif_object copy; int mtrr; + int type_vram; + int type_host; + int type_ncoh; } ttm; /* GEM interface support */ @@ -204,7 +229,7 @@ void nouveau_drm_device_remove(struct drm_device *dev); #define NV_PRINTK(l,c,f,a...) do { \ struct nouveau_cli *_cli = (c); \ - dev_##l(_cli->dev->dev, "%s: "f, _cli->name, ##a); \ + dev_##l(_cli->drm->dev->dev, "%s: "f, _cli->name, ##a); \ } while(0) #define NV_FATAL(drm,f,a...) NV_PRINTK(crit, &(drm)->client, f, ##a) #define NV_ERROR(drm,f,a...) NV_PRINTK(err, &(drm)->client, f, ##a) diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c index f7707849bb53..c533d8e04afc 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c @@ -48,6 +48,7 @@ #include "nouveau_bo.h" #include "nouveau_fbcon.h" #include "nouveau_chan.h" +#include "nouveau_vmm.h" #include "nouveau_crtc.h" @@ -223,7 +224,7 @@ void nouveau_fbcon_accel_save_disable(struct drm_device *dev) { struct nouveau_drm *drm = nouveau_drm(dev); - if (drm->fbcon) { + if (drm->fbcon && drm->fbcon->helper.fbdev) { drm->fbcon->saved_flags = drm->fbcon->helper.fbdev->flags; drm->fbcon->helper.fbdev->flags |= FBINFO_HWACCEL_DISABLED; } @@ -233,7 +234,7 @@ void nouveau_fbcon_accel_restore(struct drm_device *dev) { struct nouveau_drm *drm = nouveau_drm(dev); - if (drm->fbcon) { + if (drm->fbcon && drm->fbcon->helper.fbdev) { drm->fbcon->helper.fbdev->flags = drm->fbcon->saved_flags; } } @@ -245,7 +246,8 @@ nouveau_fbcon_accel_fini(struct drm_device *dev) struct nouveau_fbdev *fbcon = drm->fbcon; if (fbcon && drm->channel) { console_lock(); - fbcon->helper.fbdev->flags |= FBINFO_HWACCEL_DISABLED; + if (fbcon->helper.fbdev) + fbcon->helper.fbdev->flags |= FBINFO_HWACCEL_DISABLED; console_unlock(); nouveau_channel_idle(drm->channel); nvif_object_fini(&fbcon->twod); @@ -347,7 +349,7 @@ nouveau_fbcon_create(struct drm_fb_helper *helper, chan = nouveau_nofbaccel ? 
NULL : drm->channel; if (chan && device->info.family >= NV_DEVICE_INFO_V0_TESLA) { - ret = nouveau_bo_vma_add(nvbo, drm->client.vm, &fb->vma); + ret = nouveau_vma_new(nvbo, &drm->client.vmm, &fb->vma); if (ret) { NV_ERROR(drm, "failed to map fb into chan: %d\n", ret); chan = NULL; @@ -401,7 +403,7 @@ nouveau_fbcon_create(struct drm_fb_helper *helper, out_unlock: if (chan) - nouveau_bo_vma_del(fb->nvbo, &fb->vma); + nouveau_vma_del(&fb->vma); nouveau_bo_unmap(fb->nvbo); out_unpin: nouveau_bo_unpin(fb->nvbo); @@ -428,7 +430,7 @@ nouveau_fbcon_destroy(struct drm_device *dev, struct nouveau_fbdev *fbcon) drm_fb_helper_fini(&fbcon->helper); if (nouveau_fb->nvbo) { - nouveau_bo_vma_del(nouveau_fb->nvbo, &nouveau_fb->vma); + nouveau_vma_del(&nouveau_fb->vma); nouveau_bo_unmap(nouveau_fb->nvbo); nouveau_bo_unpin(nouveau_fb->nvbo); drm_framebuffer_unreference(&nouveau_fb->base); diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 99e14e3e0fe4..503fa94dc06d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -199,62 +199,6 @@ nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_cha WARN_ON(ret); } -struct nouveau_fence_work { - struct work_struct work; - struct dma_fence_cb cb; - void (*func)(void *); - void *data; -}; - -static void -nouveau_fence_work_handler(struct work_struct *kwork) -{ - struct nouveau_fence_work *work = container_of(kwork, typeof(*work), work); - work->func(work->data); - kfree(work); -} - -static void nouveau_fence_work_cb(struct dma_fence *fence, struct dma_fence_cb *cb) -{ - struct nouveau_fence_work *work = container_of(cb, typeof(*work), cb); - - schedule_work(&work->work); -} - -void -nouveau_fence_work(struct dma_fence *fence, - void (*func)(void *), void *data) -{ - struct nouveau_fence_work *work; - - if (dma_fence_is_signaled(fence)) - goto err; - - work = kmalloc(sizeof(*work), GFP_KERNEL); - if (!work) { - /* - * this might not be a nouveau fence any more, - * so force a lazy wait here - */ - WARN_ON(nouveau_fence_wait((struct nouveau_fence *)fence, - true, false)); - goto err; - } - - INIT_WORK(&work->work, nouveau_fence_work_handler); - work->func = func; - work->data = data; - - if (dma_fence_add_callback(fence, &work->cb, nouveau_fence_work_cb) < 0) - goto err_free; - return; - -err_free: - kfree(work); -err: - func(data); -} - int nouveau_fence_emit(struct nouveau_fence *fence, struct nouveau_channel *chan) { @@ -474,8 +418,6 @@ nouveau_fence_new(struct nouveau_channel *chan, bool sysmem, if (!fence) return -ENOMEM; - fence->sysmem = sysmem; - ret = nouveau_fence_emit(fence, chan); if (ret) nouveau_fence_unref(&fence); diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h index d5e58a38f160..c36031aa013e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.h +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h @@ -12,8 +12,6 @@ struct nouveau_fence { struct list_head head; - bool sysmem; - struct nouveau_channel __rcu *channel; unsigned long timeout; }; @@ -24,7 +22,6 @@ void nouveau_fence_unref(struct nouveau_fence **); int nouveau_fence_emit(struct nouveau_fence *, struct nouveau_channel *); bool nouveau_fence_done(struct nouveau_fence *); -void nouveau_fence_work(struct dma_fence *, void (*)(void *), void *); int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr); int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr); @@ -90,14 +87,12 @@ int 
nouveau_flip_complete(struct nvif_notify *); struct nv84_fence_chan { struct nouveau_fence_chan base; - struct nvkm_vma vma; - struct nvkm_vma vma_gart; + struct nouveau_vma *vma; }; struct nv84_fence_priv { struct nouveau_fence_priv base; struct nouveau_bo *bo; - struct nouveau_bo *bo_gart; u32 *suspend; struct mutex mutex; }; diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 2170534101ca..efc89aaef66a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -31,6 +31,10 @@ #include "nouveau_ttm.h" #include "nouveau_gem.h" +#include "nouveau_mem.h" +#include "nouveau_vmm.h" + +#include <nvif/class.h> void nouveau_gem_object_del(struct drm_gem_object *gem) @@ -64,66 +68,61 @@ nouveau_gem_object_open(struct drm_gem_object *gem, struct drm_file *file_priv) struct nouveau_cli *cli = nouveau_cli(file_priv); struct nouveau_bo *nvbo = nouveau_gem_object(gem); struct nouveau_drm *drm = nouveau_bdev(nvbo->bo.bdev); - struct nvkm_vma *vma; struct device *dev = drm->dev->dev; + struct nouveau_vma *vma; int ret; - if (!cli->vm) + if (cli->vmm.vmm.object.oclass < NVIF_CLASS_VMM_NV50) return 0; ret = ttm_bo_reserve(&nvbo->bo, false, false, NULL); if (ret) return ret; - vma = nouveau_bo_vma_find(nvbo, cli->vm); - if (!vma) { - vma = kzalloc(sizeof(*vma), GFP_KERNEL); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - ret = pm_runtime_get_sync(dev); - if (ret < 0 && ret != -EACCES) { - kfree(vma); - goto out; - } - - ret = nouveau_bo_vma_add(nvbo, cli->vm, vma); - if (ret) - kfree(vma); - - pm_runtime_mark_last_busy(dev); - pm_runtime_put_autosuspend(dev); - } else { - vma->refcount++; - } + ret = pm_runtime_get_sync(dev); + if (ret < 0 && ret != -EACCES) + goto out; + ret = nouveau_vma_new(nvbo, &cli->vmm, &vma); + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); out: ttm_bo_unreserve(&nvbo->bo); return ret; } +struct nouveau_gem_object_unmap { + struct nouveau_cli_work work; + struct nouveau_vma *vma; +}; + static void -nouveau_gem_object_delete(void *data) +nouveau_gem_object_delete(struct nouveau_vma *vma) { - struct nvkm_vma *vma = data; - nvkm_vm_unmap(vma); - nvkm_vm_put(vma); - kfree(vma); + nouveau_vma_del(&vma); } static void -nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nvkm_vma *vma) +nouveau_gem_object_delete_work(struct nouveau_cli_work *w) +{ + struct nouveau_gem_object_unmap *work = + container_of(w, typeof(*work), work); + nouveau_gem_object_delete(work->vma); + kfree(work); +} + +static void +nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nouveau_vma *vma) { const bool mapped = nvbo->bo.mem.mem_type != TTM_PL_SYSTEM; struct reservation_object *resv = nvbo->bo.resv; struct reservation_object_list *fobj; + struct nouveau_gem_object_unmap *work; struct dma_fence *fence = NULL; fobj = reservation_object_get_list(resv); - list_del(&vma->head); + list_del_init(&vma->head); if (fobj && fobj->shared_count > 1) ttm_bo_wait(&nvbo->bo, false, false); @@ -133,14 +132,20 @@ nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nvkm_vma *vma) else fence = reservation_object_get_excl(nvbo->bo.resv); - if (fence && mapped) { - nouveau_fence_work(fence, nouveau_gem_object_delete, vma); - } else { - if (mapped) - nvkm_vm_unmap(vma); - nvkm_vm_put(vma); - kfree(vma); + if (!fence || !mapped) { + nouveau_gem_object_delete(vma); + return; + } + + if (!(work = kmalloc(sizeof(*work), GFP_KERNEL))) { + WARN_ON(dma_fence_wait_timeout(fence, false, 2 * HZ) <= 0); + 
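nouveau_gem_object_unmap above defers freeing a still-mapped VMA until the buffer's fence signals, with two fallbacks: free immediately when nothing can still touch it, and fall back to a bounded wait when the work item cannot be allocated. The same decision tree as a sketch; unmap_vma_deferred is an illustrative wrapper name, while the helpers, struct and fields are the ones this patch introduces:

```c
static void unmap_vma_deferred(struct nouveau_vma *vma,
			       struct dma_fence *fence, bool mapped)
{
	struct nouveau_gem_object_unmap *work;

	if (!fence || !mapped) {	/* safe to free right away */
		nouveau_gem_object_delete(vma);
		return;
	}

	work = kmalloc(sizeof(*work), GFP_KERNEL);
	if (!work) {			/* no memory: bounded wait instead */
		WARN_ON(dma_fence_wait_timeout(fence, false, 2 * HZ) <= 0);
		nouveau_gem_object_delete(vma);
		return;
	}

	work->work.func = nouveau_gem_object_delete_work;
	work->vma = vma;
	nouveau_cli_work_queue(vma->vmm->cli, fence, &work->work);
}
```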
nouveau_gem_object_delete(vma); + return; } + + work->work.func = nouveau_gem_object_delete_work; + work->vma = vma; + nouveau_cli_work_queue(vma->vmm->cli, fence, &work->work); } void @@ -150,19 +155,19 @@ nouveau_gem_object_close(struct drm_gem_object *gem, struct drm_file *file_priv) struct nouveau_bo *nvbo = nouveau_gem_object(gem); struct nouveau_drm *drm = nouveau_bdev(nvbo->bo.bdev); struct device *dev = drm->dev->dev; - struct nvkm_vma *vma; + struct nouveau_vma *vma; int ret; - if (!cli->vm) + if (cli->vmm.vmm.object.oclass < NVIF_CLASS_VMM_NV50) return; ret = ttm_bo_reserve(&nvbo->bo, false, false, NULL); if (ret) return; - vma = nouveau_bo_vma_find(nvbo, cli->vm); + vma = nouveau_vma_find(nvbo, &cli->vmm); if (vma) { - if (--vma->refcount == 0) { + if (--vma->refs == 0) { ret = pm_runtime_get_sync(dev); if (!WARN_ON(ret < 0 && ret != -EACCES)) { nouveau_gem_object_unmap(nvbo, vma); @@ -179,7 +184,7 @@ nouveau_gem_new(struct nouveau_cli *cli, u64 size, int align, uint32_t domain, uint32_t tile_mode, uint32_t tile_flags, struct nouveau_bo **pnvbo) { - struct nouveau_drm *drm = nouveau_drm(cli->dev); + struct nouveau_drm *drm = cli->drm; struct nouveau_bo *nvbo; u32 flags = 0; int ret; @@ -227,7 +232,7 @@ nouveau_gem_info(struct drm_file *file_priv, struct drm_gem_object *gem, { struct nouveau_cli *cli = nouveau_cli(file_priv); struct nouveau_bo *nvbo = nouveau_gem_object(gem); - struct nvkm_vma *vma; + struct nouveau_vma *vma; if (is_power_of_2(nvbo->valid_domains)) rep->domain = nvbo->valid_domains; @@ -236,18 +241,25 @@ nouveau_gem_info(struct drm_file *file_priv, struct drm_gem_object *gem, else rep->domain = NOUVEAU_GEM_DOMAIN_VRAM; rep->offset = nvbo->bo.offset; - if (cli->vm) { - vma = nouveau_bo_vma_find(nvbo, cli->vm); + if (cli->vmm.vmm.object.oclass >= NVIF_CLASS_VMM_NV50) { + vma = nouveau_vma_find(nvbo, &cli->vmm); if (!vma) return -EINVAL; - rep->offset = vma->offset; + rep->offset = vma->addr; } rep->size = nvbo->bo.mem.num_pages << PAGE_SHIFT; rep->map_handle = drm_vma_node_offset_addr(&nvbo->bo.vma_node); - rep->tile_mode = nvbo->tile_mode; - rep->tile_flags = nvbo->tile_flags; + rep->tile_mode = nvbo->mode; + rep->tile_flags = nvbo->contig ? 
0 : NOUVEAU_GEM_TILE_NONCONTIG; + if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) + rep->tile_flags |= nvbo->kind << 8; + else + if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) + rep->tile_flags |= nvbo->kind << 8 | nvbo->comp << 16; + else + rep->tile_flags |= nvbo->zeta; return 0; } @@ -255,18 +267,11 @@ int nouveau_gem_ioctl_new(struct drm_device *dev, void *data, struct drm_file *file_priv) { - struct nouveau_drm *drm = nouveau_drm(dev); struct nouveau_cli *cli = nouveau_cli(file_priv); - struct nvkm_fb *fb = nvxx_fb(&drm->client.device); struct drm_nouveau_gem_new *req = data; struct nouveau_bo *nvbo = NULL; int ret = 0; - if (!nvkm_fb_memtype_valid(fb, req->info.tile_flags)) { - NV_PRINTK(err, cli, "bad page flags: 0x%08x\n", req->info.tile_flags); - return -EINVAL; - } - ret = nouveau_gem_new(cli, req->info.size, req->align, req->info.domain, req->info.tile_mode, req->info.tile_flags, &nvbo); @@ -791,7 +796,7 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data, bo[push[i].bo_index].user_priv; uint32_t cmd; - cmd = chan->push.vma.offset + ((chan->dma.cur + 2) << 2); + cmd = chan->push.addr + ((chan->dma.cur + 2) << 2); cmd |= 0x20000000; if (unlikely(cmd != req->suffix0)) { if (!nvbo->kmap.virtual) { @@ -843,7 +848,7 @@ out_next: req->suffix1 = 0x00000000; } else { req->suffix0 = 0x20000000 | - (chan->push.vma.offset + ((chan->dma.cur + 2) << 2)); + (chan->push.addr + ((chan->dma.cur + 2) << 2)); req->suffix1 = 0x00000000; } diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.h b/drivers/gpu/drm/nouveau/nouveau_gem.h index 8fa6ed9ddd3a..d39f845dda87 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.h +++ b/drivers/gpu/drm/nouveau/nouveau_gem.h @@ -6,9 +6,6 @@ #include "nouveau_drv.h" #include "nouveau_bo.h" -#define nouveau_bo_tile_layout(nvbo) \ - ((nvbo)->tile_flags & NOUVEAU_GEM_TILE_LAYOUT_MASK) - static inline struct nouveau_bo * nouveau_gem_object(struct drm_gem_object *gem) { diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c new file mode 100644 index 000000000000..589a9621db76 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_mem.c @@ -0,0 +1,198 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "nouveau_mem.h" +#include "nouveau_drv.h" +#include "nouveau_bo.h" + +#include <drm/ttm/ttm_bo_driver.h> + +#include <nvif/class.h> +#include <nvif/if000a.h> +#include <nvif/if500b.h> +#include <nvif/if500d.h> +#include <nvif/if900b.h> +#include <nvif/if900d.h> + +int +nouveau_mem_map(struct nouveau_mem *mem, + struct nvif_vmm *vmm, struct nvif_vma *vma) +{ + union { + struct nv50_vmm_map_v0 nv50; + struct gf100_vmm_map_v0 gf100; + } args; + u32 argc = 0; + bool super; + int ret; + + switch (vmm->object.oclass) { + case NVIF_CLASS_VMM_NV04: + break; + case NVIF_CLASS_VMM_NV50: + args.nv50.version = 0; + args.nv50.ro = 0; + args.nv50.priv = 0; + args.nv50.kind = mem->kind; + args.nv50.comp = mem->comp; + argc = sizeof(args.nv50); + break; + case NVIF_CLASS_VMM_GF100: + case NVIF_CLASS_VMM_GM200: + case NVIF_CLASS_VMM_GP100: + args.gf100.version = 0; + if (mem->mem.type & NVIF_MEM_VRAM) + args.gf100.vol = 0; + else + args.gf100.vol = 1; + args.gf100.ro = 0; + args.gf100.priv = 0; + args.gf100.kind = mem->kind; + argc = sizeof(args.gf100); + break; + default: + WARN_ON(1); + return -ENOSYS; + } + + super = vmm->object.client->super; + vmm->object.client->super = true; + ret = nvif_vmm_map(vmm, vma->addr, mem->mem.size, &args, argc, + &mem->mem, 0); + vmm->object.client->super = super; + return ret; +} + +void +nouveau_mem_fini(struct nouveau_mem *mem) +{ + nvif_vmm_put(&mem->cli->drm->client.vmm.vmm, &mem->vma[1]); + nvif_vmm_put(&mem->cli->drm->client.vmm.vmm, &mem->vma[0]); + mutex_lock(&mem->cli->drm->master.lock); + nvif_mem_fini(&mem->mem); + mutex_unlock(&mem->cli->drm->master.lock); +} + +int +nouveau_mem_host(struct ttm_mem_reg *reg, struct ttm_dma_tt *tt) +{ + struct nouveau_mem *mem = nouveau_mem(reg); + struct nouveau_cli *cli = mem->cli; + struct nouveau_drm *drm = cli->drm; + struct nvif_mmu *mmu = &cli->mmu; + struct nvif_mem_ram_v0 args = {}; + bool super = cli->base.super; + u8 type; + int ret; + + if (mmu->type[drm->ttm.type_host].type & NVIF_MEM_UNCACHED) + type = drm->ttm.type_ncoh; + else + type = drm->ttm.type_host; + + if (mem->kind && !(mmu->type[type].type & NVIF_MEM_KIND)) + mem->comp = mem->kind = 0; + if (mem->comp && !(mmu->type[type].type & NVIF_MEM_COMP)) { + if (mmu->object.oclass >= NVIF_CLASS_MMU_GF100) + mem->kind = mmu->kind[mem->kind]; + mem->comp = 0; + } + + if (tt->ttm.sg) args.sgl = tt->ttm.sg->sgl; + else args.dma = tt->dma_address; + + mutex_lock(&drm->master.lock); + cli->base.super = true; + ret = nvif_mem_init_type(mmu, cli->mem->oclass, type, PAGE_SHIFT, + reg->num_pages << PAGE_SHIFT, + &args, sizeof(args), &mem->mem); + cli->base.super = super; + mutex_unlock(&drm->master.lock); + return ret; +} + +int +nouveau_mem_vram(struct ttm_mem_reg *reg, bool contig, u8 page) +{ + struct nouveau_mem *mem = nouveau_mem(reg); + struct nouveau_cli *cli = mem->cli; + struct nouveau_drm *drm = cli->drm; + struct nvif_mmu *mmu = &cli->mmu; + bool super = cli->base.super; + u64 size = ALIGN(reg->num_pages << PAGE_SHIFT, 1 << page); + int ret; + + mutex_lock(&drm->master.lock); + cli->base.super = true; + switch (cli->mem->oclass) { + case NVIF_CLASS_MEM_GF100: + ret = nvif_mem_init_type(mmu, cli->mem->oclass, + drm->ttm.type_vram, page, size, + &(struct gf100_mem_v0) { + .contig = contig, + }, sizeof(struct gf100_mem_v0), + &mem->mem); + break; + case NVIF_CLASS_MEM_NV50: + ret = nvif_mem_init_type(mmu, cli->mem->oclass, + drm->ttm.type_vram, page, size, + &(struct nv50_mem_v0) { + .bankswz = mmu->kind[mem->kind] == 2, + .contig = contig, + }, 
sizeof(struct nv50_mem_v0), + &mem->mem); + break; + default: + ret = -ENOSYS; + WARN_ON(1); + break; + } + cli->base.super = super; + mutex_unlock(&drm->master.lock); + + reg->start = mem->mem.addr >> PAGE_SHIFT; + return ret; +} + +void +nouveau_mem_del(struct ttm_mem_reg *reg) +{ + struct nouveau_mem *mem = nouveau_mem(reg); + nouveau_mem_fini(mem); + kfree(reg->mm_node); + reg->mm_node = NULL; +} + +int +nouveau_mem_new(struct nouveau_cli *cli, u8 kind, u8 comp, + struct ttm_mem_reg *reg) +{ + struct nouveau_mem *mem; + + if (!(mem = kzalloc(sizeof(*mem), GFP_KERNEL))) + return -ENOMEM; + mem->cli = cli; + mem->kind = kind; + mem->comp = comp; + + reg->mm_node = mem; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.h b/drivers/gpu/drm/nouveau/nouveau_mem.h new file mode 100644 index 000000000000..f6d039e73812 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_mem.h @@ -0,0 +1,30 @@ +#ifndef __NOUVEAU_MEM_H__ +#define __NOUVEAU_MEM_H__ +#include <drm/ttm/ttm_bo_api.h> +struct ttm_dma_tt; + +#include <nvif/mem.h> +#include <nvif/vmm.h> + +static inline struct nouveau_mem * +nouveau_mem(struct ttm_mem_reg *reg) +{ + return reg->mm_node; +} + +struct nouveau_mem { + struct nouveau_cli *cli; + u8 kind; + u8 comp; + struct nvif_mem mem; + struct nvif_vma vma[2]; +}; + +int nouveau_mem_new(struct nouveau_cli *, u8 kind, u8 comp, + struct ttm_mem_reg *); +void nouveau_mem_del(struct ttm_mem_reg *); +int nouveau_mem_vram(struct ttm_mem_reg *, bool contig, u8 page); +int nouveau_mem_host(struct ttm_mem_reg *, struct ttm_dma_tt *); +void nouveau_mem_fini(struct nouveau_mem *); +int nouveau_mem_map(struct nouveau_mem *, struct nvif_vmm *, struct nvif_vma *); +#endif diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c index b7ab268f7d6f..941bf33bd249 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c +++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c @@ -2,6 +2,7 @@ #include <linux/slab.h> #include "nouveau_drv.h" +#include "nouveau_mem.h" #include "nouveau_ttm.h" struct nouveau_sgdma_be { @@ -9,7 +10,7 @@ struct nouveau_sgdma_be { * nouve_bo.c works properly, otherwise have to move them here */ struct ttm_dma_tt ttm; - struct nvkm_mem *node; + struct nouveau_mem *mem; }; static void @@ -27,19 +28,20 @@ static int nv04_sgdma_bind(struct ttm_tt *ttm, struct ttm_mem_reg *reg) { struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm; - struct nvkm_mem *node = reg->mm_node; - - if (ttm->sg) { - node->sg = ttm->sg; - node->pages = NULL; - } else { - node->sg = NULL; - node->pages = nvbe->ttm.dma_address; + struct nouveau_mem *mem = nouveau_mem(reg); + int ret; + + ret = nouveau_mem_host(reg, &nvbe->ttm); + if (ret) + return ret; + + ret = nouveau_mem_map(mem, &mem->cli->vmm.vmm, &mem->vma[0]); + if (ret) { + nouveau_mem_fini(mem); + return ret; } - node->size = (reg->num_pages << PAGE_SHIFT) >> 12; - nvkm_vm_map(&node->vma[0], node); - nvbe->node = node; + nvbe->mem = mem; return 0; } @@ -47,7 +49,7 @@ static int nv04_sgdma_unbind(struct ttm_tt *ttm) { struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm; - nvkm_vm_unmap(&nvbe->node->vma[0]); + nouveau_mem_fini(nvbe->mem); return 0; } @@ -61,30 +63,20 @@ static int nv50_sgdma_bind(struct ttm_tt *ttm, struct ttm_mem_reg *reg) { struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm; - struct nvkm_mem *node = reg->mm_node; - - /* noop: bound in move_notify() */ - if (ttm->sg) { - node->sg = ttm->sg; - node->pages = NULL; - } else { - node->sg = NULL; - node->pages = 
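The new nouveau_mem wrapper pairs an nvif_mem with up to two VMAs, and the reworked TTM managers below consume it through a new/backing/del lifecycle. A sketch of that lifecycle for the VRAM case; alloc_vram_reg is an illustrative name and error handling is abbreviated:

```c
static int alloc_vram_reg(struct nouveau_cli *cli, u8 kind, u8 comp,
			  bool contig, u8 page, struct ttm_mem_reg *reg)
{
	int ret = nouveau_mem_new(cli, kind, comp, reg); /* sets reg->mm_node */
	if (ret)
		return ret;

	ret = nouveau_mem_vram(reg, contig, page);	/* nvif_mem backing */
	if (ret)
		nouveau_mem_del(reg);	/* nouveau_mem_fini() + kfree()    */
	return ret;
}
```

The real manager additionally converts -ENOSPC into a NULL mm_node and a zero return, which is TTM's convention for "no space here, try another placement".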
nvbe->ttm.dma_address; - } - node->size = (reg->num_pages << PAGE_SHIFT) >> 12; - return 0; -} + struct nouveau_mem *mem = nouveau_mem(reg); + int ret; -static int -nv50_sgdma_unbind(struct ttm_tt *ttm) -{ - /* noop: unbound in move_notify() */ + ret = nouveau_mem_host(reg, &nvbe->ttm); + if (ret) + return ret; + + nvbe->mem = mem; return 0; } static struct ttm_backend_func nv50_sgdma_backend = { .bind = nv50_sgdma_bind, - .unbind = nv50_sgdma_unbind, + .unbind = nv04_sgdma_unbind, .destroy = nouveau_sgdma_destroy }; diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index b0ad7fcefcf5..08b974b30482 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -23,53 +23,37 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ - #include "nouveau_drv.h" -#include "nouveau_ttm.h" #include "nouveau_gem.h" +#include "nouveau_mem.h" +#include "nouveau_ttm.h" #include <drm/drm_legacy.h> #include <core/tegra.h> static int -nouveau_vram_manager_init(struct ttm_mem_type_manager *man, unsigned long psize) +nouveau_manager_init(struct ttm_mem_type_manager *man, unsigned long psize) { - struct nouveau_drm *drm = nouveau_bdev(man->bdev); - struct nvkm_fb *fb = nvxx_fb(&drm->client.device); - man->priv = fb; return 0; } static int -nouveau_vram_manager_fini(struct ttm_mem_type_manager *man) +nouveau_manager_fini(struct ttm_mem_type_manager *man) { - man->priv = NULL; return 0; } -static inline void -nvkm_mem_node_cleanup(struct nvkm_mem *node) +static void +nouveau_manager_del(struct ttm_mem_type_manager *man, struct ttm_mem_reg *reg) { - if (node->vma[0].node) { - nvkm_vm_unmap(&node->vma[0]); - nvkm_vm_put(&node->vma[0]); - } - - if (node->vma[1].node) { - nvkm_vm_unmap(&node->vma[1]); - nvkm_vm_put(&node->vma[1]); - } + nouveau_mem_del(reg); } static void -nouveau_vram_manager_del(struct ttm_mem_type_manager *man, - struct ttm_mem_reg *reg) +nouveau_manager_debug(struct ttm_mem_type_manager *man, + struct drm_printer *printer) { - struct nouveau_drm *drm = nouveau_bdev(man->bdev); - struct nvkm_ram *ram = nvxx_fb(&drm->client.device)->ram; - nvkm_mem_node_cleanup(reg->mm_node); - ram->func->put(ram, (struct nvkm_mem **)®->mm_node); } static int @@ -78,192 +62,105 @@ nouveau_vram_manager_new(struct ttm_mem_type_manager *man, const struct ttm_place *place, struct ttm_mem_reg *reg) { - struct nouveau_drm *drm = nouveau_bdev(man->bdev); - struct nvkm_ram *ram = nvxx_fb(&drm->client.device)->ram; struct nouveau_bo *nvbo = nouveau_bo(bo); - struct nvkm_mem *node; - u32 size_nc = 0; + struct nouveau_drm *drm = nvbo->cli->drm; + struct nouveau_mem *mem; int ret; if (drm->client.device.info.ram_size == 0) return -ENOMEM; - if (nvbo->tile_flags & NOUVEAU_GEM_TILE_NONCONTIG) - size_nc = 1 << nvbo->page_shift; + ret = nouveau_mem_new(&drm->master, nvbo->kind, nvbo->comp, reg); + mem = nouveau_mem(reg); + if (ret) + return ret; - ret = ram->func->get(ram, reg->num_pages << PAGE_SHIFT, - reg->page_alignment << PAGE_SHIFT, size_nc, - (nvbo->tile_flags >> 8) & 0x3ff, &node); + ret = nouveau_mem_vram(reg, nvbo->contig, nvbo->page); if (ret) { - reg->mm_node = NULL; - return (ret == -ENOSPC) ? 
0 : ret; + nouveau_mem_del(reg); + if (ret == -ENOSPC) { + reg->mm_node = NULL; + return 0; + } + return ret; } - node->page_shift = nvbo->page_shift; - - reg->mm_node = node; - reg->start = node->offset >> PAGE_SHIFT; return 0; } const struct ttm_mem_type_manager_func nouveau_vram_manager = { - .init = nouveau_vram_manager_init, - .takedown = nouveau_vram_manager_fini, + .init = nouveau_manager_init, + .takedown = nouveau_manager_fini, .get_node = nouveau_vram_manager_new, - .put_node = nouveau_vram_manager_del, + .put_node = nouveau_manager_del, + .debug = nouveau_manager_debug, }; static int -nouveau_gart_manager_init(struct ttm_mem_type_manager *man, unsigned long psize) -{ - return 0; -} - -static int -nouveau_gart_manager_fini(struct ttm_mem_type_manager *man) -{ - return 0; -} - -static void -nouveau_gart_manager_del(struct ttm_mem_type_manager *man, - struct ttm_mem_reg *reg) -{ - nvkm_mem_node_cleanup(reg->mm_node); - kfree(reg->mm_node); - reg->mm_node = NULL; -} - -static int nouveau_gart_manager_new(struct ttm_mem_type_manager *man, struct ttm_buffer_object *bo, const struct ttm_place *place, struct ttm_mem_reg *reg) { - struct nouveau_drm *drm = nouveau_bdev(bo->bdev); struct nouveau_bo *nvbo = nouveau_bo(bo); - struct nvkm_mem *node; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; + struct nouveau_drm *drm = nvbo->cli->drm; + struct nouveau_mem *mem; + int ret; - node->page_shift = 12; - - switch (drm->client.device.info.family) { - case NV_DEVICE_INFO_V0_TNT: - case NV_DEVICE_INFO_V0_CELSIUS: - case NV_DEVICE_INFO_V0_KELVIN: - case NV_DEVICE_INFO_V0_RANKINE: - case NV_DEVICE_INFO_V0_CURIE: - break; - case NV_DEVICE_INFO_V0_TESLA: - if (drm->client.device.info.chipset != 0x50) - node->memtype = (nvbo->tile_flags & 0x7f00) >> 8; - break; - case NV_DEVICE_INFO_V0_FERMI: - case NV_DEVICE_INFO_V0_KEPLER: - case NV_DEVICE_INFO_V0_MAXWELL: - case NV_DEVICE_INFO_V0_PASCAL: - node->memtype = (nvbo->tile_flags & 0xff00) >> 8; - break; - default: - NV_WARN(drm, "%s: unhandled family type %x\n", __func__, - drm->client.device.info.family); - break; - } + ret = nouveau_mem_new(&drm->master, nvbo->kind, nvbo->comp, reg); + mem = nouveau_mem(reg); + if (ret) + return ret; - reg->mm_node = node; - reg->start = 0; + reg->start = 0; return 0; } -static void -nouveau_gart_manager_debug(struct ttm_mem_type_manager *man, - struct drm_printer *printer) -{ -} - const struct ttm_mem_type_manager_func nouveau_gart_manager = { - .init = nouveau_gart_manager_init, - .takedown = nouveau_gart_manager_fini, + .init = nouveau_manager_init, + .takedown = nouveau_manager_fini, .get_node = nouveau_gart_manager_new, - .put_node = nouveau_gart_manager_del, - .debug = nouveau_gart_manager_debug + .put_node = nouveau_manager_del, + .debug = nouveau_manager_debug }; -/*XXX*/ -#include <subdev/mmu/nv04.h> -static int -nv04_gart_manager_init(struct ttm_mem_type_manager *man, unsigned long psize) -{ - struct nouveau_drm *drm = nouveau_bdev(man->bdev); - struct nvkm_mmu *mmu = nvxx_mmu(&drm->client.device); - struct nv04_mmu *priv = (void *)mmu; - struct nvkm_vm *vm = NULL; - nvkm_vm_ref(priv->vm, &vm, NULL); - man->priv = vm; - return 0; -} - -static int -nv04_gart_manager_fini(struct ttm_mem_type_manager *man) -{ - struct nvkm_vm *vm = man->priv; - nvkm_vm_ref(NULL, &vm, NULL); - man->priv = NULL; - return 0; -} - -static void -nv04_gart_manager_del(struct ttm_mem_type_manager *man, struct ttm_mem_reg *reg) -{ - struct nvkm_mem *node = reg->mm_node; - if (node->vma[0].node) - 
nvkm_vm_put(&node->vma[0]); - kfree(reg->mm_node); - reg->mm_node = NULL; -} - static int nv04_gart_manager_new(struct ttm_mem_type_manager *man, struct ttm_buffer_object *bo, const struct ttm_place *place, struct ttm_mem_reg *reg) { - struct nvkm_mem *node; + struct nouveau_bo *nvbo = nouveau_bo(bo); + struct nouveau_drm *drm = nvbo->cli->drm; + struct nouveau_mem *mem; int ret; - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - node->page_shift = 12; + ret = nouveau_mem_new(&drm->master, nvbo->kind, nvbo->comp, reg); + mem = nouveau_mem(reg); + if (ret) + return ret; - ret = nvkm_vm_get(man->priv, reg->num_pages << 12, node->page_shift, - NV_MEM_ACCESS_RW, &node->vma[0]); + ret = nvif_vmm_get(&mem->cli->vmm.vmm, PTES, false, 12, 0, + reg->num_pages << PAGE_SHIFT, &mem->vma[0]); if (ret) { - kfree(node); + nouveau_mem_del(reg); + if (ret == -ENOSPC) { + reg->mm_node = NULL; + return 0; + } return ret; } - reg->mm_node = node; - reg->start = node->vma[0].offset >> PAGE_SHIFT; + reg->start = mem->vma[0].addr >> PAGE_SHIFT; return 0; } -static void -nv04_gart_manager_debug(struct ttm_mem_type_manager *man, - struct drm_printer *printer) -{ -} - const struct ttm_mem_type_manager_func nv04_gart_manager = { - .init = nv04_gart_manager_init, - .takedown = nv04_gart_manager_fini, + .init = nouveau_manager_init, + .takedown = nouveau_manager_fini, .get_node = nv04_gart_manager_new, - .put_node = nv04_gart_manager_del, - .debug = nv04_gart_manager_debug + .put_node = nouveau_manager_del, + .debug = nouveau_manager_debug }; int @@ -343,44 +240,43 @@ nouveau_ttm_init(struct nouveau_drm *drm) { struct nvkm_device *device = nvxx_device(&drm->client.device); struct nvkm_pci *pci = device->pci; + struct nvif_mmu *mmu = &drm->client.mmu; struct drm_device *dev = drm->dev; - u8 bits; - int ret; + int typei, ret; - if (pci && pci->agp.bridge) { - drm->agp.bridge = pci->agp.bridge; - drm->agp.base = pci->agp.base; - drm->agp.size = pci->agp.size; - drm->agp.cma = pci->agp.cma; - } + typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE | + NVIF_MEM_COHERENT); + if (typei < 0) + return -ENOSYS; - bits = nvxx_mmu(&drm->client.device)->dma_bits; - if (nvxx_device(&drm->client.device)->func->pci) { - if (drm->agp.bridge) - bits = 32; - } else if (device->func->tegra) { - struct nvkm_device_tegra *tegra = device->func->tegra(device); + drm->ttm.type_host = typei; - /* - * If the platform can use a IOMMU, then the addressable DMA - * space is constrained by the IOMMU bit - */ - if (tegra->func->iommu_bit) - bits = min(bits, tegra->func->iommu_bit); + typei = nvif_mmu_type(mmu, NVIF_MEM_HOST | NVIF_MEM_MAPPABLE); + if (typei < 0) + return -ENOSYS; - } + drm->ttm.type_ncoh = typei; - ret = dma_set_mask(dev->dev, DMA_BIT_MASK(bits)); - if (ret && bits != 32) { - bits = 32; - ret = dma_set_mask(dev->dev, DMA_BIT_MASK(bits)); + if (drm->client.device.info.platform != NV_DEVICE_INFO_V0_SOC && + drm->client.device.info.family >= NV_DEVICE_INFO_V0_TESLA) { + typei = nvif_mmu_type(mmu, NVIF_MEM_VRAM | NVIF_MEM_MAPPABLE | + NVIF_MEM_KIND | + NVIF_MEM_COMP | + NVIF_MEM_DISP); + if (typei < 0) + return -ENOSYS; + + drm->ttm.type_vram = typei; + } else { + drm->ttm.type_vram = -1; } - if (ret) - return ret; - ret = dma_set_coherent_mask(dev->dev, DMA_BIT_MASK(bits)); - if (ret) - dma_set_coherent_mask(dev->dev, DMA_BIT_MASK(32)); + if (pci && pci->agp.bridge) { + drm->agp.bridge = pci->agp.bridge; + drm->agp.base = pci->agp.base; + drm->agp.size = pci->agp.size; + drm->agp.cma = 
pci->agp.cma; + } ret = nouveau_ttm_global_init(drm); if (ret) @@ -391,7 +287,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) &nouveau_bo_driver, dev->anon_inode->i_mapping, DRM_FILE_PAGE_OFFSET, - bits <= 32 ? true : false); + drm->client.mmu.dmabits <= 32 ? true : false); if (ret) { NV_ERROR(drm, "error initialising bo driver, %d\n", ret); return ret; @@ -415,7 +311,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) /* GART init */ if (!drm->agp.bridge) { - drm->gem.gart_available = nvxx_mmu(&drm->client.device)->limit; + drm->gem.gart_available = drm->client.vmm.vmm.limit; } else { drm->gem.gart_available = drm->agp.size; } diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c new file mode 100644 index 000000000000..9e2628dd8e4d --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c @@ -0,0 +1,135 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
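Editorial note on the nouveau_ttm_init() rework above: instead of hardcoding DMA placement, the driver now asks the MMU for a memory-type index whose capability flags are a superset of what it needs (NVIF_MEM_HOST | NVIF_MEM_MAPPABLE | NVIF_MEM_COHERENT, and so on), and nvif_mem_init() further down in this diff reuses the same superset-matching loop. A minimal, self-contained sketch of that idiom follows; the flag values and the type table are illustrative stand-ins, not the real NVIF bits.

	/* Pick the first memory type whose capability bits cover everything
	 * the caller asked for, the same test nvif_mem_init() applies with
	 * "(mmu->type[i].type & type) == type".  Flags invented for the
	 * sketch. */
	#include <stdio.h>

	#define MEM_VRAM     0x01
	#define MEM_HOST     0x02
	#define MEM_MAPPABLE 0x04
	#define MEM_COHERENT 0x08

	static int
	mem_type_find(const unsigned char *types, int nr, unsigned char want)
	{
		int i;
		for (i = 0; i < nr; i++) {
			if ((types[i] & want) == want)
				return i;	/* first superset match wins */
		}
		return -1;	/* no match: the callers above fail with -ENOSYS */
	}

	int
	main(void)
	{
		const unsigned char types[] = {
			MEM_VRAM | MEM_MAPPABLE,
			MEM_HOST | MEM_MAPPABLE | MEM_COHERENT,
			MEM_HOST | MEM_MAPPABLE,
		};
		/* host, mappable and coherent resolves to index 1 here */
		printf("%d\n", mem_type_find(types, 3,
		       MEM_HOST | MEM_MAPPABLE | MEM_COHERENT));
		return 0;
	}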
+ */ +#include "nouveau_vmm.h" +#include "nouveau_drv.h" +#include "nouveau_bo.h" +#include "nouveau_mem.h" + +void +nouveau_vma_unmap(struct nouveau_vma *vma) +{ + if (vma->mem) { + nvif_vmm_unmap(&vma->vmm->vmm, vma->addr); + vma->mem = NULL; + } +} + +int +nouveau_vma_map(struct nouveau_vma *vma, struct nouveau_mem *mem) +{ + struct nvif_vma tmp = { .addr = vma->addr }; + int ret = nouveau_mem_map(mem, &vma->vmm->vmm, &tmp); + if (ret) + return ret; + vma->mem = mem; + return 0; +} + +struct nouveau_vma * +nouveau_vma_find(struct nouveau_bo *nvbo, struct nouveau_vmm *vmm) +{ + struct nouveau_vma *vma; + + list_for_each_entry(vma, &nvbo->vma_list, head) { + if (vma->vmm == vmm) + return vma; + } + + return NULL; +} + +void +nouveau_vma_del(struct nouveau_vma **pvma) +{ + struct nouveau_vma *vma = *pvma; + if (vma && --vma->refs <= 0) { + if (likely(vma->addr != ~0ULL)) { + struct nvif_vma tmp = { .addr = vma->addr, .size = 1 }; + nvif_vmm_put(&vma->vmm->vmm, &tmp); + } + list_del(&vma->head); + *pvma = NULL; + kfree(*pvma); + } +} + +int +nouveau_vma_new(struct nouveau_bo *nvbo, struct nouveau_vmm *vmm, + struct nouveau_vma **pvma) +{ + struct nouveau_mem *mem = nouveau_mem(&nvbo->bo.mem); + struct nouveau_vma *vma; + struct nvif_vma tmp; + int ret; + + if ((vma = *pvma = nouveau_vma_find(nvbo, vmm))) { + vma->refs++; + return 0; + } + + if (!(vma = *pvma = kmalloc(sizeof(*vma), GFP_KERNEL))) + return -ENOMEM; + vma->vmm = vmm; + vma->refs = 1; + vma->addr = ~0ULL; + vma->mem = NULL; + list_add_tail(&vma->head, &nvbo->vma_list); + + if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM && + mem->mem.page == nvbo->page) { + ret = nvif_vmm_get(&vmm->vmm, LAZY, false, mem->mem.page, 0, + mem->mem.size, &tmp); + if (ret) + goto done; + + vma->addr = tmp.addr; + ret = nouveau_vma_map(vma, mem); + } else { + ret = nvif_vmm_get(&vmm->vmm, PTES, false, mem->mem.page, 0, + mem->mem.size, &tmp); + vma->addr = tmp.addr; + } + +done: + if (ret) + nouveau_vma_del(pvma); + return ret; +} + +void +nouveau_vmm_fini(struct nouveau_vmm *vmm) +{ + nvif_vmm_fini(&vmm->vmm); + vmm->cli = NULL; +} + +int +nouveau_vmm_init(struct nouveau_cli *cli, s32 oclass, struct nouveau_vmm *vmm) +{ + int ret = nvif_vmm_init(&cli->mmu, oclass, PAGE_SIZE, 0, NULL, 0, + &vmm->vmm); + if (ret) + return ret; + + vmm->cli = cli; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.h b/drivers/gpu/drm/nouveau/nouveau_vmm.h new file mode 100644 index 000000000000..5c31f43678d3 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.h @@ -0,0 +1,31 @@ +#ifndef __NOUVEAU_VMA_H__ +#define __NOUVEAU_VMA_H__ +#include <nvif/vmm.h> +struct nouveau_bo; +struct nouveau_mem; + +struct nouveau_vma { + struct nouveau_vmm *vmm; + int refs; + struct list_head head; + u64 addr; + + struct nouveau_mem *mem; +}; + +struct nouveau_vma *nouveau_vma_find(struct nouveau_bo *, struct nouveau_vmm *); +int nouveau_vma_new(struct nouveau_bo *, struct nouveau_vmm *, + struct nouveau_vma **); +void nouveau_vma_del(struct nouveau_vma **); +int nouveau_vma_map(struct nouveau_vma *, struct nouveau_mem *); +void nouveau_vma_unmap(struct nouveau_vma *); + +struct nouveau_vmm { + struct nouveau_cli *cli; + struct nvif_vmm vmm; + struct nvkm_vm *vm; +}; + +int nouveau_vmm_init(struct nouveau_cli *, s32 oclass, struct nouveau_vmm *); +void nouveau_vmm_fini(struct nouveau_vmm *); +#endif diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c index 2dbf62a2ac41..92d46222c79d 100644 --- a/drivers/gpu/drm/nouveau/nv50_display.c 
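One thing worth flagging in nouveau_vma_del() above: the function clears *pvma before calling kfree(*pvma), so kfree() is always handed NULL (a no-op) and the nouveau_vma structure itself is never freed. Assuming the intent is simply to free the vma and leave the caller holding a NULL pointer, the tail of the function presumably wants to be ordered like this:

		list_del(&vma->head);
		kfree(*pvma);	/* free the vma while *pvma still points at it */
		*pvma = NULL;	/* then clear the caller's pointer */

Equivalently, kfree(vma) would do, since the local variable still holds the pointer.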
+++ b/drivers/gpu/drm/nouveau/nv50_display.c @@ -318,7 +318,7 @@ nv50_chan_create(struct nvif_device *device, struct nvif_object *disp, ret = nvif_object_init(disp, 0, oclass[0], data, size, &chan->user); if (ret == 0) - nvif_object_map(&chan->user); + nvif_object_map(&chan->user, NULL, 0); nvif_object_sclass_put(&sclass); return ret; } @@ -424,7 +424,7 @@ nv50_dmac_ctxdma_new(struct nv50_dmac *dmac, struct nouveau_framebuffer *fb) { struct nouveau_drm *drm = nouveau_drm(fb->base.dev); struct nv50_dmac_ctxdma *ctxdma; - const u8 kind = (fb->nvbo->tile_flags & 0x0000ff00) >> 8; + const u8 kind = fb->nvbo->kind; const u32 handle = 0xfb000000 | kind; struct { struct nv_dma_v0 base; @@ -510,6 +510,7 @@ nv50_dmac_create(struct nvif_device *device, struct nvif_object *disp, int ret; mutex_init(&dmac->lock); + INIT_LIST_HEAD(&dmac->ctxdma); dmac->ptr = dma_alloc_coherent(nvxx_device(device)->dev, PAGE_SIZE, &dmac->handle, GFP_KERNEL); @@ -556,7 +557,6 @@ nv50_dmac_create(struct nvif_device *device, struct nvif_object *disp, if (ret) return ret; - INIT_LIST_HEAD(&dmac->ctxdma); return ret; } @@ -847,7 +847,7 @@ nv50_wndw_atomic_check_acquire(struct nv50_wndw *wndw, asyw->image.w = fb->base.width; asyw->image.h = fb->base.height; - asyw->image.kind = (fb->nvbo->tile_flags & 0x0000ff00) >> 8; + asyw->image.kind = fb->nvbo->kind; if (asyh->state.pageflip_flags & DRM_MODE_PAGE_FLIP_ASYNC) asyw->interval = 0; @@ -857,9 +857,9 @@ nv50_wndw_atomic_check_acquire(struct nv50_wndw *wndw, if (asyw->image.kind) { asyw->image.layout = 0; if (drm->client.device.info.chipset >= 0xc0) - asyw->image.block = fb->nvbo->tile_mode >> 4; + asyw->image.block = fb->nvbo->mode >> 4; else - asyw->image.block = fb->nvbo->tile_mode; + asyw->image.block = fb->nvbo->mode; asyw->image.pitch = (fb->base.pitches[0] / 4) << 4; } else { asyw->image.layout = 1; @@ -3265,11 +3265,14 @@ nv50_mstm = { void nv50_mstm_service(struct nv50_mstm *mstm) { - struct drm_dp_aux *aux = mstm->mgr.aux; + struct drm_dp_aux *aux = mstm ? 
mstm->mgr.aux : NULL; bool handled = true; int ret; u8 esi[8] = {}; + if (!aux) + return; + while (handled) { ret = drm_dp_dpcd_read(aux, DP_SINK_COUNT_ESI, esi, 8); if (ret != 8) { diff --git a/drivers/gpu/drm/nouveau/nv50_fbcon.c b/drivers/gpu/drm/nouveau/nv50_fbcon.c index 327dcd7901ed..facd18564e0d 100644 --- a/drivers/gpu/drm/nouveau/nv50_fbcon.c +++ b/drivers/gpu/drm/nouveau/nv50_fbcon.c @@ -25,6 +25,7 @@ #include "nouveau_drv.h" #include "nouveau_dma.h" #include "nouveau_fbcon.h" +#include "nouveau_vmm.h" int nv50_fbcon_fillrect(struct fb_info *info, const struct fb_fillrect *rect) @@ -239,8 +240,8 @@ nv50_fbcon_accel_init(struct fb_info *info) OUT_RING(chan, info->fix.line_length); OUT_RING(chan, info->var.xres_virtual); OUT_RING(chan, info->var.yres_virtual); - OUT_RING(chan, upper_32_bits(fb->vma.offset)); - OUT_RING(chan, lower_32_bits(fb->vma.offset)); + OUT_RING(chan, upper_32_bits(fb->vma->addr)); + OUT_RING(chan, lower_32_bits(fb->vma->addr)); BEGIN_NV04(chan, NvSub2D, 0x0230, 2); OUT_RING(chan, format); OUT_RING(chan, 1); @@ -248,8 +249,8 @@ nv50_fbcon_accel_init(struct fb_info *info) OUT_RING(chan, info->fix.line_length); OUT_RING(chan, info->var.xres_virtual); OUT_RING(chan, info->var.yres_virtual); - OUT_RING(chan, upper_32_bits(fb->vma.offset)); - OUT_RING(chan, lower_32_bits(fb->vma.offset)); + OUT_RING(chan, upper_32_bits(fb->vma->addr)); + OUT_RING(chan, lower_32_bits(fb->vma->addr)); FIRE_RING(chan); return 0; diff --git a/drivers/gpu/drm/nouveau/nv84_fence.c b/drivers/gpu/drm/nouveau/nv84_fence.c index bd7a8a1e4ad9..5f0c0c27d5dc 100644 --- a/drivers/gpu/drm/nouveau/nv84_fence.c +++ b/drivers/gpu/drm/nouveau/nv84_fence.c @@ -25,6 +25,7 @@ #include "nouveau_drv.h" #include "nouveau_dma.h" #include "nouveau_fence.h" +#include "nouveau_vmm.h" #include "nv50_display.h" @@ -68,12 +69,7 @@ nv84_fence_emit(struct nouveau_fence *fence) { struct nouveau_channel *chan = fence->channel; struct nv84_fence_chan *fctx = chan->fence; - u64 addr = chan->chid * 16; - - if (fence->sysmem) - addr += fctx->vma_gart.offset; - else - addr += fctx->vma.offset; + u64 addr = fctx->vma->addr + chan->chid * 16; return fctx->base.emit32(chan, addr, fence->base.seqno); } @@ -83,12 +79,7 @@ nv84_fence_sync(struct nouveau_fence *fence, struct nouveau_channel *prev, struct nouveau_channel *chan) { struct nv84_fence_chan *fctx = chan->fence; - u64 addr = prev->chid * 16; - - if (fence->sysmem) - addr += fctx->vma_gart.offset; - else - addr += fctx->vma.offset; + u64 addr = fctx->vma->addr + prev->chid * 16; return fctx->base.sync32(chan, addr, fence->base.seqno); } @@ -108,8 +99,7 @@ nv84_fence_context_del(struct nouveau_channel *chan) nouveau_bo_wr32(priv->bo, chan->chid * 16 / 4, fctx->base.sequence); mutex_lock(&priv->mutex); - nouveau_bo_vma_del(priv->bo, &fctx->vma_gart); - nouveau_bo_vma_del(priv->bo, &fctx->vma); + nouveau_vma_del(&fctx->vma); mutex_unlock(&priv->mutex); nouveau_fence_context_del(&fctx->base); chan->fence = NULL; @@ -137,11 +127,7 @@ nv84_fence_context_new(struct nouveau_channel *chan) fctx->base.sequence = nv84_fence_read(chan); mutex_lock(&priv->mutex); - ret = nouveau_bo_vma_add(priv->bo, cli->vm, &fctx->vma); - if (ret == 0) { - ret = nouveau_bo_vma_add(priv->bo_gart, cli->vm, - &fctx->vma_gart); - } + ret = nouveau_vma_new(priv->bo, &cli->vmm, &fctx->vma); mutex_unlock(&priv->mutex); if (ret) @@ -182,10 +168,6 @@ static void nv84_fence_destroy(struct nouveau_drm *drm) { struct nv84_fence_priv *priv = drm->fence; - nouveau_bo_unmap(priv->bo_gart); - if (priv->bo_gart) - 
nouveau_bo_unpin(priv->bo_gart); - nouveau_bo_ref(NULL, &priv->bo_gart); nouveau_bo_unmap(priv->bo); if (priv->bo) nouveau_bo_unpin(priv->bo); @@ -238,21 +220,6 @@ nv84_fence_create(struct nouveau_drm *drm) nouveau_bo_ref(NULL, &priv->bo); } - if (ret == 0) - ret = nouveau_bo_new(&drm->client, 16 * priv->base.contexts, 0, - TTM_PL_FLAG_TT | TTM_PL_FLAG_UNCACHED, 0, - 0, NULL, NULL, &priv->bo_gart); - if (ret == 0) { - ret = nouveau_bo_pin(priv->bo_gart, TTM_PL_FLAG_TT, false); - if (ret == 0) { - ret = nouveau_bo_map(priv->bo_gart); - if (ret) - nouveau_bo_unpin(priv->bo_gart); - } - if (ret) - nouveau_bo_ref(NULL, &priv->bo_gart); - } - if (ret) nv84_fence_destroy(drm); return ret; diff --git a/drivers/gpu/drm/nouveau/nvc0_fbcon.c b/drivers/gpu/drm/nouveau/nvc0_fbcon.c index 90f27bfa381f..c0deef4fe727 100644 --- a/drivers/gpu/drm/nouveau/nvc0_fbcon.c +++ b/drivers/gpu/drm/nouveau/nvc0_fbcon.c @@ -25,6 +25,7 @@ #include "nouveau_drv.h" #include "nouveau_dma.h" #include "nouveau_fbcon.h" +#include "nouveau_vmm.h" int nvc0_fbcon_fillrect(struct fb_info *info, const struct fb_fillrect *rect) @@ -239,8 +240,8 @@ nvc0_fbcon_accel_init(struct fb_info *info) OUT_RING (chan, info->fix.line_length); OUT_RING (chan, info->var.xres_virtual); OUT_RING (chan, info->var.yres_virtual); - OUT_RING (chan, upper_32_bits(fb->vma.offset)); - OUT_RING (chan, lower_32_bits(fb->vma.offset)); + OUT_RING (chan, upper_32_bits(fb->vma->addr)); + OUT_RING (chan, lower_32_bits(fb->vma->addr)); BEGIN_NVC0(chan, NvSub2D, 0x0230, 10); OUT_RING (chan, format); OUT_RING (chan, 1); @@ -250,8 +251,8 @@ nvc0_fbcon_accel_init(struct fb_info *info) OUT_RING (chan, info->fix.line_length); OUT_RING (chan, info->var.xres_virtual); OUT_RING (chan, info->var.yres_virtual); - OUT_RING (chan, upper_32_bits(fb->vma.offset)); - OUT_RING (chan, lower_32_bits(fb->vma.offset)); + OUT_RING (chan, upper_32_bits(fb->vma->addr)); + OUT_RING (chan, lower_32_bits(fb->vma->addr)); FIRE_RING (chan); return 0; diff --git a/drivers/gpu/drm/nouveau/nvif/Kbuild b/drivers/gpu/drm/nouveau/nvif/Kbuild index 067b5e9f5ec1..f1675a4ab6fa 100644 --- a/drivers/gpu/drm/nouveau/nvif/Kbuild +++ b/drivers/gpu/drm/nouveau/nvif/Kbuild @@ -2,4 +2,7 @@ nvif-y := nvif/object.o nvif-y += nvif/client.o nvif-y += nvif/device.o nvif-y += nvif/driver.o +nvif-y += nvif/mem.o +nvif-y += nvif/mmu.o nvif-y += nvif/notify.o +nvif-y += nvif/vmm.o diff --git a/drivers/gpu/drm/nouveau/nvif/mem.c b/drivers/gpu/drm/nouveau/nvif/mem.c new file mode 100644 index 000000000000..0f9382c60145 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvif/mem.c @@ -0,0 +1,88 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
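The nv84_fence changes above drop the separate VRAM/GART fence buffers and their per-type offsets: both the emit and sync paths now compute the target address from a single VMA, with a fixed 16-byte slot per channel (the emitting channel's own chid for emit, the peer channel's chid for sync). A trivial sketch of the slot arithmetic, with an illustrative base address standing in for fctx->vma->addr:

	#include <stdint.h>
	#include <assert.h>

	/* One 16-byte fence slot per channel within the shared VMA. */
	static uint64_t
	fence_slot(uint64_t vma_addr, int chid)
	{
		return vma_addr + (uint64_t)chid * 16;
	}

	int
	main(void)
	{
		uint64_t base = 0x100000;	/* stand-in for fctx->vma->addr */
		assert(fence_slot(base, 0) == 0x100000);
		assert(fence_slot(base, 3) == 0x100030);	/* chid 3 */
		return 0;
	}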
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include <nvif/mem.h> +#include <nvif/client.h> + +#include <nvif/if000a.h> + +void +nvif_mem_fini(struct nvif_mem *mem) +{ + nvif_object_fini(&mem->object); +} + +int +nvif_mem_init_type(struct nvif_mmu *mmu, s32 oclass, int type, u8 page, + u64 size, void *argv, u32 argc, struct nvif_mem *mem) +{ + struct nvif_mem_v0 *args; + u8 stack[128]; + int ret; + + mem->object.client = NULL; + if (type < 0) + return -EINVAL; + + if (sizeof(*args) + argc > sizeof(stack)) { + if (!(args = kmalloc(sizeof(*args) + argc, GFP_KERNEL))) + return -ENOMEM; + } else { + args = (void *)stack; + } + args->version = 0; + args->type = type; + args->page = page; + args->size = size; + memcpy(args->data, argv, argc); + + ret = nvif_object_init(&mmu->object, 0, oclass, args, + sizeof(*args) + argc, &mem->object); + if (ret == 0) { + mem->type = mmu->type[type].type; + mem->page = args->page; + mem->addr = args->addr; + mem->size = args->size; + } + + if (args != (void *)stack) + kfree(args); + return ret; + +} + +int +nvif_mem_init(struct nvif_mmu *mmu, s32 oclass, u8 type, u8 page, + u64 size, void *argv, u32 argc, struct nvif_mem *mem) +{ + int ret = -EINVAL, i; + + mem->object.client = NULL; + + for (i = 0; ret && i < mmu->type_nr; i++) { + if ((mmu->type[i].type & type) == type) { + ret = nvif_mem_init_type(mmu, oclass, i, page, size, + argv, argc, mem); + } + } + + return ret; +} diff --git a/drivers/gpu/drm/nouveau/nvif/mmu.c b/drivers/gpu/drm/nouveau/nvif/mmu.c new file mode 100644 index 000000000000..15d0dcbf7ab4 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvif/mmu.c @@ -0,0 +1,117 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
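nvif_mem_init_type() above, and nvif_vmm_map() later in this diff, both marshal a fixed header plus a variable-length payload: a small on-stack buffer is used when the arguments fit, with a fall back to a heap allocation when they do not. A self-contained plain-C sketch of the pattern; struct hdr and send_with_payload() are illustrative names, not driver API, and the real code appends the payload through a flexible array member rather than past the struct.

	#include <stdlib.h>
	#include <string.h>

	struct hdr {
		unsigned char version;
		unsigned char type;
		/* ... fixed-size fields, variable payload follows ... */
	};

	static int
	send_with_payload(const void *payload, size_t argc)
	{
		unsigned char stack[128];	/* covers the common small case */
		struct hdr *args;
		int ret = 0;

		if (sizeof(*args) + argc > sizeof(stack)) {
			args = malloc(sizeof(*args) + argc);
			if (!args)
				return -1;	/* -ENOMEM in the kernel code */
		} else {
			args = (void *)stack;
		}

		args->version = 0;
		memcpy(args + 1, payload, argc);	/* payload after header */

		/* ... issue the method with (args, sizeof(*args) + argc) ... */

		if (args != (void *)stack)
			free(args);
		return ret;
	}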
+ */ +#include <nvif/mmu.h> + +#include <nvif/class.h> +#include <nvif/if0008.h> + +void +nvif_mmu_fini(struct nvif_mmu *mmu) +{ + kfree(mmu->kind); + kfree(mmu->type); + kfree(mmu->heap); + nvif_object_fini(&mmu->object); +} + +int +nvif_mmu_init(struct nvif_object *parent, s32 oclass, struct nvif_mmu *mmu) +{ + struct nvif_mmu_v0 args; + int ret, i; + + args.version = 0; + mmu->heap = NULL; + mmu->type = NULL; + mmu->kind = NULL; + + ret = nvif_object_init(parent, 0, oclass, &args, sizeof(args), + &mmu->object); + if (ret) + goto done; + + mmu->dmabits = args.dmabits; + mmu->heap_nr = args.heap_nr; + mmu->type_nr = args.type_nr; + mmu->kind_nr = args.kind_nr; + + mmu->heap = kmalloc(sizeof(*mmu->heap) * mmu->heap_nr, GFP_KERNEL); + mmu->type = kmalloc(sizeof(*mmu->type) * mmu->type_nr, GFP_KERNEL); + if (ret = -ENOMEM, !mmu->heap || !mmu->type) + goto done; + + mmu->kind = kmalloc(sizeof(*mmu->kind) * mmu->kind_nr, GFP_KERNEL); + if (!mmu->kind && mmu->kind_nr) + goto done; + + for (i = 0; i < mmu->heap_nr; i++) { + struct nvif_mmu_heap_v0 args = { .index = i }; + + ret = nvif_object_mthd(&mmu->object, NVIF_MMU_V0_HEAP, + &args, sizeof(args)); + if (ret) + goto done; + + mmu->heap[i].size = args.size; + } + + for (i = 0; i < mmu->type_nr; i++) { + struct nvif_mmu_type_v0 args = { .index = i }; + + ret = nvif_object_mthd(&mmu->object, NVIF_MMU_V0_TYPE, + &args, sizeof(args)); + if (ret) + goto done; + + mmu->type[i].type = 0; + if (args.vram) mmu->type[i].type |= NVIF_MEM_VRAM; + if (args.host) mmu->type[i].type |= NVIF_MEM_HOST; + if (args.comp) mmu->type[i].type |= NVIF_MEM_COMP; + if (args.disp) mmu->type[i].type |= NVIF_MEM_DISP; + if (args.kind ) mmu->type[i].type |= NVIF_MEM_KIND; + if (args.mappable) mmu->type[i].type |= NVIF_MEM_MAPPABLE; + if (args.coherent) mmu->type[i].type |= NVIF_MEM_COHERENT; + if (args.uncached) mmu->type[i].type |= NVIF_MEM_UNCACHED; + mmu->type[i].heap = args.heap; + } + + if (mmu->kind_nr) { + struct nvif_mmu_kind_v0 *kind; + u32 argc = sizeof(*kind) + sizeof(*kind->data) * mmu->kind_nr; + + if (ret = -ENOMEM, !(kind = kmalloc(argc, GFP_KERNEL))) + goto done; + kind->version = 0; + kind->count = mmu->kind_nr; + + ret = nvif_object_mthd(&mmu->object, NVIF_MMU_V0_KIND, + kind, argc); + if (ret == 0) + memcpy(mmu->kind, kind->data, kind->count); + kfree(kind); + } + +done: + if (ret) + nvif_mmu_fini(mmu); + return ret; +} diff --git a/drivers/gpu/drm/nouveau/nvif/object.c b/drivers/gpu/drm/nouveau/nvif/object.c index c3fb6a20f567..40adfe9b334b 100644 --- a/drivers/gpu/drm/nouveau/nvif/object.c +++ b/drivers/gpu/drm/nouveau/nvif/object.c @@ -166,46 +166,77 @@ nvif_object_mthd(struct nvif_object *object, u32 mthd, void *data, u32 size) } void -nvif_object_unmap(struct nvif_object *object) +nvif_object_unmap_handle(struct nvif_object *object) +{ + struct { + struct nvif_ioctl_v0 ioctl; + struct nvif_ioctl_unmap unmap; + } args = { + .ioctl.type = NVIF_IOCTL_V0_UNMAP, + }; + + nvif_object_ioctl(object, &args, sizeof(args), NULL); +} + +int +nvif_object_map_handle(struct nvif_object *object, void *argv, u32 argc, + u64 *handle, u64 *length) { - if (object->map.size) { - struct nvif_client *client = object->client; - struct { - struct nvif_ioctl_v0 ioctl; - struct nvif_ioctl_unmap unmap; - } args = { - .ioctl.type = NVIF_IOCTL_V0_UNMAP, - }; + struct { + struct nvif_ioctl_v0 ioctl; + struct nvif_ioctl_map_v0 map; + } *args; + u32 argn = sizeof(*args) + argc; + int ret, maptype; + + if (!(args = kzalloc(argn, GFP_KERNEL))) + return -ENOMEM; + args->ioctl.type = 
NVIF_IOCTL_V0_MAP; + memcpy(args->map.data, argv, argc); - if (object->map.ptr) { + ret = nvif_object_ioctl(object, args, argn, NULL); + *handle = args->map.handle; + *length = args->map.length; + maptype = args->map.type; + kfree(args); + return ret ? ret : (maptype == NVIF_IOCTL_MAP_V0_IO); +} + +void +nvif_object_unmap(struct nvif_object *object) +{ + struct nvif_client *client = object->client; + if (object->map.ptr) { + if (object->map.size) { client->driver->unmap(client, object->map.ptr, object->map.size); - object->map.ptr = NULL; + object->map.size = 0; } - - nvif_object_ioctl(object, &args, sizeof(args), NULL); - object->map.size = 0; + object->map.ptr = NULL; + nvif_object_unmap_handle(object); } } int -nvif_object_map(struct nvif_object *object) +nvif_object_map(struct nvif_object *object, void *argv, u32 argc) { struct nvif_client *client = object->client; - struct { - struct nvif_ioctl_v0 ioctl; - struct nvif_ioctl_map_v0 map; - } args = { - .ioctl.type = NVIF_IOCTL_V0_MAP, - }; - int ret = nvif_object_ioctl(object, &args, sizeof(args), NULL); - if (ret == 0) { - object->map.size = args.map.length; - object->map.ptr = client->driver->map(client, args.map.handle, - object->map.size); - if (ret = -ENOMEM, object->map.ptr) + u64 handle, length; + int ret = nvif_object_map_handle(object, argv, argc, &handle, &length); + if (ret >= 0) { + if (ret) { + object->map.ptr = client->driver->map(client, + handle, + length); + if (ret = -ENOMEM, object->map.ptr) { + object->map.size = length; + return 0; + } + } else { + object->map.ptr = (void *)(unsigned long)handle; return 0; - nvif_object_unmap(object); + } + nvif_object_unmap_handle(object); } return ret; } diff --git a/drivers/gpu/drm/nouveau/nvif/vmm.c b/drivers/gpu/drm/nouveau/nvif/vmm.c new file mode 100644 index 000000000000..31cdb2d2e1ff --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvif/vmm.c @@ -0,0 +1,167 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
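A caller-side summary of the new nvif_object_map_handle() contract visible above: a negative return is an error; a positive return means the returned handle is an IO address that still needs the driver's map callback; zero means the handle already is a usable CPU virtual address (the NVIF_IOCTL_MAP_V0_IO / NVIF_IOCTL_MAP_V0_VA split). Sketch only: driver_map() below is a hypothetical stand-in for the real client->driver->map callback.

	/* Hypothetical stand-in for client->driver->map(). */
	static void *
	driver_map(unsigned long long handle, unsigned long long length)
	{
		(void)handle; (void)length;
		return 0;	/* would map the IO range here */
	}

	static void *
	object_map(int ret, unsigned long long handle, unsigned long long length)
	{
		if (ret < 0)
			return 0;				/* ioctl failed */
		if (ret > 0)
			return driver_map(handle, length);	/* MAP_V0_IO */
		return (void *)(unsigned long)handle;		/* MAP_V0_VA */
	}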
+ */ +#include <nvif/vmm.h> +#include <nvif/mem.h> + +#include <nvif/if000c.h> + +int +nvif_vmm_unmap(struct nvif_vmm *vmm, u64 addr) +{ + return nvif_object_mthd(&vmm->object, NVIF_VMM_V0_UNMAP, + &(struct nvif_vmm_unmap_v0) { .addr = addr }, + sizeof(struct nvif_vmm_unmap_v0)); +} + +int +nvif_vmm_map(struct nvif_vmm *vmm, u64 addr, u64 size, void *argv, u32 argc, + struct nvif_mem *mem, u64 offset) +{ + struct nvif_vmm_map_v0 *args; + u8 stack[16]; + int ret; + + if (sizeof(*args) + argc > sizeof(stack)) { + if (!(args = kmalloc(sizeof(*args) + argc, GFP_KERNEL))) + return -ENOMEM; + } else { + args = (void *)stack; + } + + args->version = 0; + args->addr = addr; + args->size = size; + args->memory = nvif_handle(&mem->object); + args->offset = offset; + memcpy(args->data, argv, argc); + + ret = nvif_object_mthd(&vmm->object, NVIF_VMM_V0_MAP, + args, sizeof(*args) + argc); + if (args != (void *)stack) + kfree(args); + return ret; +} + +void +nvif_vmm_put(struct nvif_vmm *vmm, struct nvif_vma *vma) +{ + if (vma->size) { + WARN_ON(nvif_object_mthd(&vmm->object, NVIF_VMM_V0_PUT, + &(struct nvif_vmm_put_v0) { + .addr = vma->addr, + }, sizeof(struct nvif_vmm_put_v0))); + vma->size = 0; + } +} + +int +nvif_vmm_get(struct nvif_vmm *vmm, enum nvif_vmm_get type, bool sparse, + u8 page, u8 align, u64 size, struct nvif_vma *vma) +{ + struct nvif_vmm_get_v0 args; + int ret; + + args.version = vma->size = 0; + args.sparse = sparse; + args.page = page; + args.align = align; + args.size = size; + + switch (type) { + case ADDR: args.type = NVIF_VMM_GET_V0_ADDR; break; + case PTES: args.type = NVIF_VMM_GET_V0_PTES; break; + case LAZY: args.type = NVIF_VMM_GET_V0_LAZY; break; + default: + WARN_ON(1); + return -EINVAL; + } + + ret = nvif_object_mthd(&vmm->object, NVIF_VMM_V0_GET, + &args, sizeof(args)); + if (ret == 0) { + vma->addr = args.addr; + vma->size = args.size; + } + return ret; +} + +void +nvif_vmm_fini(struct nvif_vmm *vmm) +{ + kfree(vmm->page); + nvif_object_fini(&vmm->object); +} + +int +nvif_vmm_init(struct nvif_mmu *mmu, s32 oclass, u64 addr, u64 size, + void *argv, u32 argc, struct nvif_vmm *vmm) +{ + struct nvif_vmm_v0 *args; + u32 argn = sizeof(*args) + argc; + int ret = -ENOSYS, i; + + vmm->object.client = NULL; + vmm->page = NULL; + + if (!(args = kmalloc(argn, GFP_KERNEL))) + return -ENOMEM; + args->version = 0; + args->addr = addr; + args->size = size; + memcpy(args->data, argv, argc); + + ret = nvif_object_init(&mmu->object, 0, oclass, args, argn, + &vmm->object); + if (ret) + goto done; + + vmm->start = args->addr; + vmm->limit = args->size; + + vmm->page_nr = args->page_nr; + vmm->page = kmalloc(sizeof(*vmm->page) * vmm->page_nr, GFP_KERNEL); + if (!vmm->page) { + ret = -ENOMEM; + goto done; + } + + for (i = 0; i < vmm->page_nr; i++) { + struct nvif_vmm_page_v0 args = { .index = i }; + + ret = nvif_object_mthd(&vmm->object, NVIF_VMM_V0_PAGE, + &args, sizeof(args)); + if (ret) + break; + + vmm->page[i].shift = args.shift; + vmm->page[i].sparse = args.sparse; + vmm->page[i].vram = args.vram; + vmm->page[i].host = args.host; + vmm->page[i].comp = args.comp; + } + +done: + if (ret) + nvif_vmm_fini(vmm); + kfree(args); + return ret; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/core/client.c b/drivers/gpu/drm/nouveau/nvkm/core/client.c index 0d3a896892b4..ac671202919e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/client.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/client.c @@ -301,5 +301,7 @@ nvkm_client_new(const char *name, u64 device, const char *cfg, client->debug = nvkm_dbgopt(dbg, 
"CLIENT"); client->objroot = RB_ROOT; client->ntfy = ntfy; + INIT_LIST_HEAD(&client->umem); + spin_lock_init(&client->lock); return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/engine.c b/drivers/gpu/drm/nouveau/nvkm/core/engine.c index b6c916954a10..657231c3c098 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/engine.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/engine.c @@ -126,6 +126,15 @@ nvkm_engine_init(struct nvkm_subdev *subdev) return ret; } +static int +nvkm_engine_preinit(struct nvkm_subdev *subdev) +{ + struct nvkm_engine *engine = nvkm_engine(subdev); + if (engine->func->preinit) + engine->func->preinit(engine); + return 0; +} + static void * nvkm_engine_dtor(struct nvkm_subdev *subdev) { @@ -138,6 +147,7 @@ nvkm_engine_dtor(struct nvkm_subdev *subdev) static const struct nvkm_subdev_func nvkm_engine_func = { .dtor = nvkm_engine_dtor, + .preinit = nvkm_engine_preinit, .init = nvkm_engine_init, .fini = nvkm_engine_fini, .intr = nvkm_engine_intr, diff --git a/drivers/gpu/drm/nouveau/nvkm/core/gpuobj.c b/drivers/gpu/drm/nouveau/nvkm/core/gpuobj.c index a7bd22706b2a..d6de2b3ed2c3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/gpuobj.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/gpuobj.c @@ -42,6 +42,14 @@ nvkm_gpuobj_wr32_fast(struct nvkm_gpuobj *gpuobj, u32 offset, u32 data) } /* accessor functions for gpuobjs allocated directly from instmem */ +static int +nvkm_gpuobj_heap_map(struct nvkm_gpuobj *gpuobj, u64 offset, + struct nvkm_vmm *vmm, struct nvkm_vma *vma, + void *argv, u32 argc) +{ + return nvkm_memory_map(gpuobj->memory, offset, vmm, vma, argv, argc); +} + static u32 nvkm_gpuobj_heap_rd32(struct nvkm_gpuobj *gpuobj, u32 offset) { @@ -67,6 +75,7 @@ nvkm_gpuobj_heap_fast = { .release = nvkm_gpuobj_heap_release, .rd32 = nvkm_gpuobj_rd32_fast, .wr32 = nvkm_gpuobj_wr32_fast, + .map = nvkm_gpuobj_heap_map, }; static const struct nvkm_gpuobj_func @@ -74,6 +83,7 @@ nvkm_gpuobj_heap_slow = { .release = nvkm_gpuobj_heap_release, .rd32 = nvkm_gpuobj_heap_rd32, .wr32 = nvkm_gpuobj_heap_wr32, + .map = nvkm_gpuobj_heap_map, }; static void * @@ -90,9 +100,19 @@ nvkm_gpuobj_heap_acquire(struct nvkm_gpuobj *gpuobj) static const struct nvkm_gpuobj_func nvkm_gpuobj_heap = { .acquire = nvkm_gpuobj_heap_acquire, + .map = nvkm_gpuobj_heap_map, }; /* accessor functions for gpuobjs sub-allocated from a parent gpuobj */ +static int +nvkm_gpuobj_map(struct nvkm_gpuobj *gpuobj, u64 offset, + struct nvkm_vmm *vmm, struct nvkm_vma *vma, + void *argv, u32 argc) +{ + return nvkm_memory_map(gpuobj->parent, gpuobj->node->offset + offset, + vmm, vma, argv, argc); +} + static u32 nvkm_gpuobj_rd32(struct nvkm_gpuobj *gpuobj, u32 offset) { @@ -118,6 +138,7 @@ nvkm_gpuobj_fast = { .release = nvkm_gpuobj_release, .rd32 = nvkm_gpuobj_rd32_fast, .wr32 = nvkm_gpuobj_wr32_fast, + .map = nvkm_gpuobj_map, }; static const struct nvkm_gpuobj_func @@ -125,6 +146,7 @@ nvkm_gpuobj_slow = { .release = nvkm_gpuobj_release, .rd32 = nvkm_gpuobj_rd32, .wr32 = nvkm_gpuobj_wr32, + .map = nvkm_gpuobj_map, }; static void * @@ -143,6 +165,7 @@ nvkm_gpuobj_acquire(struct nvkm_gpuobj *gpuobj) static const struct nvkm_gpuobj_func nvkm_gpuobj_func = { .acquire = nvkm_gpuobj_acquire, + .map = nvkm_gpuobj_map, }; static int @@ -185,7 +208,7 @@ nvkm_gpuobj_ctor(struct nvkm_device *device, u32 size, int align, bool zero, gpuobj->size = nvkm_memory_size(gpuobj->memory); } - return nvkm_mm_init(&gpuobj->heap, 0, gpuobj->size, 1); + return nvkm_mm_init(&gpuobj->heap, 0, 0, gpuobj->size, 1); } void @@ -196,7 +219,7 @@ nvkm_gpuobj_del(struct 
nvkm_gpuobj **pgpuobj) if (gpuobj->parent) nvkm_mm_free(&gpuobj->parent->heap, &gpuobj->node); nvkm_mm_fini(&gpuobj->heap); - nvkm_memory_del(&gpuobj->memory); + nvkm_memory_unref(&gpuobj->memory); kfree(*pgpuobj); *pgpuobj = NULL; } @@ -218,26 +241,6 @@ nvkm_gpuobj_new(struct nvkm_device *device, u32 size, int align, bool zero, return ret; } -int -nvkm_gpuobj_map(struct nvkm_gpuobj *gpuobj, struct nvkm_vm *vm, - u32 access, struct nvkm_vma *vma) -{ - struct nvkm_memory *memory = gpuobj->memory; - int ret = nvkm_vm_get(vm, gpuobj->size, 12, access, vma); - if (ret == 0) - nvkm_memory_map(memory, vma, 0); - return ret; -} - -void -nvkm_gpuobj_unmap(struct nvkm_vma *vma) -{ - if (vma->node) { - nvkm_vm_unmap(vma); - nvkm_vm_put(vma); - } -} - /* the below is basically only here to support sharing the paged dma object * for PCI(E)GART on <=nv4x chipsets, and should *not* be expected to work * anywhere else. diff --git a/drivers/gpu/drm/nouveau/nvkm/core/ioctl.c b/drivers/gpu/drm/nouveau/nvkm/core/ioctl.c index be19bbe56bba..d777df5a64e6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/ioctl.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/ioctl.c @@ -53,7 +53,7 @@ nvkm_ioctl_sclass(struct nvkm_client *client, union { struct nvif_ioctl_sclass_v0 v0; } *args = data; - struct nvkm_oclass oclass; + struct nvkm_oclass oclass = { .client = client }; int ret = -ENOSYS, i = 0; nvif_ioctl(object, "sclass size %d\n", size); @@ -257,13 +257,19 @@ nvkm_ioctl_map(struct nvkm_client *client, union { struct nvif_ioctl_map_v0 v0; } *args = data; + enum nvkm_object_map type; int ret = -ENOSYS; nvif_ioctl(object, "map size %d\n", size); - if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { + if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, true))) { nvif_ioctl(object, "map vers %d\n", args->v0.version); - ret = nvkm_object_map(object, &args->v0.handle, - &args->v0.length); + ret = nvkm_object_map(object, data, size, &type, + &args->v0.handle, + &args->v0.length); + if (type == NVKM_OBJECT_MAP_IO) + args->v0.type = NVIF_IOCTL_MAP_V0_IO; + else + args->v0.type = NVIF_IOCTL_MAP_V0_VA; } return ret; @@ -281,6 +287,7 @@ nvkm_ioctl_unmap(struct nvkm_client *client, nvif_ioctl(object, "unmap size %d\n", size); if (!(ret = nvif_unvers(ret, &data, &size, args->none))) { nvif_ioctl(object, "unmap\n"); + ret = nvkm_object_unmap(object); } return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/core/memory.c b/drivers/gpu/drm/nouveau/nvkm/core/memory.c index 8903c04c977e..e85a08ecd9da 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/memory.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/memory.c @@ -22,27 +22,117 @@ * Authors: Ben Skeggs <bskeggs@redhat.com> */ #include <core/memory.h> +#include <core/mm.h> +#include <subdev/fb.h> #include <subdev/instmem.h> void +nvkm_memory_tags_put(struct nvkm_memory *memory, struct nvkm_device *device, + struct nvkm_tags **ptags) +{ + struct nvkm_fb *fb = device->fb; + struct nvkm_tags *tags = *ptags; + if (tags) { + mutex_lock(&fb->subdev.mutex); + if (refcount_dec_and_test(&tags->refcount)) { + nvkm_mm_free(&fb->tags, &tags->mn); + kfree(memory->tags); + memory->tags = NULL; + } + mutex_unlock(&fb->subdev.mutex); + *ptags = NULL; + } +} + +int +nvkm_memory_tags_get(struct nvkm_memory *memory, struct nvkm_device *device, + u32 nr, void (*clr)(struct nvkm_device *, u32, u32), + struct nvkm_tags **ptags) +{ + struct nvkm_fb *fb = device->fb; + struct nvkm_tags *tags; + + mutex_lock(&fb->subdev.mutex); + if ((tags = memory->tags)) { + /* If comptags exist for the memory, but a 
different amount + * than requested, the buffer is being mapped with settings + * that are incompatible with existing mappings. + */ + if (tags->mn && tags->mn->length != nr) { + mutex_unlock(&fb->subdev.mutex); + return -EINVAL; + } + + refcount_inc(&tags->refcount); + mutex_unlock(&fb->subdev.mutex); + *ptags = tags; + return 0; + } + + if (!(tags = kmalloc(sizeof(*tags), GFP_KERNEL))) { + mutex_unlock(&fb->subdev.mutex); + return -ENOMEM; + } + + if (!nvkm_mm_head(&fb->tags, 0, 1, nr, nr, 1, &tags->mn)) { + if (clr) + clr(device, tags->mn->offset, tags->mn->length); + } else { + /* Failure to allocate HW comptags is not an error, the + * caller should fall back to an uncompressed map. + * + * As memory can be mapped in multiple places, we still + * need to track the allocation failure and ensure that + * any additional mappings remain uncompressed. + * + * This is handled by returning an empty nvkm_tags. + */ + tags->mn = NULL; + } + + refcount_set(&tags->refcount, 1); + mutex_unlock(&fb->subdev.mutex); + *ptags = tags; + return 0; +} + +void nvkm_memory_ctor(const struct nvkm_memory_func *func, struct nvkm_memory *memory) { memory->func = func; + kref_init(&memory->kref); +} + +static void +nvkm_memory_del(struct kref *kref) +{ + struct nvkm_memory *memory = container_of(kref, typeof(*memory), kref); + if (!WARN_ON(!memory->func)) { + if (memory->func->dtor) + memory = memory->func->dtor(memory); + kfree(memory); + } } void -nvkm_memory_del(struct nvkm_memory **pmemory) +nvkm_memory_unref(struct nvkm_memory **pmemory) { struct nvkm_memory *memory = *pmemory; - if (memory && !WARN_ON(!memory->func)) { - if (memory->func->dtor) - *pmemory = memory->func->dtor(memory); - kfree(*pmemory); + if (memory) { + kref_put(&memory->kref, nvkm_memory_del); *pmemory = NULL; } } +struct nvkm_memory * +nvkm_memory_ref(struct nvkm_memory *memory) +{ + if (memory) + kref_get(&memory->kref); + return memory; +} + int nvkm_memory_new(struct nvkm_device *device, enum nvkm_memory_target target, u64 size, u32 align, bool zero, diff --git a/drivers/gpu/drm/nouveau/nvkm/core/mm.c b/drivers/gpu/drm/nouveau/nvkm/core/mm.c index 5c7891234eea..f78a06a6b2f1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/mm.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/mm.c @@ -237,7 +237,7 @@ nvkm_mm_tail(struct nvkm_mm *mm, u8 heap, u8 type, u32 size_max, u32 size_min, } int -nvkm_mm_init(struct nvkm_mm *mm, u32 offset, u32 length, u32 block) +nvkm_mm_init(struct nvkm_mm *mm, u8 heap, u32 offset, u32 length, u32 block) { struct nvkm_mm_node *node, *prev; u32 next; @@ -274,7 +274,8 @@ nvkm_mm_init(struct nvkm_mm *mm, u32 offset, u32 length, u32 block) list_add_tail(&node->nl_entry, &mm->nodes); list_add_tail(&node->fl_entry, &mm->free); - node->heap = ++mm->heap_nodes; + node->heap = heap; + mm->heap_nodes++; return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/object.c b/drivers/gpu/drm/nouveau/nvkm/core/object.c index acd76fd4f6d8..301a5e5b5f7f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/object.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/object.c @@ -102,10 +102,19 @@ nvkm_object_ntfy(struct nvkm_object *object, u32 mthd, } int -nvkm_object_map(struct nvkm_object *object, u64 *addr, u32 *size) +nvkm_object_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) { if (likely(object->func->map)) - return object->func->map(object, addr, size); + return object->func->map(object, argv, argc, type, addr, size); + return -ENODEV; +} + +int +nvkm_object_unmap(struct nvkm_object 
*object) +{ + if (likely(object->func->unmap)) + return object->func->unmap(object); return -ENODEV; } @@ -259,6 +268,7 @@ nvkm_object_dtor(struct nvkm_object *object) } nvif_debug(object, "destroy running...\n"); + nvkm_object_unmap(object); if (object->func->dtor) data = object->func->dtor(object); nvkm_engine_unref(&object->engine); diff --git a/drivers/gpu/drm/nouveau/nvkm/core/oproxy.c b/drivers/gpu/drm/nouveau/nvkm/core/oproxy.c index e31a0479add0..16299837a296 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/oproxy.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/oproxy.c @@ -37,9 +37,17 @@ nvkm_oproxy_ntfy(struct nvkm_object *object, u32 mthd, } static int -nvkm_oproxy_map(struct nvkm_object *object, u64 *addr, u32 *size) +nvkm_oproxy_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) { - return nvkm_object_map(nvkm_oproxy(object)->object, addr, size); + struct nvkm_oproxy *oproxy = nvkm_oproxy(object); + return nvkm_object_map(oproxy->object, argv, argc, type, addr, size); +} + +static int +nvkm_oproxy_unmap(struct nvkm_object *object) +{ + return nvkm_object_unmap(nvkm_oproxy(object)->object); } static int @@ -171,6 +179,7 @@ nvkm_oproxy_func = { .mthd = nvkm_oproxy_mthd, .ntfy = nvkm_oproxy_ntfy, .map = nvkm_oproxy_map, + .unmap = nvkm_oproxy_unmap, .rd08 = nvkm_oproxy_rd08, .rd16 = nvkm_oproxy_rd16, .rd32 = nvkm_oproxy_rd32, diff --git a/drivers/gpu/drm/nouveau/nvkm/core/ramht.c b/drivers/gpu/drm/nouveau/nvkm/core/ramht.c index 89da47234016..ccba4ae73cc5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/ramht.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/ramht.c @@ -21,6 +21,7 @@ */ #include <core/ramht.h> #include <core/engine.h> +#include <core/object.h> static u32 nvkm_ramht_hash(struct nvkm_ramht *ramht, int chid, u32 handle) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c b/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c index 8e2e24a74774..44e116f7880d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c @@ -39,5 +39,5 @@ int g84_bsp_new(struct nvkm_device *device, int index, struct nvkm_engine **pengine) { return nvkm_xtensa_new_(&g84_bsp, device, index, - true, 0x103000, pengine); + device->chipset != 0x92, 0x103000, pengine); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index e096a5d9c292..e14643615698 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -927,7 +927,7 @@ nv84_chipset = { .i2c = nv50_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g84_pci_new, .therm = g84_therm_new, @@ -959,7 +959,7 @@ nv86_chipset = { .i2c = nv50_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g84_pci_new, .therm = g84_therm_new, @@ -991,7 +991,7 @@ nv92_chipset = { .i2c = nv50_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g92_pci_new, .therm = g84_therm_new, @@ -1023,7 +1023,7 @@ nv94_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1055,7 +1055,7 @@ nv96_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = 
g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1087,7 +1087,7 @@ nv98_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g98_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1119,7 +1119,7 @@ nva0_chipset = { .i2c = nv50_i2c_new, .imem = nv50_instmem_new, .mc = g84_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1151,7 +1151,7 @@ nva3_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = gt215_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .pmu = gt215_pmu_new, @@ -1185,7 +1185,7 @@ nva5_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = gt215_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .pmu = gt215_pmu_new, @@ -1218,7 +1218,7 @@ nva8_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = gt215_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .pmu = gt215_pmu_new, @@ -1251,7 +1251,7 @@ nvaa_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g98_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1283,7 +1283,7 @@ nvac_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = g98_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .therm = g84_therm_new, @@ -1315,7 +1315,7 @@ nvaf_chipset = { .i2c = g94_i2c_new, .imem = nv50_instmem_new, .mc = gt215_mc_new, - .mmu = nv50_mmu_new, + .mmu = g84_mmu_new, .mxm = nv50_mxm_new, .pci = g94_pci_new, .pmu = gt215_pmu_new, @@ -1678,7 +1678,7 @@ nve4_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk104_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk104_pmu_new, @@ -1717,7 +1717,7 @@ nve6_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk104_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk104_pmu_new, @@ -1756,7 +1756,7 @@ nve7_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk104_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk104_pmu_new, @@ -1790,7 +1790,7 @@ nvea_chipset = { .imem = gk20a_instmem_new, .ltc = gk104_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk20a_mmu_new, .pmu = gk20a_pmu_new, .timer = gk20a_timer_new, .top = gk104_top_new, @@ -1820,7 +1820,7 @@ nvf0_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk104_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk110_pmu_new, @@ -1858,7 +1858,7 @@ nvf1_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk104_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk110_pmu_new, @@ -1896,7 +1896,7 @@ nv106_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk208_pmu_new, @@ -1934,7 +1934,7 @@ nv108_chipset = { .imem = nv50_instmem_new, .ltc = gk104_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gk208_pmu_new, @@ 
-1958,7 +1958,7 @@ nv108_chipset = { static const struct nvkm_device_chip nv117_chipset = { .name = "GM107", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .clk = gk104_clk_new, @@ -1972,7 +1972,7 @@ nv117_chipset = { .imem = nv50_instmem_new, .ltc = gm107_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gm107_pmu_new, @@ -1992,7 +1992,7 @@ nv117_chipset = { static const struct nvkm_device_chip nv118_chipset = { .name = "GM108", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .clk = gk104_clk_new, @@ -2006,7 +2006,7 @@ nv118_chipset = { .imem = nv50_instmem_new, .ltc = gm107_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gk104_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gm107_pmu_new, @@ -2026,7 +2026,7 @@ nv118_chipset = { static const struct nvkm_device_chip nv120_chipset = { .name = "GM200", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2039,7 +2039,7 @@ nv120_chipset = { .imem = nv50_instmem_new, .ltc = gm200_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gm200_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gm107_pmu_new, @@ -2061,7 +2061,7 @@ nv120_chipset = { static const struct nvkm_device_chip nv124_chipset = { .name = "GM204", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2074,7 +2074,7 @@ nv124_chipset = { .imem = nv50_instmem_new, .ltc = gm200_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gm200_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gm107_pmu_new, @@ -2096,7 +2096,7 @@ nv124_chipset = { static const struct nvkm_device_chip nv126_chipset = { .name = "GM206", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2109,7 +2109,7 @@ nv126_chipset = { .imem = nv50_instmem_new, .ltc = gm200_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gm200_mmu_new, .mxm = nv50_mxm_new, .pci = gk104_pci_new, .pmu = gm107_pmu_new, @@ -2131,7 +2131,7 @@ nv126_chipset = { static const struct nvkm_device_chip nv12b_chipset = { .name = "GM20B", - .bar = gk20a_bar_new, + .bar = gm20b_bar_new, .bus = gf100_bus_new, .clk = gm20b_clk_new, .fb = gm20b_fb_new, @@ -2140,7 +2140,7 @@ nv12b_chipset = { .imem = gk20a_instmem_new, .ltc = gm200_ltc_new, .mc = gk20a_mc_new, - .mmu = gf100_mmu_new, + .mmu = gm20b_mmu_new, .pmu = gm20b_pmu_new, .secboot = gm20b_secboot_new, .timer = gk20a_timer_new, @@ -2156,7 +2156,7 @@ nv12b_chipset = { static const struct nvkm_device_chip nv130_chipset = { .name = "GP100", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2168,7 +2168,8 @@ nv130_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .secboot = gm200_secboot_new, .pci = gp100_pci_new, .pmu = gp100_pmu_new, @@ -2190,7 +2191,7 @@ nv130_chipset = { static const struct nvkm_device_chip nv132_chipset = { .name = "GP102", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2202,7 +2203,8 @@ nv132_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = 
gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .secboot = gp102_secboot_new, .pci = gp100_pci_new, .pmu = gp102_pmu_new, @@ -2224,7 +2226,7 @@ nv132_chipset = { static const struct nvkm_device_chip nv134_chipset = { .name = "GP104", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2236,7 +2238,8 @@ nv134_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .secboot = gp102_secboot_new, .pci = gp100_pci_new, .pmu = gp102_pmu_new, @@ -2258,7 +2261,7 @@ nv134_chipset = { static const struct nvkm_device_chip nv136_chipset = { .name = "GP106", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2270,7 +2273,8 @@ nv136_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .secboot = gp102_secboot_new, .pci = gp100_pci_new, .pmu = gp102_pmu_new, @@ -2292,7 +2296,7 @@ nv136_chipset = { static const struct nvkm_device_chip nv137_chipset = { .name = "GP107", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2304,7 +2308,8 @@ nv137_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .secboot = gp102_secboot_new, .pci = gp100_pci_new, .pmu = gp102_pmu_new, @@ -2326,7 +2331,7 @@ nv137_chipset = { static const struct nvkm_device_chip nv138_chipset = { .name = "GP108", - .bar = gf100_bar_new, + .bar = gm107_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, .devinit = gm200_devinit_new, @@ -2338,7 +2343,8 @@ nv138_chipset = { .imem = nv50_instmem_new, .ltc = gp100_ltc_new, .mc = gp100_mc_new, - .mmu = gf100_mmu_new, + .mmu = gp100_mmu_new, + .therm = gp100_therm_new, .pci = gp100_pci_new, .pmu = gp102_pmu_new, .timer = gk20a_timer_new, @@ -2355,7 +2361,7 @@ nv138_chipset = { static const struct nvkm_device_chip nv13b_chipset = { .name = "GP10B", - .bar = gk20a_bar_new, + .bar = gm20b_bar_new, .bus = gf100_bus_new, .fb = gp10b_fb_new, .fuse = gm107_fuse_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.h b/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.h index 20249d8e444d..2c3c3ee3c494 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/ctrl.h @@ -1,7 +1,7 @@ #ifndef __NVKM_DEVICE_CTRL_H__ #define __NVKM_DEVICE_CTRL_H__ #define nvkm_control(p) container_of((p), struct nvkm_control, object) -#include <core/device.h> +#include <core/object.h> struct nvkm_control { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c index 74a1ffa425f7..f302d2b5782a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c @@ -1627,7 +1627,7 @@ nvkm_device_pci_new(struct pci_dev *pci_dev, const char *cfg, const char *dbg, const struct nvkm_device_pci_vendor *pciv; const char *name = NULL; struct nvkm_device_pci *pdev; - int ret; + int ret, bits; ret = pci_enable_device(pci_dev); if (ret) @@ -1679,17 +1679,17 @@ nvkm_device_pci_new(struct pci_dev *pci_dev, const char *cfg, const char *dbg, if (ret) return ret; - /* - * Set a 
preliminary DMA mask based on the .dma_bits member of the - * MMU subdevice. This allows other subdevices to create DMA mappings - * in their init() or oneinit() methods, which may be called before the - * TTM layer sets the DMA mask definitively. - * This is necessary for platforms where the default DMA mask of 32 - * does not cover any system memory, i.e., when all RAM is > 4 GB. - */ - if (pdev->device.mmu) - dma_set_mask_and_coherent(&pci_dev->dev, - DMA_BIT_MASK(pdev->device.mmu->dma_bits)); + /* Set DMA mask based on capabilities reported by the MMU subdev. */ + if (pdev->device.mmu && !pdev->device.pci->agp.bridge) + bits = pdev->device.mmu->dma_bits; + else + bits = 32; + + ret = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(bits)); + if (ret && bits != 32) { + dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32)); + pdev->device.mmu->dma_bits = 32; + } return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c index 189ed80e21ff..78597da6313a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/tegra.c @@ -136,7 +136,7 @@ nvkm_device_tegra_probe_iommu(struct nvkm_device_tegra *tdev) if (ret) goto free_domain; - ret = nvkm_mm_init(&tdev->iommu.mm, 0, + ret = nvkm_mm_init(&tdev->iommu.mm, 0, 0, (1ULL << tdev->func->iommu_bit) >> tdev->iommu.pgshift, 1); if (ret) @@ -216,7 +216,7 @@ nvkm_device_tegra_fini(struct nvkm_device *device, bool suspend) if (tdev->irq) { free_irq(tdev->irq, tdev); tdev->irq = 0; - }; + } } static int @@ -309,8 +309,6 @@ nvkm_device_tegra_new(const struct nvkm_device_tegra_func *func, /** * The IOMMU bit defines the upper limit of the GPU-addressable space. - * This will be refined in nouveau_ttm_init but we need to do it early - * for instmem to behave properly */ ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(tdev->func->iommu_bit)); if (ret) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c index 513ee6b79553..17adcb4e8854 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c @@ -206,10 +206,12 @@ nvkm_udevice_wr32(struct nvkm_object *object, u64 addr, u32 data) } static int -nvkm_udevice_map(struct nvkm_object *object, u64 *addr, u32 *size) +nvkm_udevice_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) { struct nvkm_udevice *udev = nvkm_udevice(object); struct nvkm_device *device = udev->device; + *type = NVKM_OBJECT_MAP_IO; *addr = device->func->resource_addr(device, 0); *size = device->func->resource_size(device, 0); return 0; @@ -292,6 +294,11 @@ nvkm_udevice_child_get(struct nvkm_object *object, int index, if (!sclass) { switch (index) { case 0: sclass = &nvkm_control_oclass; break; + case 1: + if (!device->mmu) + return -EINVAL; + sclass = &device->mmu->user; + break; default: return -EINVAL; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c index 0c0310498afd..723dcbde2ac2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c @@ -191,11 +191,13 @@ nv50_disp_chan_ntfy(struct nvkm_object *object, u32 type, } static int -nv50_disp_chan_map(struct nvkm_object *object, u64 *addr, u32 *size) +nv50_disp_chan_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 
*addr, u64 *size) { struct nv50_disp_chan *chan = nv50_disp_chan(object); struct nv50_disp *disp = chan->root->disp; struct nvkm_device *device = disp->base.engine.subdev.device; + *type = NVKM_OBJECT_MAP_IO; *addr = device->func->resource_addr(device, 0) + 0x640000 + (chan->chid.user * 0x1000); *size = 0x001000; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.h index 737b38f6fbd2..9bb4ad5b0e57 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.h @@ -1,6 +1,7 @@ #ifndef __NV50_DISP_CHAN_H__ #define __NV50_DISP_CHAN_H__ #define nv50_disp_chan(p) container_of((p), struct nv50_disp_chan, object) +#include <core/object.h> #include "nv50.h" struct nv50_disp_chan { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h index a1e8bf48b778..c9e0a8f7b5d5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h @@ -147,7 +147,7 @@ void gf119_hda_eld(struct nvkm_ior *, u8 *, u8); #define IOR_MSG(i,l,f,a...) do { \ struct nvkm_ior *_ior = (i); \ - nvkm_##l(&_ior->disp->engine.subdev, "%s: "f, _ior->name, ##a); \ + nvkm_##l(&_ior->disp->engine.subdev, "%s: "f"\n", _ior->name, ##a); \ } while(0) #define IOR_WARN(i,f,a...) IOR_MSG((i), warn, f, ##a) #define IOR_DBG(i,f,a...) IOR_MSG((i), debug, f, ##a) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/dma/usernv04.c b/drivers/gpu/drm/nouveau/nvkm/engine/dma/usernv04.c index c95942ef8216..49ef7e57aad4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/dma/usernv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/dma/usernv04.c @@ -26,7 +26,7 @@ #include <core/gpuobj.h> #include <subdev/fb.h> -#include <subdev/mmu/nv04.h> +#include <subdev/mmu/vmm.h> #include <nvif/class.h> @@ -49,8 +49,8 @@ nv04_dmaobj_bind(struct nvkm_dmaobj *base, struct nvkm_gpuobj *parent, int ret; if (dmaobj->clone) { - struct nv04_mmu *mmu = nv04_mmu(device->mmu); - struct nvkm_memory *pgt = mmu->vm->pgt[0].mem[0]; + struct nvkm_memory *pgt = + device->mmu->vmm->pd->pt[0]->memory; if (!dmaobj->base.start) return nvkm_gpuobj_wrap(pgt, pgpuobj); nvkm_kmap(pgt); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/falcon.c b/drivers/gpu/drm/nouveau/nvkm/engine/falcon.c index 2e7b4e2105ef..816ccaedfc73 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/falcon.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/falcon.c @@ -99,7 +99,7 @@ nvkm_falcon_fini(struct nvkm_engine *engine, bool suspend) const u32 base = falcon->addr; if (!suspend) { - nvkm_memory_del(&falcon->core); + nvkm_memory_unref(&falcon->core); if (falcon->external) { vfree(falcon->data.data); vfree(falcon->code.data); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c index 660ca7aa95ea..64f6b7654a08 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c @@ -27,6 +27,7 @@ #include <core/client.h> #include <core/gpuobj.h> #include <core/notify.h> +#include <subdev/mc.h> #include <nvif/event.h> #include <nvif/unpack.h> @@ -278,6 +279,12 @@ nvkm_fifo_oneinit(struct nvkm_engine *engine) return 0; } +static void +nvkm_fifo_preinit(struct nvkm_engine *engine) +{ + nvkm_mc_reset(engine->subdev.device, NVKM_ENGINE_FIFO); +} + static int nvkm_fifo_init(struct nvkm_engine *engine) { @@ -302,6 +309,7 @@ nvkm_fifo_dtor(struct nvkm_engine *engine) static const struct nvkm_engine_func nvkm_fifo 
= { .dtor = nvkm_fifo_dtor, + .preinit = nvkm_fifo_preinit, .oneinit = nvkm_fifo_oneinit, .init = nvkm_fifo_init, .fini = nvkm_fifo_fini, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chan.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chan.c index fab760ae922f..d83485385934 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chan.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chan.c @@ -117,8 +117,8 @@ nvkm_fifo_chan_child_del(struct nvkm_oproxy *base) if (chan->func->engine_dtor) chan->func->engine_dtor(chan, engine); nvkm_object_del(&engn->object); - if (chan->vm) - atomic_dec(&chan->vm->engref[engine->subdev.index]); + if (chan->vmm) + atomic_dec(&chan->vmm->engref[engine->subdev.index]); } } @@ -151,8 +151,8 @@ nvkm_fifo_chan_child_new(const struct nvkm_oclass *oclass, void *data, u32 size, .engine = oclass->engine, }; - if (chan->vm) - atomic_inc(&chan->vm->engref[engine->subdev.index]); + if (chan->vmm) + atomic_inc(&chan->vmm->engref[engine->subdev.index]); if (engine->func->fifo.cclass) { ret = engine->func->fifo.cclass(chan, &cclass, @@ -253,9 +253,11 @@ nvkm_fifo_chan_ntfy(struct nvkm_object *object, u32 type, } static int -nvkm_fifo_chan_map(struct nvkm_object *object, u64 *addr, u32 *size) +nvkm_fifo_chan_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *addr, u64 *size) { struct nvkm_fifo_chan *chan = nvkm_fifo_chan(object); + *type = NVKM_OBJECT_MAP_IO; *addr = chan->addr; *size = chan->size; return 0; @@ -325,7 +327,10 @@ nvkm_fifo_chan_dtor(struct nvkm_object *object) if (chan->user) iounmap(chan->user); - nvkm_vm_ref(NULL, &chan->vm, NULL); + if (chan->vmm) { + nvkm_vmm_part(chan->vmm, chan->inst->memory); + nvkm_vmm_unref(&chan->vmm); + } nvkm_gpuobj_del(&chan->push); nvkm_gpuobj_del(&chan->inst); @@ -347,13 +352,12 @@ nvkm_fifo_chan_func = { int nvkm_fifo_chan_ctor(const struct nvkm_fifo_chan_func *func, struct nvkm_fifo *fifo, u32 size, u32 align, bool zero, - u64 vm, u64 push, u64 engines, int bar, u32 base, u32 user, - const struct nvkm_oclass *oclass, + u64 hvmm, u64 push, u64 engines, int bar, u32 base, + u32 user, const struct nvkm_oclass *oclass, struct nvkm_fifo_chan *chan) { struct nvkm_client *client = oclass->client; struct nvkm_device *device = fifo->engine.subdev.device; - struct nvkm_mmu *mmu = device->mmu; struct nvkm_dmaobj *dmaobj; unsigned long flags; int ret; @@ -382,16 +386,19 @@ nvkm_fifo_chan_ctor(const struct nvkm_fifo_chan_func *func, } /* channel address space */ - if (!vm && mmu) { - if (!client->vm || client->vm->mmu == mmu) { - ret = nvkm_vm_ref(client->vm, &chan->vm, NULL); - if (ret) - return ret; - } else { + if (hvmm) { + struct nvkm_vmm *vmm = nvkm_uvmm_search(client, hvmm); + if (IS_ERR(vmm)) + return PTR_ERR(vmm); + + if (vmm->mmu != device->mmu) return -EINVAL; - } - } else { - return -ENOENT; + + ret = nvkm_vmm_join(vmm, chan->inst->memory); + if (ret) + return ret; + + chan->vmm = nvkm_vmm_ref(vmm); } /* allocate channel id */ diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c index 61797c4dd07a..a5c998fe4485 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/chang84.c @@ -229,15 +229,18 @@ g84_fifo_chan_func = { }; int -g84_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vm, u64 push, +g84_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vmm, u64 push, const struct nvkm_oclass *oclass, struct nv50_fifo_chan *chan) { struct nvkm_device *device = 
fifo->base.engine.subdev.device; int ret; + if (!vmm) + return -EINVAL; + ret = nvkm_fifo_chan_ctor(&g84_fifo_chan_func, &fifo->base, - 0x10000, 0x1000, false, vm, push, + 0x10000, 0x1000, false, vmm, push, (1ULL << NVKM_ENGINE_BSP) | (1ULL << NVKM_ENGINE_CE0) | (1ULL << NVKM_ENGINE_CIPHER) | @@ -277,9 +280,5 @@ g84_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vm, u64 push, if (ret) return ret; - ret = nvkm_ramht_new(device, 0x8000, 16, chan->base.inst, &chan->ramht); - if (ret) - return ret; - - return nvkm_vm_ref(chan->base.vm, &chan->vm, chan->pgd); + return nvkm_ramht_new(device, 0x8000, 16, chan->base.inst, &chan->ramht); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changf100.h index 7d697e2dce1a..fc1142af02cf 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changf100.h @@ -11,12 +11,9 @@ struct gf100_fifo_chan { struct list_head head; bool killed; - struct nvkm_gpuobj *pgd; - struct nvkm_vm *vm; - struct { struct nvkm_gpuobj *inst; - struct nvkm_vma vma; + struct nvkm_vma *vma; } engn[NVKM_SUBDEV_NR]; }; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changk104.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changk104.h index 230f64e5f731..5beb5c628473 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changk104.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/changk104.h @@ -12,12 +12,9 @@ struct gk104_fifo_chan { struct list_head head; bool killed; - struct nvkm_gpuobj *pgd; - struct nvkm_vm *vm; - struct { struct nvkm_gpuobj *inst; - struct nvkm_vma vma; + struct nvkm_vma *vma; } engn[NVKM_SUBDEV_NR]; }; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.c index 25b60aff40e4..85f7dbf53c99 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.c @@ -206,7 +206,6 @@ void * nv50_fifo_chan_dtor(struct nvkm_fifo_chan *base) { struct nv50_fifo_chan *chan = nv50_fifo_chan(base); - nvkm_vm_ref(NULL, &chan->vm, chan->pgd); nvkm_ramht_del(&chan->ramht); nvkm_gpuobj_del(&chan->pgd); nvkm_gpuobj_del(&chan->eng); @@ -229,15 +228,18 @@ nv50_fifo_chan_func = { }; int -nv50_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vm, u64 push, +nv50_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vmm, u64 push, const struct nvkm_oclass *oclass, struct nv50_fifo_chan *chan) { struct nvkm_device *device = fifo->base.engine.subdev.device; int ret; + if (!vmm) + return -EINVAL; + ret = nvkm_fifo_chan_ctor(&nv50_fifo_chan_func, &fifo->base, - 0x10000, 0x1000, false, vm, push, + 0x10000, 0x1000, false, vmm, push, (1ULL << NVKM_ENGINE_DMAOBJ) | (1ULL << NVKM_ENGINE_SW) | (1ULL << NVKM_ENGINE_GR) | @@ -262,9 +264,5 @@ nv50_fifo_chan_ctor(struct nv50_fifo *fifo, u64 vm, u64 push, if (ret) return ret; - ret = nvkm_ramht_new(device, 0x8000, 16, chan->base.inst, &chan->ramht); - if (ret) - return ret; - - return nvkm_vm_ref(chan->base.vm, &chan->vm, chan->pgd); + return nvkm_ramht_new(device, 0x8000, 16, chan->base.inst, &chan->ramht); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.h index 4b9da469b704..d853056e040b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/channv50.h @@ -13,19 +13,18 @@ struct nv50_fifo_chan { struct nvkm_gpuobj *eng; struct nvkm_gpuobj *pgd; struct nvkm_ramht *ramht; - struct nvkm_vm *vm; struct nvkm_gpuobj 
*engn[NVKM_SUBDEV_NR]; }; -int nv50_fifo_chan_ctor(struct nv50_fifo *, u64 vm, u64 push, +int nv50_fifo_chan_ctor(struct nv50_fifo *, u64 vmm, u64 push, const struct nvkm_oclass *, struct nv50_fifo_chan *); void *nv50_fifo_chan_dtor(struct nvkm_fifo_chan *); void nv50_fifo_chan_fini(struct nvkm_fifo_chan *); void nv50_fifo_chan_engine_dtor(struct nvkm_fifo_chan *, struct nvkm_engine *); void nv50_fifo_chan_object_dtor(struct nvkm_fifo_chan *, int); -int g84_fifo_chan_ctor(struct nv50_fifo *, u64 vm, u64 push, +int g84_fifo_chan_ctor(struct nv50_fifo *, u64 vmm, u64 push, const struct nvkm_oclass *, struct nv50_fifo_chan *); extern const struct nvkm_fifo_chan_oclass nv50_fifo_dma_oclass; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmag84.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmag84.c index caa914074752..fc34cddcd2f5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmag84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmag84.c @@ -44,9 +44,9 @@ g84_fifo_dma_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel dma size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel dma vers %d vm %llx " + nvif_ioctl(parent, "create channel dma vers %d vmm %llx " "pushbuf %llx offset %016llx\n", - args->v0.version, args->v0.vm, args->v0.pushbuf, + args->v0.version, args->v0.vmm, args->v0.pushbuf, args->v0.offset); if (!args->v0.pushbuf) return -EINVAL; @@ -57,7 +57,7 @@ g84_fifo_dma_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, return -ENOMEM; *pobject = &chan->base.object; - ret = g84_fifo_chan_ctor(fifo, args->v0.vm, args->v0.pushbuf, + ret = g84_fifo_chan_ctor(fifo, args->v0.vmm, args->v0.pushbuf, oclass, chan); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv04.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv04.c index 0a7b6ed5ed28..c213122cf088 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv04.c @@ -95,6 +95,7 @@ nv04_fifo_dma_fini(struct nvkm_fifo_chan *base) nvkm_mask(device, NV04_PFIFO_CACHE1_PULL0, 0x00000001, 0); c = fifo->ramfc; + nvkm_kmap(fctx); do { u32 rm = ((1ULL << c->bits) - 1) << c->regs; u32 cm = ((1ULL << c->bits) - 1) << c->ctxs; @@ -102,6 +103,7 @@ nv04_fifo_dma_fini(struct nvkm_fifo_chan *base) u32 cv = (nvkm_ro32(fctx, c->ctxp + data) & ~cm); nvkm_wo32(fctx, c->ctxp + data, cv | (rv << c->ctxs)); } while ((++c)->bits); + nvkm_done(fctx); c = fifo->ramfc; do { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv50.c index 480bc3777be5..8043718ad150 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/dmanv50.c @@ -44,9 +44,9 @@ nv50_fifo_dma_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel dma size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel dma vers %d vm %llx " + nvif_ioctl(parent, "create channel dma vers %d vmm %llx " "pushbuf %llx offset %016llx\n", - args->v0.version, args->v0.vm, args->v0.pushbuf, + args->v0.version, args->v0.vmm, args->v0.pushbuf, args->v0.offset); if (!args->v0.pushbuf) return -EINVAL; @@ -57,7 +57,7 @@ nv50_fifo_dma_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, return -ENOMEM; *pobject = &chan->base.object; - ret = nv50_fifo_chan_ctor(fifo, 
args->v0.vm, args->v0.pushbuf, + ret = nv50_fifo_chan_ctor(fifo, args->v0.vmm, args->v0.pushbuf, oclass, chan); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.c index cd468ab1db12..f69576868164 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.c @@ -559,6 +559,7 @@ gf100_fifo_oneinit(struct nvkm_fifo *base) struct gf100_fifo *fifo = gf100_fifo(base); struct nvkm_subdev *subdev = &fifo->base.engine.subdev; struct nvkm_device *device = subdev->device; + struct nvkm_vmm *bar = nvkm_bar_bar1_vmm(device); int ret; /* Determine number of PBDMAs by checking valid enable bits. */ @@ -584,12 +585,12 @@ gf100_fifo_oneinit(struct nvkm_fifo *base) if (ret) return ret; - ret = nvkm_bar_umap(device->bar, 128 * 0x1000, 12, &fifo->user.bar); + ret = nvkm_vmm_get(bar, 12, nvkm_memory_size(fifo->user.mem), + &fifo->user.bar); if (ret) return ret; - nvkm_memory_map(fifo->user.mem, &fifo->user.bar, 0); - return 0; + return nvkm_memory_map(fifo->user.mem, 0, bar, fifo->user.bar, NULL, 0); } static void @@ -628,7 +629,7 @@ gf100_fifo_init(struct nvkm_fifo *base) } nvkm_mask(device, 0x002200, 0x00000001, 0x00000001); - nvkm_wr32(device, 0x002254, 0x10000000 | fifo->user.bar.offset >> 12); + nvkm_wr32(device, 0x002254, 0x10000000 | fifo->user.bar->addr >> 12); nvkm_wr32(device, 0x002100, 0xffffffff); nvkm_wr32(device, 0x002140, 0x7fffffff); @@ -639,10 +640,11 @@ static void * gf100_fifo_dtor(struct nvkm_fifo *base) { struct gf100_fifo *fifo = gf100_fifo(base); - nvkm_vm_put(&fifo->user.bar); - nvkm_memory_del(&fifo->user.mem); - nvkm_memory_del(&fifo->runlist.mem[0]); - nvkm_memory_del(&fifo->runlist.mem[1]); + struct nvkm_device *device = fifo->base.engine.subdev.device; + nvkm_vmm_put(nvkm_bar_bar1_vmm(device), &fifo->user.bar); + nvkm_memory_unref(&fifo->user.mem); + nvkm_memory_unref(&fifo->runlist.mem[0]); + nvkm_memory_unref(&fifo->runlist.mem[1]); return fifo; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.h index 70db58eab9c3..b81a2ad48aa4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gf100.h @@ -26,7 +26,7 @@ struct gf100_fifo { struct { struct nvkm_memory *mem; - struct nvkm_vma bar; + struct nvkm_vma *bar; } user; }; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c index a7e55c422501..84bd703dd897 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c @@ -771,6 +771,7 @@ gk104_fifo_oneinit(struct nvkm_fifo *base) struct gk104_fifo *fifo = gk104_fifo(base); struct nvkm_subdev *subdev = &fifo->base.engine.subdev; struct nvkm_device *device = subdev->device; + struct nvkm_vmm *bar = nvkm_bar_bar1_vmm(device); int engn, runl, pbid, ret, i, j; enum nvkm_devidx engidx; u32 *map; @@ -834,13 +835,12 @@ gk104_fifo_oneinit(struct nvkm_fifo *base) if (ret) return ret; - ret = nvkm_bar_umap(device->bar, fifo->base.nr * 0x200, 12, - &fifo->user.bar); + ret = nvkm_vmm_get(bar, 12, nvkm_memory_size(fifo->user.mem), + &fifo->user.bar); if (ret) return ret; - nvkm_memory_map(fifo->user.mem, &fifo->user.bar, 0); - return 0; + return nvkm_memory_map(fifo->user.mem, 0, bar, fifo->user.bar, NULL, 0); } static void @@ -866,7 +866,7 @@ gk104_fifo_init(struct nvkm_fifo *base) nvkm_wr32(device, 0x04014c + (i * 0x2000), 
0xffffffff); /* INTREN */ } - nvkm_wr32(device, 0x002254, 0x10000000 | fifo->user.bar.offset >> 12); + nvkm_wr32(device, 0x002254, 0x10000000 | fifo->user.bar->addr >> 12); nvkm_wr32(device, 0x002100, 0xffffffff); nvkm_wr32(device, 0x002140, 0x7fffffff); @@ -876,14 +876,15 @@ static void * gk104_fifo_dtor(struct nvkm_fifo *base) { struct gk104_fifo *fifo = gk104_fifo(base); + struct nvkm_device *device = fifo->base.engine.subdev.device; int i; - nvkm_vm_put(&fifo->user.bar); - nvkm_memory_del(&fifo->user.mem); + nvkm_vmm_put(nvkm_bar_bar1_vmm(device), &fifo->user.bar); + nvkm_memory_unref(&fifo->user.mem); for (i = 0; i < fifo->runlist_nr; i++) { - nvkm_memory_del(&fifo->runlist[i].mem[1]); - nvkm_memory_del(&fifo->runlist[i].mem[0]); + nvkm_memory_unref(&fifo->runlist[i].mem[1]); + nvkm_memory_unref(&fifo->runlist[i].mem[0]); } return fifo; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.h b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.h index 44bff98d6725..466f1051f91a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.h @@ -37,7 +37,7 @@ struct gk104_fifo { struct { struct nvkm_memory *mem; - struct nvkm_vma bar; + struct nvkm_vma *bar; } user; }; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifog84.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifog84.c index 77c2f2a28bf3..2121f517b1dd 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifog84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifog84.c @@ -45,10 +45,10 @@ g84_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel gpfifo size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel gpfifo vers %d vm %llx " + nvif_ioctl(parent, "create channel gpfifo vers %d vmm %llx " "pushbuf %llx ioffset %016llx " "ilength %08x\n", - args->v0.version, args->v0.vm, args->v0.pushbuf, + args->v0.version, args->v0.vmm, args->v0.pushbuf, args->v0.ioffset, args->v0.ilength); if (!args->v0.pushbuf) return -EINVAL; @@ -59,7 +59,7 @@ g84_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, return -ENOMEM; *pobject = &chan->base.object; - ret = g84_fifo_chan_ctor(fifo, args->v0.vm, args->v0.pushbuf, + ret = g84_fifo_chan_ctor(fifo, args->v0.vmm, args->v0.pushbuf, oclass, chan); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogf100.c index f9e0377d3d24..75f9632789b3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogf100.c @@ -111,7 +111,7 @@ gf100_fifo_gpfifo_engine_init(struct nvkm_fifo_chan *base, struct nvkm_gpuobj *inst = chan->base.inst; if (offset) { - u64 addr = chan->engn[engine->subdev.index].vma.offset; + u64 addr = chan->engn[engine->subdev.index].vma->addr; nvkm_kmap(inst); nvkm_wo32(inst, offset + 0x00, lower_32_bits(addr) | 4); nvkm_wo32(inst, offset + 0x04, upper_32_bits(addr)); @@ -126,7 +126,7 @@ gf100_fifo_gpfifo_engine_dtor(struct nvkm_fifo_chan *base, struct nvkm_engine *engine) { struct gf100_fifo_chan *chan = gf100_fifo_chan(base); - nvkm_gpuobj_unmap(&chan->engn[engine->subdev.index].vma); + nvkm_vmm_put(chan->base.vmm, &chan->engn[engine->subdev.index].vma); nvkm_gpuobj_del(&chan->engn[engine->subdev.index].inst); } @@ -146,8 +146,13 @@ gf100_fifo_gpfifo_engine_ctor(struct nvkm_fifo_chan *base, if (ret) return ret; - return 
nvkm_gpuobj_map(chan->engn[engn].inst, chan->vm, - NV_MEM_ACCESS_RW, &chan->engn[engn].vma); + ret = nvkm_vmm_get(chan->base.vmm, 12, chan->engn[engn].inst->size, + &chan->engn[engn].vma); + if (ret) + return ret; + + return nvkm_memory_map(chan->engn[engn].inst, 0, chan->base.vmm, + chan->engn[engn].vma, NULL, 0); } static void @@ -190,10 +195,7 @@ gf100_fifo_gpfifo_init(struct nvkm_fifo_chan *base) static void * gf100_fifo_gpfifo_dtor(struct nvkm_fifo_chan *base) { - struct gf100_fifo_chan *chan = gf100_fifo_chan(base); - nvkm_vm_ref(NULL, &chan->vm, chan->pgd); - nvkm_gpuobj_del(&chan->pgd); - return chan; + return gf100_fifo_chan(base); } static const struct nvkm_fifo_chan_func @@ -216,7 +218,6 @@ gf100_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, struct fermi_channel_gpfifo_v0 v0; } *args = data; struct gf100_fifo *fifo = gf100_fifo(base); - struct nvkm_device *device = fifo->base.engine.subdev.device; struct nvkm_object *parent = oclass->parent; struct gf100_fifo_chan *chan; u64 usermem, ioffset, ilength; @@ -224,10 +225,12 @@ gf100_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel gpfifo size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel gpfifo vers %d vm %llx " + nvif_ioctl(parent, "create channel gpfifo vers %d vmm %llx " "ioffset %016llx ilength %08x\n", - args->v0.version, args->v0.vm, args->v0.ioffset, + args->v0.version, args->v0.vmm, args->v0.ioffset, args->v0.ilength); + if (!args->v0.vmm) + return -EINVAL; } else return ret; @@ -239,7 +242,7 @@ gf100_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, INIT_LIST_HEAD(&chan->head); ret = nvkm_fifo_chan_ctor(&gf100_fifo_gpfifo_func, &fifo->base, - 0x1000, 0x1000, true, args->v0.vm, 0, + 0x1000, 0x1000, true, args->v0.vmm, 0, (1ULL << NVKM_ENGINE_CE0) | (1ULL << NVKM_ENGINE_CE1) | (1ULL << NVKM_ENGINE_GR) | @@ -247,29 +250,13 @@ gf100_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, (1ULL << NVKM_ENGINE_MSPPP) | (1ULL << NVKM_ENGINE_MSVLD) | (1ULL << NVKM_ENGINE_SW), - 1, fifo->user.bar.offset, 0x1000, + 1, fifo->user.bar->addr, 0x1000, oclass, &chan->base); if (ret) return ret; args->v0.chid = chan->base.chid; - /* page directory */ - ret = nvkm_gpuobj_new(device, 0x10000, 0x1000, false, NULL, &chan->pgd); - if (ret) - return ret; - - nvkm_kmap(chan->base.inst); - nvkm_wo32(chan->base.inst, 0x0200, lower_32_bits(chan->pgd->addr)); - nvkm_wo32(chan->base.inst, 0x0204, upper_32_bits(chan->pgd->addr)); - nvkm_wo32(chan->base.inst, 0x0208, 0xffffffff); - nvkm_wo32(chan->base.inst, 0x020c, 0x000000ff); - nvkm_done(chan->base.inst); - - ret = nvkm_vm_ref(chan->base.vm, &chan->vm, chan->pgd); - if (ret) - return ret; - /* clear channel control registers */ usermem = chan->base.chid * 0x1000; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogk104.c index 8abf6f8ef445..80c87521bebe 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifogk104.c @@ -117,7 +117,7 @@ gk104_fifo_gpfifo_engine_init(struct nvkm_fifo_chan *base, u32 offset = gk104_fifo_gpfifo_engine_addr(engine); if (offset) { - u64 addr = chan->engn[engine->subdev.index].vma.offset; + u64 addr = chan->engn[engine->subdev.index].vma->addr; u32 datalo = lower_32_bits(addr) | 0x00000004; u32 datahi = upper_32_bits(addr); nvkm_kmap(inst); @@ 
-138,7 +138,7 @@ gk104_fifo_gpfifo_engine_dtor(struct nvkm_fifo_chan *base, struct nvkm_engine *engine) { struct gk104_fifo_chan *chan = gk104_fifo_chan(base); - nvkm_gpuobj_unmap(&chan->engn[engine->subdev.index].vma); + nvkm_vmm_put(chan->base.vmm, &chan->engn[engine->subdev.index].vma); nvkm_gpuobj_del(&chan->engn[engine->subdev.index].inst); } @@ -158,8 +158,13 @@ gk104_fifo_gpfifo_engine_ctor(struct nvkm_fifo_chan *base, if (ret) return ret; - return nvkm_gpuobj_map(chan->engn[engn].inst, chan->vm, - NV_MEM_ACCESS_RW, &chan->engn[engn].vma); + ret = nvkm_vmm_get(chan->base.vmm, 12, chan->engn[engn].inst->size, + &chan->engn[engn].vma); + if (ret) + return ret; + + return nvkm_memory_map(chan->engn[engn].inst, 0, chan->base.vmm, + chan->engn[engn].vma, NULL, 0); } static void @@ -203,10 +208,7 @@ gk104_fifo_gpfifo_init(struct nvkm_fifo_chan *base) static void * gk104_fifo_gpfifo_dtor(struct nvkm_fifo_chan *base) { - struct gk104_fifo_chan *chan = gk104_fifo_chan(base); - nvkm_vm_ref(NULL, &chan->vm, chan->pgd); - nvkm_gpuobj_del(&chan->pgd); - return chan; + return gk104_fifo_chan(base); } static const struct nvkm_fifo_chan_func @@ -229,17 +231,19 @@ struct gk104_fifo_chan_func { static int gk104_fifo_gpfifo_new_(const struct gk104_fifo_chan_func *func, struct gk104_fifo *fifo, u32 *engmask, u16 *chid, - u64 vm, u64 ioffset, u64 ilength, + u64 vmm, u64 ioffset, u64 ilength, const struct nvkm_oclass *oclass, struct nvkm_object **pobject) { - struct nvkm_device *device = fifo->base.engine.subdev.device; struct gk104_fifo_chan *chan; int runlist = -1, ret = -ENOSYS, i, j; u32 engines = 0, present = 0; u64 subdevs = 0; u64 usermem; + if (!vmm) + return -EINVAL; + /* Determine which downstream engines are present */ for (i = 0; i < fifo->engine_nr; i++) { struct nvkm_engine *engine = fifo->engine[i].engine; @@ -285,30 +289,14 @@ gk104_fifo_gpfifo_new_(const struct gk104_fifo_chan_func *func, INIT_LIST_HEAD(&chan->head); ret = nvkm_fifo_chan_ctor(&gk104_fifo_gpfifo_func, &fifo->base, - 0x1000, 0x1000, true, vm, 0, subdevs, - 1, fifo->user.bar.offset, 0x200, + 0x1000, 0x1000, true, vmm, 0, subdevs, + 1, fifo->user.bar->addr, 0x200, oclass, &chan->base); if (ret) return ret; *chid = chan->base.chid; - /* Page directory. */ - ret = nvkm_gpuobj_new(device, 0x10000, 0x1000, false, NULL, &chan->pgd); - if (ret) - return ret; - - nvkm_kmap(chan->base.inst); - nvkm_wo32(chan->base.inst, 0x0200, lower_32_bits(chan->pgd->addr)); - nvkm_wo32(chan->base.inst, 0x0204, upper_32_bits(chan->pgd->addr)); - nvkm_wo32(chan->base.inst, 0x0208, 0xffffffff); - nvkm_wo32(chan->base.inst, 0x020c, 0x000000ff); - nvkm_done(chan->base.inst); - - ret = nvkm_vm_ref(chan->base.vm, &chan->vm, chan->pgd); - if (ret) - return ret; - /* Clear channel control registers. 
*/ usermem = chan->base.chid * 0x200; ilength = order_base_2(ilength / 8); @@ -373,18 +361,17 @@ gk104_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel gpfifo size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel gpfifo vers %d vm %llx " + nvif_ioctl(parent, "create channel gpfifo vers %d vmm %llx " "ioffset %016llx ilength %08x engine %08x\n", - args->v0.version, args->v0.vm, args->v0.ioffset, + args->v0.version, args->v0.vmm, args->v0.ioffset, args->v0.ilength, args->v0.engines); return gk104_fifo_gpfifo_new_(gk104_fifo_gpfifo, fifo, &args->v0.engines, &args->v0.chid, - args->v0.vm, + args->v0.vmm, args->v0.ioffset, args->v0.ilength, oclass, pobject); - } return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifonv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifonv50.c index c5a7de9db259..d8f28ec1e4a8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifonv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gpfifonv50.c @@ -45,10 +45,10 @@ nv50_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, nvif_ioctl(parent, "create channel gpfifo size %d\n", size); if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - nvif_ioctl(parent, "create channel gpfifo vers %d vm %llx " + nvif_ioctl(parent, "create channel gpfifo vers %d vmm %llx " "pushbuf %llx ioffset %016llx " "ilength %08x\n", - args->v0.version, args->v0.vm, args->v0.pushbuf, + args->v0.version, args->v0.vmm, args->v0.pushbuf, args->v0.ioffset, args->v0.ilength); if (!args->v0.pushbuf) return -EINVAL; @@ -59,7 +59,7 @@ nv50_fifo_gpfifo_new(struct nvkm_fifo *base, const struct nvkm_oclass *oclass, return -ENOMEM; *pobject = &chan->base.object; - ret = nv50_fifo_chan_ctor(fifo, args->v0.vm, args->v0.pushbuf, + ret = nv50_fifo_chan_ctor(fifo, args->v0.vmm, args->v0.pushbuf, oclass, chan); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv50.c index 66eb12c2b5ba..fa6e094d8068 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/nv50.c @@ -100,8 +100,8 @@ void * nv50_fifo_dtor(struct nvkm_fifo *base) { struct nv50_fifo *fifo = nv50_fifo(base); - nvkm_memory_del(&fifo->runlist[1]); - nvkm_memory_del(&fifo->runlist[0]); + nvkm_memory_unref(&fifo->runlist[1]); + nvkm_memory_unref(&fifo->runlist[0]); return fifo; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c index bc77eea351a5..881015080d83 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.c @@ -986,14 +986,14 @@ gf100_grctx_pack_tpc[] = { ******************************************************************************/ int -gf100_grctx_mmio_data(struct gf100_grctx *info, u32 size, u32 align, u32 access) +gf100_grctx_mmio_data(struct gf100_grctx *info, u32 size, u32 align, bool priv) { if (info->data) { info->buffer[info->buffer_nr] = round_up(info->addr, align); info->addr = info->buffer[info->buffer_nr] + size; info->data->size = size; info->data->align = align; - info->data->access = access; + info->data->priv = priv; info->data++; return info->buffer_nr++; } @@ -1028,9 +1028,8 @@ void gf100_grctx_generate_bundle(struct gf100_grctx *info) { const struct gf100_grctx_func *grctx = info->gr->func->grctx; - const u32 access = 
NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->bundle_size, (1 << s), access); + const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true); mmio_refn(info, 0x408004, 0x00000000, s, b); mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s)); mmio_refn(info, 0x418808, 0x00000000, s, b); @@ -1041,9 +1040,8 @@ void gf100_grctx_generate_pagepool(struct gf100_grctx *info) { const struct gf100_grctx_func *grctx = info->gr->func->grctx; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), access); + const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true); mmio_refn(info, 0x40800c, 0x00000000, s, b); mmio_wr32(info, 0x408010, 0x80000000); mmio_refn(info, 0x419004, 0x00000000, s, b); @@ -1057,9 +1055,8 @@ gf100_grctx_generate_attrib(struct gf100_grctx *info) const struct gf100_grctx_func *grctx = gr->func->grctx; const u32 attrib = grctx->attrib_nr; const u32 size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), access); + const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false); int gpc, tpc; u32 bo = 0; @@ -1267,85 +1264,87 @@ gf100_grctx_generate_main(struct gf100_gr *gr, struct gf100_grctx *info) nvkm_mc_unk260(device, 1); } +#define CB_RESERVED 0x80000 + int gf100_grctx_generate(struct gf100_gr *gr) { const struct gf100_grctx_func *grctx = gr->func->grctx; struct nvkm_subdev *subdev = &gr->base.engine.subdev; struct nvkm_device *device = subdev->device; - struct nvkm_memory *chan; + struct nvkm_memory *inst = NULL; + struct nvkm_memory *data = NULL; + struct nvkm_vmm *vmm = NULL; + struct nvkm_vma *ctx = NULL; struct gf100_grctx info; int ret, i; u64 addr; - /* allocate memory to for a "channel", which we'll use to generate - * the default context values + /* Allocate memory to for a "channel", which we'll use to generate + * the default context values. 
*/ - ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x80000 + gr->size, - 0x1000, true, &chan); - if (ret) { - nvkm_error(subdev, "failed to allocate chan memory, %d\n", ret); - return ret; - } + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, + 0x1000, 0x1000, true, &inst); + if (ret) + goto done; - addr = nvkm_memory_addr(chan); + ret = nvkm_vmm_new(device, 0, 0, NULL, 0, NULL, "grctx", &vmm); + if (ret) + goto done; - /* PGD pointer */ - nvkm_kmap(chan); - nvkm_wo32(chan, 0x0200, lower_32_bits(addr + 0x1000)); - nvkm_wo32(chan, 0x0204, upper_32_bits(addr + 0x1000)); - nvkm_wo32(chan, 0x0208, 0xffffffff); - nvkm_wo32(chan, 0x020c, 0x000000ff); + vmm->debug = subdev->debug; - /* PGT[0] pointer */ - nvkm_wo32(chan, 0x1000, 0x00000000); - nvkm_wo32(chan, 0x1004, 0x00000001 | (addr + 0x2000) >> 8); + ret = nvkm_vmm_join(vmm, inst); + if (ret) + goto done; - /* identity-map the whole "channel" into its own vm */ - for (i = 0; i < nvkm_memory_size(chan) / 4096; i++) { - u64 addr = ((nvkm_memory_addr(chan) + (i * 4096)) >> 8) | 1; - nvkm_wo32(chan, 0x2000 + (i * 8), lower_32_bits(addr)); - nvkm_wo32(chan, 0x2004 + (i * 8), upper_32_bits(addr)); - } + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, + CB_RESERVED + gr->size, 0, true, &data); + if (ret) + goto done; - /* context pointer (virt) */ - nvkm_wo32(chan, 0x0210, 0x00080004); - nvkm_wo32(chan, 0x0214, 0x00000000); - nvkm_done(chan); + ret = nvkm_vmm_get(vmm, 0, nvkm_memory_size(data), &ctx); + if (ret) + goto done; - nvkm_wr32(device, 0x100cb8, (addr + 0x1000) >> 8); - nvkm_wr32(device, 0x100cbc, 0x80000001); - nvkm_msec(device, 2000, - if (nvkm_rd32(device, 0x100c80) & 0x00008000) - break; - ); + ret = nvkm_memory_map(data, 0, vmm, ctx, NULL, 0); + if (ret) + goto done; + + + /* Setup context pointer. */ + nvkm_kmap(inst); + nvkm_wo32(inst, 0x0210, lower_32_bits(ctx->addr + CB_RESERVED) | 4); + nvkm_wo32(inst, 0x0214, upper_32_bits(ctx->addr + CB_RESERVED)); + nvkm_done(inst); - /* setup default state for mmio list construction */ + /* Setup default state for mmio list construction. */ info.gr = gr; info.data = gr->mmio_data; info.mmio = gr->mmio_list; - info.addr = 0x2000 + (i * 8); + info.addr = ctx->addr; info.buffer_nr = 0; - /* make channel current */ + /* Make channel current. */ + addr = nvkm_memory_addr(inst) >> 12; if (gr->firmware) { nvkm_wr32(device, 0x409840, 0x00000030); - nvkm_wr32(device, 0x409500, 0x80000000 | addr >> 12); + nvkm_wr32(device, 0x409500, 0x80000000 | addr); nvkm_wr32(device, 0x409504, 0x00000003); nvkm_msec(device, 2000, if (nvkm_rd32(device, 0x409800) & 0x00000010) break; ); - nvkm_kmap(chan); - nvkm_wo32(chan, 0x8001c, 1); - nvkm_wo32(chan, 0x80020, 0); - nvkm_wo32(chan, 0x80028, 0); - nvkm_wo32(chan, 0x8002c, 0); - nvkm_done(chan); + nvkm_kmap(data); + nvkm_wo32(data, 0x1c, 1); + nvkm_wo32(data, 0x20, 0); + nvkm_wo32(data, 0x28, 0); + nvkm_wo32(data, 0x2c, 0); + nvkm_done(data); } else { nvkm_wr32(device, 0x409840, 0x80000000); - nvkm_wr32(device, 0x409500, 0x80000000 | addr >> 12); + nvkm_wr32(device, 0x409500, 0x80000000 | addr); nvkm_wr32(device, 0x409504, 0x00000001); nvkm_msec(device, 2000, if (nvkm_rd32(device, 0x409800) & 0x80000000) @@ -1355,8 +1354,8 @@ gf100_grctx_generate(struct gf100_gr *gr) grctx->main(gr, &info); - /* trigger a context unload by unsetting the "next channel valid" bit - * and faking a context switch interrupt + /* Trigger a context unload by unsetting the "next channel valid" bit + * and faking a context switch interrupt. 
*/ nvkm_mask(device, 0x409b04, 0x80000000, 0x00000000); nvkm_wr32(device, 0x409000, 0x00000100); @@ -1370,17 +1369,21 @@ gf100_grctx_generate(struct gf100_gr *gr) gr->data = kmalloc(gr->size, GFP_KERNEL); if (gr->data) { - nvkm_kmap(chan); + nvkm_kmap(data); for (i = 0; i < gr->size; i += 4) - gr->data[i / 4] = nvkm_ro32(chan, 0x80000 + i); - nvkm_done(chan); + gr->data[i / 4] = nvkm_ro32(data, CB_RESERVED + i); + nvkm_done(data); ret = 0; } else { ret = -ENOMEM; } done: - nvkm_memory_del(&chan); + nvkm_vmm_put(vmm, &ctx); + nvkm_memory_unref(&data); + nvkm_vmm_part(vmm, inst); + nvkm_vmm_unref(&vmm); + nvkm_memory_unref(&inst); return ret; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h index 017180d147cf..4731e56fbb11 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf100.h @@ -11,7 +11,7 @@ struct gf100_grctx { u64 addr; }; -int gf100_grctx_mmio_data(struct gf100_grctx *, u32 size, u32 align, u32 access); +int gf100_grctx_mmio_data(struct gf100_grctx *, u32 size, u32 align, bool priv); void gf100_grctx_mmio_item(struct gf100_grctx *, u32 addr, u32 data, int s, int); #define mmio_vram(a,b,c,d) gf100_grctx_mmio_data((a), (b), (c), (d)) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c index 505cdcbfc085..82f71b10c06e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf108.c @@ -735,9 +735,8 @@ gf108_grctx_generate_attrib(struct gf100_grctx *info) const u32 alpha = grctx->alpha_nr; const u32 beta = grctx->attrib_nr; const u32 size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), access); + const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false); const int timeslice_mode = 1; const int max_batches = 0xffff; u32 bo = 0; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c index 74a64e3fd59a..19301d88577d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgf117.c @@ -187,9 +187,8 @@ gf117_grctx_generate_attrib(struct gf100_grctx *info) const u32 alpha = grctx->alpha_nr; const u32 beta = grctx->attrib_nr; const u32 size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), access); + const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false); const int timeslice_mode = 1; const int max_batches = 0xffff; u32 bo = 0; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c index c46b3fdf7203..825c8fd500bc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgk104.c @@ -847,9 +847,8 @@ gk104_grctx_generate_bundle(struct gf100_grctx *info) const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth, grctx->bundle_size / 0x20); const u32 token_limit = grctx->bundle_token_limit; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->bundle_size, (1 << s), access); + const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true); mmio_refn(info, 0x408004, 0x00000000, s, b); 
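The grctx hunks on either side of this point all make the same substitution: the NV_MEM_ACCESS_* mask that callers threaded through mmio_vram() collapses into a single privileged/non-privileged flag (gf100_gr_data.access becomes bool priv), which gf100.c later forwards to the VMM via gf100_vmm_map_v0. A condensed before/after sketch, using only the signatures visible in this diff; b_old/b_new are illustrative names, not code from the patch:

	/* Before: mapping rights were an NV_MEM_ACCESS_* bitmask, stored in
	 * gf100_gr_data.access and decoded when the buffer was mapped. */
	const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS;
	const int b_old = mmio_vram(info, grctx->bundle_size, (1 << s), access);

	/* After: only "privileged or not" survives. Bundle and pagepool
	 * buffers pass true; per-TPC attribute buffers pass false. */
	const int b_new = mmio_vram(info, grctx->bundle_size, (1 << s), true);
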
mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s)); mmio_refn(info, 0x418808, 0x00000000, s, b); @@ -861,9 +860,8 @@ void gk104_grctx_generate_pagepool(struct gf100_grctx *info) { const struct gf100_grctx_func *grctx = info->gr->func->grctx; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), access); + const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true); mmio_refn(info, 0x40800c, 0x00000000, s, b); mmio_wr32(info, 0x408010, 0x80000000); mmio_refn(info, 0x419004, 0x00000000, s, b); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c index 4c4b5ab6e46d..9b43d4ce3eaa 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgm107.c @@ -867,9 +867,8 @@ gm107_grctx_generate_bundle(struct gf100_grctx *info) const u32 state_limit = min(grctx->bundle_min_gpm_fifo_depth, grctx->bundle_size / 0x20); const u32 token_limit = grctx->bundle_token_limit; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->bundle_size, (1 << s), access); + const int b = mmio_vram(info, grctx->bundle_size, (1 << s), true); mmio_refn(info, 0x408004, 0x00000000, s, b); mmio_wr32(info, 0x408008, 0x80000000 | (grctx->bundle_size >> s)); mmio_refn(info, 0x418e24, 0x00000000, s, b); @@ -881,9 +880,8 @@ void gm107_grctx_generate_pagepool(struct gf100_grctx *info) { const struct gf100_grctx_func *grctx = info->gr->func->grctx; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), access); + const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true); mmio_refn(info, 0x40800c, 0x00000000, s, b); mmio_wr32(info, 0x408010, 0x80000000); mmio_refn(info, 0x419004, 0x00000000, s, b); @@ -900,9 +898,8 @@ gm107_grctx_generate_attrib(struct gf100_grctx *info) const u32 alpha = grctx->alpha_nr; const u32 attrib = grctx->attrib_nr; const u32 size = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), access); + const int b = mmio_vram(info, size * gr->tpc_total, (1 << s), false); const int max_batches = 0xffff; u32 bo = 0; u32 ao = bo + grctx->attrib_nr_max * gr->tpc_total; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c index 7833bc777a29..88ea322d956c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp100.c @@ -33,9 +33,8 @@ void gp100_grctx_generate_pagepool(struct gf100_grctx *info) { const struct gf100_grctx_func *grctx = info->gr->func->grctx; - const u32 access = NV_MEM_ACCESS_RW | NV_MEM_ACCESS_SYS; const int s = 8; - const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), access); + const int b = mmio_vram(info, grctx->pagepool_size, (1 << s), true); mmio_refn(info, 0x40800c, 0x00000000, s, b); mmio_wr32(info, 0x408010, 0x80000000); mmio_refn(info, 0x419004, 0x00000000, s, b); @@ -51,9 +50,8 @@ gp100_grctx_generate_attrib(struct gf100_grctx *info) const u32 attrib = grctx->attrib_nr; const u32 pertpc = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); const u32 size = roundup(gr->tpc_total * pertpc, 0x80); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, 
size, (1 << s), access); + const int b = mmio_vram(info, size, (1 << s), false); const int max_batches = 0xffff; u32 ao = 0; u32 bo = ao + grctx->alpha_nr_max * gr->tpc_total; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c index 80b7ab0bee3a..7a66b4c2eb18 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/ctxgp102.c @@ -38,9 +38,8 @@ gp102_grctx_generate_attrib(struct gf100_grctx *info) const u32 attrib = grctx->attrib_nr; const u32 pertpc = 0x20 * (grctx->attrib_nr_max + grctx->alpha_nr_max); const u32 size = roundup(gr->tpc_total * pertpc, 0x80); - const u32 access = NV_MEM_ACCESS_RW; const int s = 12; - const int b = mmio_vram(info, size, (1 << s), access); + const int b = mmio_vram(info, size, (1 << s), false); const int max_batches = 0xffff; u32 ao = 0; u32 bo = ao + grctx->alpha_nr_max * gr->tpc_total; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c index 99689f4de502..2f8dc107047d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c @@ -37,6 +37,7 @@ #include <nvif/class.h> #include <nvif/cl9097.h> +#include <nvif/if900d.h> #include <nvif/unpack.h> /******************************************************************************* @@ -327,13 +328,13 @@ gf100_gr_chan_bind(struct nvkm_object *object, struct nvkm_gpuobj *parent, if (!gr->firmware) { nvkm_wo32(*pgpuobj, 0x00, chan->mmio_nr / 2); - nvkm_wo32(*pgpuobj, 0x04, chan->mmio_vma.offset >> 8); + nvkm_wo32(*pgpuobj, 0x04, chan->mmio_vma->addr >> 8); } else { nvkm_wo32(*pgpuobj, 0xf4, 0); nvkm_wo32(*pgpuobj, 0xf8, 0); nvkm_wo32(*pgpuobj, 0x10, chan->mmio_nr / 2); - nvkm_wo32(*pgpuobj, 0x14, lower_32_bits(chan->mmio_vma.offset)); - nvkm_wo32(*pgpuobj, 0x18, upper_32_bits(chan->mmio_vma.offset)); + nvkm_wo32(*pgpuobj, 0x14, lower_32_bits(chan->mmio_vma->addr)); + nvkm_wo32(*pgpuobj, 0x18, upper_32_bits(chan->mmio_vma->addr)); nvkm_wo32(*pgpuobj, 0x1c, 1); nvkm_wo32(*pgpuobj, 0x20, 0); nvkm_wo32(*pgpuobj, 0x28, 0); @@ -350,18 +351,13 @@ gf100_gr_chan_dtor(struct nvkm_object *object) int i; for (i = 0; i < ARRAY_SIZE(chan->data); i++) { - if (chan->data[i].vma.node) { - nvkm_vm_unmap(&chan->data[i].vma); - nvkm_vm_put(&chan->data[i].vma); - } - nvkm_memory_del(&chan->data[i].mem); + nvkm_vmm_put(chan->vmm, &chan->data[i].vma); + nvkm_memory_unref(&chan->data[i].mem); } - if (chan->mmio_vma.node) { - nvkm_vm_unmap(&chan->mmio_vma); - nvkm_vm_put(&chan->mmio_vma); - } - nvkm_memory_del(&chan->mmio); + nvkm_vmm_put(chan->vmm, &chan->mmio_vma); + nvkm_memory_unref(&chan->mmio); + nvkm_vmm_unref(&chan->vmm); return chan; } @@ -380,6 +376,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch, struct gf100_gr_data *data = gr->mmio_data; struct gf100_gr_mmio *mmio = gr->mmio_list; struct gf100_gr_chan *chan; + struct gf100_vmm_map_v0 args = { .priv = 1 }; struct nvkm_device *device = gr->base.engine.subdev.device; int ret, i; @@ -387,6 +384,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch, return -ENOMEM; nvkm_object_ctor(&gf100_gr_chan, oclass, &chan->object); chan->gr = gr; + chan->vmm = nvkm_vmm_ref(fifoch->vmm); *pobject = &chan->object; /* allocate memory for a "mmio list" buffer that's used by the HUB @@ -398,12 +396,14 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch, if (ret) return ret; - ret = nvkm_vm_get(fifoch->vm, 0x1000, 
12, NV_MEM_ACCESS_RW | - NV_MEM_ACCESS_SYS, &chan->mmio_vma); + ret = nvkm_vmm_get(fifoch->vmm, 12, 0x1000, &chan->mmio_vma); if (ret) return ret; - nvkm_memory_map(chan->mmio, &chan->mmio_vma, 0); + ret = nvkm_memory_map(chan->mmio, 0, fifoch->vmm, + chan->mmio_vma, &args, sizeof(args)); + if (ret) + return ret; /* allocate buffers referenced by mmio list */ for (i = 0; data->size && i < ARRAY_SIZE(gr->mmio_data); i++) { @@ -413,13 +413,19 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch, if (ret) return ret; - ret = nvkm_vm_get(fifoch->vm, - nvkm_memory_size(chan->data[i].mem), 12, - data->access, &chan->data[i].vma); + ret = nvkm_vmm_get(fifoch->vmm, 12, + nvkm_memory_size(chan->data[i].mem), + &chan->data[i].vma); + if (ret) + return ret; + + args.priv = data->priv; + + ret = nvkm_memory_map(chan->data[i].mem, 0, chan->vmm, + chan->data[i].vma, &args, sizeof(args)); if (ret) return ret; - nvkm_memory_map(chan->data[i].mem, &chan->data[i].vma, 0); data++; } @@ -430,7 +436,7 @@ gf100_gr_chan_new(struct nvkm_gr *base, struct nvkm_fifo_chan *fifoch, u32 data = mmio->data; if (mmio->buffer >= 0) { - u64 info = chan->data[mmio->buffer].vma.offset; + u64 info = chan->data[mmio->buffer].vma->addr; data |= info >> mmio->shift; } @@ -1855,8 +1861,12 @@ gf100_gr_ctor_fw(struct gf100_gr *gr, const char *fwname, int ret; ret = nvkm_firmware_get(device, fwname, &fw); - if (ret) - return gf100_gr_ctor_fw_legacy(gr, fwname, fuc, ret); + if (ret) { + ret = gf100_gr_ctor_fw_legacy(gr, fwname, fuc, ret); + if (ret) + return -ENODEV; + return 0; + } fuc->size = fw->size; fuc->data = kmemdup(fw->data, fuc->size, GFP_KERNEL); @@ -1903,25 +1913,33 @@ gf100_gr_new_(const struct gf100_gr_func *func, struct nvkm_device *device, return 0; } +void +gf100_gr_init_gpc_mmu(struct gf100_gr *gr) +{ + struct nvkm_device *device = gr->base.engine.subdev.device; + struct nvkm_fb *fb = device->fb; + + nvkm_wr32(device, 0x418880, nvkm_rd32(device, 0x100c80) & 0x00000001); + nvkm_wr32(device, 0x4188a4, 0x00000000); + nvkm_wr32(device, 0x418888, 0x00000000); + nvkm_wr32(device, 0x41888c, 0x00000000); + nvkm_wr32(device, 0x418890, 0x00000000); + nvkm_wr32(device, 0x418894, 0x00000000); + nvkm_wr32(device, 0x4188b4, nvkm_memory_addr(fb->mmu_wr) >> 8); + nvkm_wr32(device, 0x4188b8, nvkm_memory_addr(fb->mmu_rd) >> 8); +} + int gf100_gr_init(struct gf100_gr *gr) { struct nvkm_device *device = gr->base.engine.subdev.device; - struct nvkm_fb *fb = device->fb; const u32 magicgpc918 = DIV_ROUND_UP(0x00800000, gr->tpc_total); u32 data[TPC_MAX / 8] = {}; u8 tpcnr[GPC_MAX]; int gpc, tpc, rop; int i; - nvkm_wr32(device, GPC_BCAST(0x0880), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x08a4), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0888), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x088c), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0890), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0894), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x08b4), nvkm_memory_addr(fb->mmu_wr) >> 8); - nvkm_wr32(device, GPC_BCAST(0x08b8), nvkm_memory_addr(fb->mmu_rd) >> 8); + gr->func->init_gpc_mmu(gr); gf100_gr_mmio(gr, gr->func->mmio); @@ -2036,6 +2054,7 @@ gf100_gr_gpccs_ucode = { static const struct gf100_gr_func gf100_gr = { .init = gf100_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf100_gr_pack_mmio, .fecs.ucode = &gf100_gr_fecs_ucode, .gpccs.ucode = &gf100_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h index a36e45a4a635..d7c2adb9b543 
100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.h @@ -45,7 +45,7 @@ struct gf100_gr_data { u32 size; u32 align; - u32 access; + bool priv; }; struct gf100_gr_mmio { @@ -156,18 +156,20 @@ int gp100_gr_init(struct gf100_gr *); void gp100_gr_init_rop_active_fbps(struct gf100_gr *); #define gf100_gr_chan(p) container_of((p), struct gf100_gr_chan, object) +#include <core/object.h> struct gf100_gr_chan { struct nvkm_object object; struct gf100_gr *gr; + struct nvkm_vmm *vmm; struct nvkm_memory *mmio; - struct nvkm_vma mmio_vma; + struct nvkm_vma *mmio_vma; int mmio_nr; struct { struct nvkm_memory *mem; - struct nvkm_vma vma; + struct nvkm_vma *vma; } data[4]; }; @@ -253,6 +255,7 @@ extern const struct gf100_gr_init gf100_gr_init_mpc_0[]; extern const struct gf100_gr_init gf100_gr_init_be_0[]; extern const struct gf100_gr_init gf100_gr_init_fe_1[]; extern const struct gf100_gr_init gf100_gr_init_pe_1[]; +void gf100_gr_init_gpc_mmu(struct gf100_gr *); extern const struct gf100_gr_init gf104_gr_init_ds_0[]; extern const struct gf100_gr_init gf104_gr_init_tex_0[]; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c index d736dcd55ea2..ec0f11983b23 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf104.c @@ -115,6 +115,7 @@ gf104_gr_pack_mmio[] = { static const struct gf100_gr_func gf104_gr = { .init = gf100_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf104_gr_pack_mmio, .fecs.ucode = &gf100_gr_fecs_ucode, .gpccs.ucode = &gf100_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c index 2f0d24498427..cc152eb74123 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf108.c @@ -106,6 +106,7 @@ gf108_gr_pack_mmio[] = { static const struct gf100_gr_func gf108_gr = { .init = gf100_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf108_gr_pack_mmio, .fecs.ucode = &gf100_gr_fecs_ucode, .gpccs.ucode = &gf100_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c index d1d942eb86af..10d2d73ca8c3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf110.c @@ -87,6 +87,7 @@ gf110_gr_pack_mmio[] = { static const struct gf100_gr_func gf110_gr = { .init = gf100_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf110_gr_pack_mmio, .fecs.ucode = &gf100_gr_fecs_ucode, .gpccs.ucode = &gf100_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c index 0124e468086e..ac09a07c4150 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf117.c @@ -123,6 +123,7 @@ gf117_gr_gpccs_ucode = { static const struct gf100_gr_func gf117_gr = { .init = gf100_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf117_gr_pack_mmio, .fecs.ucode = &gf117_gr_fecs_ucode, .gpccs.ucode = &gf117_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c index 8d8e4cafe28f..7f449ec6f760 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gf119.c @@ -178,6 +178,7 @@ gf119_gr_pack_mmio[] = { static const struct gf100_gr_func gf119_gr = { .init = gf100_gr_init, 
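The .init_gpc_mmu addition that follows, like its twins in the surrounding chipset tables, installs the same default hook. What it plugs in is the factoring from the gf100.c hunk above: the eight open-coded GPC_BCAST(0x0880..0x08b8) writes in gf100_gr_init() and gk104_gr_init() move behind a per-chipset function pointer. An abbreviated sketch of that shape (the real gf100_gr_func and init paths carry many more members and steps):

	/* New hook, per the gf100.c hunk: programs the GPC-side MMU mirror
	 * registers, including the fb->mmu_wr/mmu_rd scratch addresses. */
	void gf100_gr_init_gpc_mmu(struct gf100_gr *gr);

	int
	gf100_gr_init(struct gf100_gr *gr)	/* abbreviated sketch */
	{
		gr->func->init_gpc_mmu(gr);	/* was: 8x nvkm_wr32(GPC_BCAST(...)) */
		/* ... remainder of init unchanged ... */
		return 0;
	}
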
+ .init_gpc_mmu = gf100_gr_init_gpc_mmu, .mmio = gf119_gr_pack_mmio, .fecs.ucode = &gf100_gr_fecs_ucode, .gpccs.ucode = &gf100_gr_gpccs_ucode, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c index ec22da6c99fc..5e82f94c2245 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk104.c @@ -24,8 +24,6 @@ #include "gf100.h" #include "ctxgf100.h" -#include <subdev/fb.h> - #include <nvif/class.h> /******************************************************************************* @@ -207,21 +205,13 @@ int gk104_gr_init(struct gf100_gr *gr) { struct nvkm_device *device = gr->base.engine.subdev.device; - struct nvkm_fb *fb = device->fb; const u32 magicgpc918 = DIV_ROUND_UP(0x00800000, gr->tpc_total); u32 data[TPC_MAX / 8] = {}; u8 tpcnr[GPC_MAX]; int gpc, tpc, rop; int i; - nvkm_wr32(device, GPC_BCAST(0x0880), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x08a4), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0888), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x088c), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0890), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x0894), 0x00000000); - nvkm_wr32(device, GPC_BCAST(0x08b4), nvkm_memory_addr(fb->mmu_wr) >> 8); - nvkm_wr32(device, GPC_BCAST(0x08b8), nvkm_memory_addr(fb->mmu_rd) >> 8); + gr->func->init_gpc_mmu(gr); gf100_gr_mmio(gr, gr->func->mmio); @@ -339,6 +329,7 @@ gk104_gr_gpccs_ucode = { static const struct gf100_gr_func gk104_gr = { .init = gk104_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .init_rop_active_fbps = gk104_gr_init_rop_active_fbps, .init_ppc_exceptions = gk104_gr_init_ppc_exceptions, .mmio = gk104_gr_pack_mmio, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c index f31b171a4102..a38e19b61c1d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110.c @@ -183,6 +183,7 @@ gk110_gr_gpccs_ucode = { static const struct gf100_gr_func gk110_gr = { .init = gk104_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .init_rop_active_fbps = gk104_gr_init_rop_active_fbps, .init_ppc_exceptions = gk104_gr_init_ppc_exceptions, .mmio = gk110_gr_pack_mmio, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c index d76dd178007f..1912c0bfd7ee 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk110b.c @@ -103,6 +103,7 @@ gk110b_gr_pack_mmio[] = { static const struct gf100_gr_func gk110b_gr = { .init = gk104_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .init_rop_active_fbps = gk104_gr_init_rop_active_fbps, .init_ppc_exceptions = gk104_gr_init_ppc_exceptions, .mmio = gk110b_gr_pack_mmio, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c index 14bbe6ed02a9..1fc258163f25 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk208.c @@ -162,6 +162,7 @@ gk208_gr_gpccs_ucode = { static const struct gf100_gr_func gk208_gr = { .init = gk104_gr_init, + .init_gpc_mmu = gf100_gr_init_gpc_mmu, .init_rop_active_fbps = gk104_gr_init_rop_active_fbps, .init_ppc_exceptions = gk104_gr_init_ppc_exceptions, .mmio = gk208_gr_pack_mmio, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c index d1dc92999dc0..d6840dc81a29 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c +++ 
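/*
 * The hunks above convert every Fermi/Kepler variant to a per-chipset
 * .init_gpc_mmu hook, and gk104_gr_init() calls gr->func->init_gpc_mmu(gr)
 * in place of its open-coded GPC broadcast writes.  A sketch of the shared
 * helper, assuming it performs the same sequence the gk104 hunk removes
 * (the actual gf100_gr_init_gpc_mmu() body is outside this diff):
 */
void
gf100_gr_init_gpc_mmu(struct gf100_gr *gr)
{
	struct nvkm_device *device = gr->base.engine.subdev.device;
	struct nvkm_fb *fb = device->fb;

	/* Clear broadcast GPC MMU state... */
	nvkm_wr32(device, GPC_BCAST(0x0880), 0x00000000);
	nvkm_wr32(device, GPC_BCAST(0x08a4), 0x00000000);
	nvkm_wr32(device, GPC_BCAST(0x0888), 0x00000000);
	nvkm_wr32(device, GPC_BCAST(0x088c), 0x00000000);
	nvkm_wr32(device, GPC_BCAST(0x0890), 0x00000000);
	nvkm_wr32(device, GPC_BCAST(0x0894), 0x00000000);
	/* ...then point it at the MMU write/read fault buffers. */
	nvkm_wr32(device, GPC_BCAST(0x08b4), nvkm_memory_addr(fb->mmu_wr) >> 8);
	nvkm_wr32(device, GPC_BCAST(0x08b8), nvkm_memory_addr(fb->mmu_rd) >> 8);
}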
b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.c @@ -59,7 +59,7 @@ void * nv20_gr_chan_dtor(struct nvkm_object *object) { struct nv20_gr_chan *chan = nv20_gr_chan(object); - nvkm_memory_del(&chan->inst); + nvkm_memory_unref(&chan->inst); return chan; } @@ -323,7 +323,7 @@ void * nv20_gr_dtor(struct nvkm_gr *base) { struct nv20_gr *gr = nv20_gr(base); - nvkm_memory_del(&gr->ctxtab); + nvkm_memory_unref(&gr->ctxtab); return gr; } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.h index cdf4501e3798..d0cb2b8846ec 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv20.h @@ -19,6 +19,7 @@ void nv20_gr_tile(struct nvkm_gr *, int, struct nvkm_fb_tile *); int nv30_gr_init(struct nvkm_gr *); #define nv20_gr_chan(p) container_of((p), struct nv20_gr_chan, object) +#include <core/object.h> struct nv20_gr_chan { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.h index 2812ed11f877..bee8ef2d5697 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv40.h @@ -16,6 +16,7 @@ void nv40_gr_intr(struct nvkm_gr *); u64 nv40_gr_units(struct nvkm_gr *); #define nv40_gr_chan(p) container_of((p), struct nv40_gr_chan, object) +#include <core/object.h> struct nv40_gr_chan { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.h b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.h index 45eec83a5969..1ab6ea436b70 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/nv50.h @@ -19,6 +19,7 @@ u64 nv50_gr_units(struct nvkm_gr *); int g84_gr_tlb_flush(struct nvkm_gr *); #define nv50_gr_chan(p) container_of((p), struct nv50_gr_chan, object) +#include <core/object.h> struct nv50_gr_chan { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.h b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.h index d3bb34fcdebf..f0d35beb58df 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.h @@ -18,6 +18,7 @@ struct nv31_mpeg_func { }; #define nv31_mpeg_chan(p) container_of((p), struct nv31_mpeg_chan, object) +#include <core/object.h> struct nv31_mpeg_chan { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv50.c index 4e528851e9c0..6df880a39019 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv50.c @@ -24,6 +24,7 @@ #include "priv.h" #include <core/gpuobj.h> +#include <core/object.h> #include <subdev/timer.h> #include <nvif/class.h> diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/pm/priv.h b/drivers/gpu/drm/nouveau/nvkm/engine/pm/priv.h index d7b81cbf82b5..4ff0475e776c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/pm/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/pm/priv.h @@ -67,6 +67,7 @@ struct nvkm_specdom { }; #define nvkm_perfdom(p) container_of((p), struct nvkm_perfdom, object) +#include <core/object.h> struct nvkm_perfdom { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/sw/chan.h b/drivers/gpu/drm/nouveau/nvkm/engine/sw/chan.h index 6608bf6c6842..b5be49f0ac56 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/sw/chan.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/sw/chan.h @@ -1,9 +1,11 @@ #ifndef __NVKM_SW_CHAN_H__ #define __NVKM_SW_CHAN_H__ #define 
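/*
 * nvkm_memory_del() becomes nvkm_memory_unref() throughout these hunks:
 * memory objects are now reference-counted, so callers drop their reference
 * and clear the pointer rather than destroying the object outright.  A
 * minimal sketch of the usual shape of such a helper; the field and
 * release-function names here are hypothetical, not taken from nvkm:
 */
#include <linux/kref.h>

void
nvkm_memory_unref(struct nvkm_memory **pmemory)
{
	struct nvkm_memory *memory = *pmemory;
	if (memory) {
		/* hypothetical: the final put invokes the real destructor */
		kref_put(&memory->kref, nvkm_memory_release);
		*pmemory = NULL;
	}
}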
nvkm_sw_chan(p) container_of((p), struct nvkm_sw_chan, object) -#include "priv.h" +#include <core/object.h> #include <core/event.h> +#include "priv.h" + struct nvkm_sw_chan { const struct nvkm_sw_chan_func *func; struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/sw/nvsw.h b/drivers/gpu/drm/nouveau/nvkm/engine/sw/nvsw.h index 943ef4c10091..bcfff62131fe 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/sw/nvsw.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/sw/nvsw.h @@ -1,7 +1,7 @@ #ifndef __NVKM_NVSW_H__ #define __NVKM_NVSW_H__ #define nvkm_nvsw(p) container_of((p), struct nvkm_nvsw, object) -#include "priv.h" +#include <core/object.h> struct nvkm_nvsw { struct nvkm_object object; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/xtensa.c b/drivers/gpu/drm/nouveau/nvkm/engine/xtensa.c index 06bdb67a0205..70549381e082 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/xtensa.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/xtensa.c @@ -86,7 +86,7 @@ nvkm_xtensa_fini(struct nvkm_engine *engine, bool suspend) nvkm_wr32(device, base + 0xd94, 0); /* FIFO_CTRL */ if (!suspend) - nvkm_memory_del(&xtensa->gpu_fw); + nvkm_memory_unref(&xtensa->gpu_fw); return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/base.c b/drivers/gpu/drm/nouveau/nvkm/falcon/base.c index 1b7f48efd8b1..14be41f24155 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/base.c @@ -60,7 +60,7 @@ nvkm_falcon_read_dmem(struct nvkm_falcon *falcon, u32 start, u32 size, u8 port, } void -nvkm_falcon_bind_context(struct nvkm_falcon *falcon, struct nvkm_gpuobj *inst) +nvkm_falcon_bind_context(struct nvkm_falcon *falcon, struct nvkm_memory *inst) { if (!falcon->func->bind_context) { nvkm_error(falcon->user, diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/v1.c b/drivers/gpu/drm/nouveau/nvkm/falcon/v1.c index 669c24028470..9def926f24d4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/v1.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/v1.c @@ -180,7 +180,7 @@ nvkm_falcon_v1_read_dmem(struct nvkm_falcon *falcon, u32 start, u32 size, } static void -nvkm_falcon_v1_bind_context(struct nvkm_falcon *falcon, struct nvkm_gpuobj *ctx) +nvkm_falcon_v1_bind_context(struct nvkm_falcon *falcon, struct nvkm_memory *ctx) { u32 inst_loc; u32 fbif; @@ -216,7 +216,7 @@ nvkm_falcon_v1_bind_context(struct nvkm_falcon *falcon, struct nvkm_gpuobj *ctx) nvkm_falcon_wr32(falcon, fbif + 4 * FALCON_DMAIDX_PHYS_SYS_NCOH, 0x6); /* Set context */ - switch (nvkm_memory_target(ctx->memory)) { + switch (nvkm_memory_target(ctx)) { case NVKM_MEM_TARGET_VRAM: inst_loc = 0; break; case NVKM_MEM_TARGET_HOST: inst_loc = 2; break; case NVKM_MEM_TARGET_NCOH: inst_loc = 3; break; @@ -228,7 +228,7 @@ nvkm_falcon_v1_bind_context(struct nvkm_falcon *falcon, struct nvkm_gpuobj *ctx) /* Enable context */ nvkm_falcon_mask(falcon, 0x048, 0x1, 0x1); nvkm_falcon_wr32(falcon, 0x054, - ((ctx->addr >> 12) & 0xfffffff) | + ((nvkm_memory_addr(ctx) >> 12) & 0xfffffff) | (inst_loc << 28) | (1 << 30)); nvkm_falcon_mask(falcon, 0x090, 0x10000, 0x10000); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/Kbuild index 1e138b337955..e5830453813d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/Kbuild @@ -3,3 +3,5 @@ nvkm-y += nvkm/subdev/bar/nv50.o nvkm-y += nvkm/subdev/bar/g84.o nvkm-y += nvkm/subdev/bar/gf100.o nvkm-y += nvkm/subdev/bar/gk20a.o +nvkm-y += nvkm/subdev/bar/gm107.o +nvkm-y += nvkm/subdev/bar/gm20b.o diff --git 
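/*
 * nvkm_falcon_bind_context() now takes a bare nvkm_memory object instead of
 * an nvkm_gpuobj wrapper, so the v1 code above queries placement with
 * nvkm_memory_target() and the address with nvkm_memory_addr().  The
 * target-to-instance-location mapping it programs, restated as a small
 * helper (illustrative only, not part of the patch):
 */
static int
falcon_v1_inst_loc(struct nvkm_memory *ctx)
{
	switch (nvkm_memory_target(ctx)) {
	case NVKM_MEM_TARGET_VRAM: return 0; /* instance block in VRAM */
	case NVKM_MEM_TARGET_HOST: return 2; /* coherent system memory */
	case NVKM_MEM_TARGET_NCOH: return 3; /* non-coherent system memory */
	default:                   return -EINVAL;
	}
}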
a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c index c561d148cebc..9646adec57cb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/base.c @@ -30,39 +30,76 @@ nvkm_bar_flush(struct nvkm_bar *bar) bar->func->flush(bar); } -struct nvkm_vm * -nvkm_bar_kmap(struct nvkm_bar *bar) +struct nvkm_vmm * +nvkm_bar_bar1_vmm(struct nvkm_device *device) { - /* disallow kmap() until after vm has been bootstrapped */ - if (bar && bar->func->kmap && bar->subdev.oneinit) - return bar->func->kmap(bar); + return device->bar->func->bar1.vmm(device->bar); +} + +struct nvkm_vmm * +nvkm_bar_bar2_vmm(struct nvkm_device *device) +{ + /* Denies access to BAR2 when it's not initialised, used by INSTMEM + * to know when object access needs to go through the BAR0 window. + */ + struct nvkm_bar *bar = device->bar; + if (bar && bar->bar2) + return bar->func->bar2.vmm(bar); return NULL; } -int -nvkm_bar_umap(struct nvkm_bar *bar, u64 size, int type, struct nvkm_vma *vma) +void +nvkm_bar_bar2_fini(struct nvkm_device *device) { - return bar->func->umap(bar, size, type, vma); + struct nvkm_bar *bar = device->bar; + if (bar && bar->bar2) { + bar->func->bar2.fini(bar); + bar->bar2 = false; + } +} + +void +nvkm_bar_bar2_init(struct nvkm_device *device) +{ + struct nvkm_bar *bar = device->bar; + if (bar && bar->subdev.oneinit && !bar->bar2 && bar->func->bar2.init) { + bar->func->bar2.init(bar); + bar->func->bar2.wait(bar); + bar->bar2 = true; + } } static int -nvkm_bar_oneinit(struct nvkm_subdev *subdev) +nvkm_bar_fini(struct nvkm_subdev *subdev, bool suspend) { struct nvkm_bar *bar = nvkm_bar(subdev); - return bar->func->oneinit(bar); + bar->func->bar1.fini(bar); + return 0; } static int nvkm_bar_init(struct nvkm_subdev *subdev) { struct nvkm_bar *bar = nvkm_bar(subdev); - return bar->func->init(bar); + bar->func->bar1.init(bar); + bar->func->bar1.wait(bar); + if (bar->func->init) + bar->func->init(bar); + return 0; +} + +static int +nvkm_bar_oneinit(struct nvkm_subdev *subdev) +{ + struct nvkm_bar *bar = nvkm_bar(subdev); + return bar->func->oneinit(bar); } static void * nvkm_bar_dtor(struct nvkm_subdev *subdev) { struct nvkm_bar *bar = nvkm_bar(subdev); + nvkm_bar_bar2_fini(subdev->device); return bar->func->dtor(bar); } @@ -71,6 +108,7 @@ nvkm_bar = { .dtor = nvkm_bar_dtor, .oneinit = nvkm_bar_oneinit, .init = nvkm_bar_init, + .fini = nvkm_bar_fini, }; void diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/g84.c index ef717136c838..87f26f54b481 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/g84.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/g84.c @@ -44,8 +44,14 @@ g84_bar_func = { .dtor = nv50_bar_dtor, .oneinit = nv50_bar_oneinit, .init = nv50_bar_init, - .kmap = nv50_bar_kmap, - .umap = nv50_bar_umap, + .bar1.init = nv50_bar_bar1_init, + .bar1.fini = nv50_bar_bar1_fini, + .bar1.wait = nv50_bar_bar1_wait, + .bar1.vmm = nv50_bar_bar1_vmm, + .bar2.init = nv50_bar_bar2_init, + .bar2.fini = nv50_bar_bar2_fini, + .bar2.wait = nv50_bar_bar1_wait, + .bar2.vmm = nv50_bar_bar2_vmm, .flush = g84_bar_flush, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.c index 676c167c95b9..a3ba7f50198b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.c @@ -23,39 +23,73 @@ */ #include "gf100.h" -#include <core/gpuobj.h> +#include <core/memory.h> #include 
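/*
 * The base.c rework above replaces kmap/umap with per-BAR init/fini/wait/vmm
 * hooks.  BAR2 is brought up on demand via nvkm_bar_bar2_init() once the
 * subdev has completed oneinit, and nvkm_bar_bar2_vmm() returns NULL before
 * then.  A sketch of how a client such as INSTMEM is expected to probe for
 * it (illustrative; the real instmem code is not part of this hunk):
 */
static bool
can_use_bar2(struct nvkm_device *device)
{
	/* NULL means BAR2 isn't up yet: fall back to the BAR0 window. */
	return nvkm_bar_bar2_vmm(device) != NULL;
}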
<core/option.h> #include <subdev/fb.h> #include <subdev/mmu.h> -static struct nvkm_vm * -gf100_bar_kmap(struct nvkm_bar *base) +struct nvkm_vmm * +gf100_bar_bar1_vmm(struct nvkm_bar *base) { - return gf100_bar(base)->bar[0].vm; + return gf100_bar(base)->bar[1].vmm; } -int -gf100_bar_umap(struct nvkm_bar *base, u64 size, int type, struct nvkm_vma *vma) +void +gf100_bar_bar1_wait(struct nvkm_bar *base) +{ + /* NFI why it's twice. */ + nvkm_bar_flush(base); + nvkm_bar_flush(base); +} + +void +gf100_bar_bar1_fini(struct nvkm_bar *bar) { + nvkm_mask(bar->subdev.device, 0x001704, 0x80000000, 0x00000000); +} + +void +gf100_bar_bar1_init(struct nvkm_bar *base) +{ + struct nvkm_device *device = base->subdev.device; struct gf100_bar *bar = gf100_bar(base); - return nvkm_vm_get(bar->bar[1].vm, size, type, NV_MEM_ACCESS_RW, vma); + const u32 addr = nvkm_memory_addr(bar->bar[1].inst) >> 12; + nvkm_wr32(device, 0x001704, 0x80000000 | addr); +} + +struct nvkm_vmm * +gf100_bar_bar2_vmm(struct nvkm_bar *base) +{ + return gf100_bar(base)->bar[0].vmm; +} + +void +gf100_bar_bar2_fini(struct nvkm_bar *bar) +{ + nvkm_mask(bar->subdev.device, 0x001714, 0x80000000, 0x00000000); +} + +void +gf100_bar_bar2_init(struct nvkm_bar *base) +{ + struct nvkm_device *device = base->subdev.device; + struct gf100_bar *bar = gf100_bar(base); + u32 addr = nvkm_memory_addr(bar->bar[0].inst) >> 12; + if (bar->bar2_halve) + addr |= 0x40000000; + nvkm_wr32(device, 0x001714, 0x80000000 | addr); } static int -gf100_bar_ctor_vm(struct gf100_bar *bar, struct gf100_bar_vm *bar_vm, - struct lock_class_key *key, int bar_nr) +gf100_bar_oneinit_bar(struct gf100_bar *bar, struct gf100_barN *bar_vm, + struct lock_class_key *key, int bar_nr) { struct nvkm_device *device = bar->base.subdev.device; - struct nvkm_vm *vm; resource_size_t bar_len; int ret; ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0, false, - &bar_vm->mem); - if (ret) - return ret; - - ret = nvkm_gpuobj_new(device, 0x8000, 0, false, NULL, &bar_vm->pgd); + &bar_vm->inst); if (ret) return ret; @@ -63,98 +97,64 @@ gf100_bar_ctor_vm(struct gf100_bar *bar, struct gf100_bar_vm *bar_vm, if (bar_nr == 3 && bar->bar2_halve) bar_len >>= 1; - ret = nvkm_vm_new(device, 0, bar_len, 0, key, &vm); + ret = nvkm_vmm_new(device, 0, bar_len, NULL, 0, key, + (bar_nr == 3) ? "bar2" : "bar1", &bar_vm->vmm); if (ret) return ret; - atomic_inc(&vm->engref[NVKM_SUBDEV_BAR]); + atomic_inc(&bar_vm->vmm->engref[NVKM_SUBDEV_BAR]); + bar_vm->vmm->debug = bar->base.subdev.debug; /* * Bootstrap page table lookup. 
*/ if (bar_nr == 3) { - ret = nvkm_vm_boot(vm, bar_len); - if (ret) { - nvkm_vm_ref(NULL, &vm, NULL); + ret = nvkm_vmm_boot(bar_vm->vmm); + if (ret) return ret; - } } - ret = nvkm_vm_ref(vm, &bar_vm->vm, bar_vm->pgd); - nvkm_vm_ref(NULL, &vm, NULL); - if (ret) - return ret; - - nvkm_kmap(bar_vm->mem); - nvkm_wo32(bar_vm->mem, 0x0200, lower_32_bits(bar_vm->pgd->addr)); - nvkm_wo32(bar_vm->mem, 0x0204, upper_32_bits(bar_vm->pgd->addr)); - nvkm_wo32(bar_vm->mem, 0x0208, lower_32_bits(bar_len - 1)); - nvkm_wo32(bar_vm->mem, 0x020c, upper_32_bits(bar_len - 1)); - nvkm_done(bar_vm->mem); - return 0; + return nvkm_vmm_join(bar_vm->vmm, bar_vm->inst); } int gf100_bar_oneinit(struct nvkm_bar *base) { static struct lock_class_key bar1_lock; - static struct lock_class_key bar3_lock; + static struct lock_class_key bar2_lock; struct gf100_bar *bar = gf100_bar(base); int ret; - /* BAR3 */ - if (bar->base.func->kmap) { - ret = gf100_bar_ctor_vm(bar, &bar->bar[0], &bar3_lock, 3); + /* BAR2 */ + if (bar->base.func->bar2.init) { + ret = gf100_bar_oneinit_bar(bar, &bar->bar[0], &bar2_lock, 3); if (ret) return ret; + + bar->base.subdev.oneinit = true; + nvkm_bar_bar2_init(bar->base.subdev.device); } /* BAR1 */ - ret = gf100_bar_ctor_vm(bar, &bar->bar[1], &bar1_lock, 1); + ret = gf100_bar_oneinit_bar(bar, &bar->bar[1], &bar1_lock, 1); if (ret) return ret; return 0; } -int -gf100_bar_init(struct nvkm_bar *base) -{ - struct gf100_bar *bar = gf100_bar(base); - struct nvkm_device *device = bar->base.subdev.device; - u32 addr; - - nvkm_mask(device, 0x000200, 0x00000100, 0x00000000); - nvkm_mask(device, 0x000200, 0x00000100, 0x00000100); - - addr = nvkm_memory_addr(bar->bar[1].mem) >> 12; - nvkm_wr32(device, 0x001704, 0x80000000 | addr); - - if (bar->bar[0].mem) { - addr = nvkm_memory_addr(bar->bar[0].mem) >> 12; - if (bar->bar2_halve) - addr |= 0x40000000; - nvkm_wr32(device, 0x001714, 0x80000000 | addr); - } - - return 0; -} - void * gf100_bar_dtor(struct nvkm_bar *base) { struct gf100_bar *bar = gf100_bar(base); - nvkm_vm_ref(NULL, &bar->bar[1].vm, bar->bar[1].pgd); - nvkm_gpuobj_del(&bar->bar[1].pgd); - nvkm_memory_del(&bar->bar[1].mem); + nvkm_vmm_part(bar->bar[1].vmm, bar->bar[1].inst); + nvkm_vmm_unref(&bar->bar[1].vmm); + nvkm_memory_unref(&bar->bar[1].inst); - if (bar->bar[0].vm) { - nvkm_memory_del(&bar->bar[0].vm->pgt[0].mem[0]); - nvkm_vm_ref(NULL, &bar->bar[0].vm, bar->bar[0].pgd); - } - nvkm_gpuobj_del(&bar->bar[0].pgd); - nvkm_memory_del(&bar->bar[0].mem); + nvkm_vmm_part(bar->bar[0].vmm, bar->bar[0].inst); + nvkm_vmm_unref(&bar->bar[0].vmm); + nvkm_memory_unref(&bar->bar[0].inst); return bar; } @@ -175,9 +175,14 @@ static const struct nvkm_bar_func gf100_bar_func = { .dtor = gf100_bar_dtor, .oneinit = gf100_bar_oneinit, - .init = gf100_bar_init, - .kmap = gf100_bar_kmap, - .umap = gf100_bar_umap, + .bar1.init = gf100_bar_bar1_init, + .bar1.fini = gf100_bar_bar1_fini, + .bar1.wait = gf100_bar_bar1_wait, + .bar1.vmm = gf100_bar_bar1_vmm, + .bar2.init = gf100_bar_bar2_init, + .bar2.fini = gf100_bar_bar2_fini, + .bar2.wait = gf100_bar_bar1_wait, + .bar2.vmm = gf100_bar_bar2_vmm, .flush = g84_bar_flush, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.h b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.h index 20a5255362ba..e4da39139e95 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gf100.h @@ -3,22 +3,24 @@ #define gf100_bar(p) container_of((p), struct gf100_bar, base) #include "priv.h" -struct gf100_bar_vm { - struct nvkm_memory 
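/*
 * gf100_bar_oneinit_bar() above captures the shared per-BAR bring-up:
 * allocate a 0x1000-byte instance block, create a VMM spanning the BAR's
 * PCI resource (halving BAR2 if requested), boot the BAR2 page tables so
 * they are writable before BAR2 itself is live, then nvkm_vmm_join() the
 * VMM to the instance block in place of the old hand-written pgd/limit
 * words.  The enable registers then take just the instance address, as in
 * this sketch distilled from the bar1/bar2 init functions above:
 */
static void
bar_enable(struct nvkm_device *device, u32 reg, struct nvkm_memory *inst,
	   bool halve)
{
	u32 addr = nvkm_memory_addr(inst) >> 12;	/* 4KiB units */
	if (halve)
		addr |= 0x40000000;			/* halved BAR2 aperture */
	nvkm_wr32(device, reg, 0x80000000 | addr);	/* bit 31 enables */
}
/* On gf100, reg 0x001704 selects BAR1 and 0x001714 selects BAR2. */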
*mem; - struct nvkm_gpuobj *pgd; - struct nvkm_vm *vm; +struct gf100_barN { + struct nvkm_memory *inst; + struct nvkm_vmm *vmm; }; struct gf100_bar { struct nvkm_bar base; bool bar2_halve; - struct gf100_bar_vm bar[2]; + struct gf100_barN bar[2]; }; int gf100_bar_new_(const struct nvkm_bar_func *, struct nvkm_device *, int, struct nvkm_bar **); void *gf100_bar_dtor(struct nvkm_bar *); int gf100_bar_oneinit(struct nvkm_bar *); -int gf100_bar_init(struct nvkm_bar *); -int gf100_bar_umap(struct nvkm_bar *, u64, int, struct nvkm_vma *); +void gf100_bar_bar1_init(struct nvkm_bar *); +void gf100_bar_bar1_wait(struct nvkm_bar *); +struct nvkm_vmm *gf100_bar_bar1_vmm(struct nvkm_bar *); +void gf100_bar_bar2_init(struct nvkm_bar *); +struct nvkm_vmm *gf100_bar_bar2_vmm(struct nvkm_bar *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c index 9232fab4274c..b10077d38839 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gk20a.c @@ -25,8 +25,10 @@ static const struct nvkm_bar_func gk20a_bar_func = { .dtor = gf100_bar_dtor, .oneinit = gf100_bar_oneinit, - .init = gf100_bar_init, - .umap = gf100_bar_umap, + .bar1.init = gf100_bar_bar1_init, + .bar1.fini = gf100_bar_bar1_fini, + .bar1.wait = gf100_bar_bar1_wait, + .bar1.vmm = gf100_bar_bar1_vmm, .flush = g84_bar_flush, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm107.c new file mode 100644 index 000000000000..3ddf9222d935 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm107.c @@ -0,0 +1,65 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "gf100.h" + +#include <subdev/timer.h> + +void +gm107_bar_bar1_wait(struct nvkm_bar *bar) +{ + struct nvkm_device *device = bar->subdev.device; + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, 0x001710) & 0x00000003)) + break; + ); +} + +static void +gm107_bar_bar2_wait(struct nvkm_bar *bar) +{ + struct nvkm_device *device = bar->subdev.device; + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, 0x001710) & 0x0000000c)) + break; + ); +} + +static const struct nvkm_bar_func +gm107_bar_func = { + .dtor = gf100_bar_dtor, + .oneinit = gf100_bar_oneinit, + .bar1.init = gf100_bar_bar1_init, + .bar1.fini = gf100_bar_bar1_fini, + .bar1.wait = gm107_bar_bar1_wait, + .bar1.vmm = gf100_bar_bar1_vmm, + .bar2.init = gf100_bar_bar2_init, + .bar2.fini = gf100_bar_bar2_fini, + .bar2.wait = gm107_bar_bar2_wait, + .bar2.vmm = gf100_bar_bar2_vmm, + .flush = g84_bar_flush, +}; + +int +gm107_bar_new(struct nvkm_device *device, int index, struct nvkm_bar **pbar) +{ + return gf100_bar_new_(&gm107_bar_func, device, index, pbar); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm20b.c new file mode 100644 index 000000000000..950bff1955ad --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/gm20b.c @@ -0,0 +1,42 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
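/*
 * GM107 gains real completion status: instead of gf100's double flush, the
 * wait hooks above poll 0x001710 until the BAR1 (0x3) or BAR2 (0xc) busy
 * bits clear.  nvkm_msec() runs its body repeatedly for up to the given
 * number of milliseconds and evaluates negative on timeout (the removed
 * nv50_bar_init() further below uses the same pattern), so a caller that
 * must report failure can write:
 */
static int
bar_wait_idle(struct nvkm_device *device, u32 mask)
{
	if (nvkm_msec(device, 2000,
		if (!(nvkm_rd32(device, 0x001710) & mask))
			break;
	) < 0)
		return -EBUSY;	/* still busy after the 2s poll */
	return 0;
}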
+ */ +#include "gf100.h" + +static const struct nvkm_bar_func +gm20b_bar_func = { + .dtor = gf100_bar_dtor, + .oneinit = gf100_bar_oneinit, + .bar1.init = gf100_bar_bar1_init, + .bar1.fini = gf100_bar_bar1_fini, + .bar1.wait = gm107_bar_bar1_wait, + .bar1.vmm = gf100_bar_bar1_vmm, + .flush = g84_bar_flush, +}; + +int +gm20b_bar_new(struct nvkm_device *device, int index, struct nvkm_bar **pbar) +{ + int ret = gf100_bar_new_(&gm20b_bar_func, device, index, pbar); + if (ret == 0) + (*pbar)->iomap_uncached = true; + return ret; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.c index 6eff637ac301..157b076a1272 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.c @@ -28,19 +28,6 @@ #include <subdev/mmu.h> #include <subdev/timer.h> -struct nvkm_vm * -nv50_bar_kmap(struct nvkm_bar *base) -{ - return nv50_bar(base)->bar3_vm; -} - -int -nv50_bar_umap(struct nvkm_bar *base, u64 size, int type, struct nvkm_vma *vma) -{ - struct nv50_bar *bar = nv50_bar(base); - return nvkm_vm_get(bar->bar1_vm, size, type, NV_MEM_ACCESS_RW, vma); -} - static void nv50_bar_flush(struct nvkm_bar *base) { @@ -56,14 +43,72 @@ nv50_bar_flush(struct nvkm_bar *base) spin_unlock_irqrestore(&bar->base.lock, flags); } +struct nvkm_vmm * +nv50_bar_bar1_vmm(struct nvkm_bar *base) +{ + return nv50_bar(base)->bar1_vmm; +} + +void +nv50_bar_bar1_wait(struct nvkm_bar *base) +{ + nvkm_bar_flush(base); +} + +void +nv50_bar_bar1_fini(struct nvkm_bar *bar) +{ + nvkm_wr32(bar->subdev.device, 0x001708, 0x00000000); +} + +void +nv50_bar_bar1_init(struct nvkm_bar *base) +{ + struct nvkm_device *device = base->subdev.device; + struct nv50_bar *bar = nv50_bar(base); + nvkm_wr32(device, 0x001708, 0x80000000 | bar->bar1->node->offset >> 4); +} + +struct nvkm_vmm * +nv50_bar_bar2_vmm(struct nvkm_bar *base) +{ + return nv50_bar(base)->bar2_vmm; +} + +void +nv50_bar_bar2_fini(struct nvkm_bar *bar) +{ + nvkm_wr32(bar->subdev.device, 0x00170c, 0x00000000); +} + +void +nv50_bar_bar2_init(struct nvkm_bar *base) +{ + struct nvkm_device *device = base->subdev.device; + struct nv50_bar *bar = nv50_bar(base); + nvkm_wr32(device, 0x001704, 0x00000000 | bar->mem->addr >> 12); + nvkm_wr32(device, 0x001704, 0x40000000 | bar->mem->addr >> 12); + nvkm_wr32(device, 0x00170c, 0x80000000 | bar->bar2->node->offset >> 4); +} + +void +nv50_bar_init(struct nvkm_bar *base) +{ + struct nv50_bar *bar = nv50_bar(base); + struct nvkm_device *device = bar->base.subdev.device; + int i; + + for (i = 0; i < 8; i++) + nvkm_wr32(device, 0x001900 + (i * 4), 0x00000000); +} + int nv50_bar_oneinit(struct nvkm_bar *base) { struct nv50_bar *bar = nv50_bar(base); struct nvkm_device *device = bar->base.subdev.device; static struct lock_class_key bar1_lock; - static struct lock_class_key bar3_lock; - struct nvkm_vm *vm; + static struct lock_class_key bar2_lock; u64 start, limit; int ret; @@ -80,51 +125,54 @@ nv50_bar_oneinit(struct nvkm_bar *base) if (ret) return ret; - /* BAR3 */ + /* BAR2 */ start = 0x0100000000ULL; limit = start + device->func->resource_size(device, 3); - ret = nvkm_vm_new(device, start, limit - start, start, &bar3_lock, &vm); + ret = nvkm_vmm_new(device, start, limit-- - start, NULL, 0, + &bar2_lock, "bar2", &bar->bar2_vmm); if (ret) return ret; - atomic_inc(&vm->engref[NVKM_SUBDEV_BAR]); + atomic_inc(&bar->bar2_vmm->engref[NVKM_SUBDEV_BAR]); + bar->bar2_vmm->debug = bar->base.subdev.debug; - ret = nvkm_vm_boot(vm, limit-- - start); + ret = 
nvkm_vmm_boot(bar->bar2_vmm); if (ret) return ret; - ret = nvkm_vm_ref(vm, &bar->bar3_vm, bar->pgd); - nvkm_vm_ref(NULL, &vm, NULL); + ret = nvkm_vmm_join(bar->bar2_vmm, bar->mem->memory); if (ret) return ret; - ret = nvkm_gpuobj_new(device, 24, 16, false, bar->mem, &bar->bar3); + ret = nvkm_gpuobj_new(device, 24, 16, false, bar->mem, &bar->bar2); if (ret) return ret; - nvkm_kmap(bar->bar3); - nvkm_wo32(bar->bar3, 0x00, 0x7fc00000); - nvkm_wo32(bar->bar3, 0x04, lower_32_bits(limit)); - nvkm_wo32(bar->bar3, 0x08, lower_32_bits(start)); - nvkm_wo32(bar->bar3, 0x0c, upper_32_bits(limit) << 24 | + nvkm_kmap(bar->bar2); + nvkm_wo32(bar->bar2, 0x00, 0x7fc00000); + nvkm_wo32(bar->bar2, 0x04, lower_32_bits(limit)); + nvkm_wo32(bar->bar2, 0x08, lower_32_bits(start)); + nvkm_wo32(bar->bar2, 0x0c, upper_32_bits(limit) << 24 | upper_32_bits(start)); - nvkm_wo32(bar->bar3, 0x10, 0x00000000); - nvkm_wo32(bar->bar3, 0x14, 0x00000000); - nvkm_done(bar->bar3); + nvkm_wo32(bar->bar2, 0x10, 0x00000000); + nvkm_wo32(bar->bar2, 0x14, 0x00000000); + nvkm_done(bar->bar2); + + bar->base.subdev.oneinit = true; + nvkm_bar_bar2_init(device); /* BAR1 */ start = 0x0000000000ULL; limit = start + device->func->resource_size(device, 1); - ret = nvkm_vm_new(device, start, limit-- - start, start, &bar1_lock, &vm); - if (ret) - return ret; + ret = nvkm_vmm_new(device, start, limit-- - start, NULL, 0, + &bar1_lock, "bar1", &bar->bar1_vmm); - atomic_inc(&vm->engref[NVKM_SUBDEV_BAR]); + atomic_inc(&bar->bar1_vmm->engref[NVKM_SUBDEV_BAR]); + bar->bar1_vmm->debug = bar->base.subdev.debug; - ret = nvkm_vm_ref(vm, &bar->bar1_vm, bar->pgd); - nvkm_vm_ref(NULL, &vm, NULL); + ret = nvkm_vmm_join(bar->bar1_vmm, bar->mem->memory); if (ret) return ret; @@ -144,45 +192,21 @@ nv50_bar_oneinit(struct nvkm_bar *base) return 0; } -int -nv50_bar_init(struct nvkm_bar *base) -{ - struct nv50_bar *bar = nv50_bar(base); - struct nvkm_device *device = bar->base.subdev.device; - int i; - - nvkm_mask(device, 0x000200, 0x00000100, 0x00000000); - nvkm_mask(device, 0x000200, 0x00000100, 0x00000100); - nvkm_wr32(device, 0x100c80, 0x00060001); - if (nvkm_msec(device, 2000, - if (!(nvkm_rd32(device, 0x100c80) & 0x00000001)) - break; - ) < 0) - return -EBUSY; - - nvkm_wr32(device, 0x001704, 0x00000000 | bar->mem->addr >> 12); - nvkm_wr32(device, 0x001704, 0x40000000 | bar->mem->addr >> 12); - nvkm_wr32(device, 0x001708, 0x80000000 | bar->bar1->node->offset >> 4); - nvkm_wr32(device, 0x00170c, 0x80000000 | bar->bar3->node->offset >> 4); - for (i = 0; i < 8; i++) - nvkm_wr32(device, 0x001900 + (i * 4), 0x00000000); - return 0; -} - void * nv50_bar_dtor(struct nvkm_bar *base) { struct nv50_bar *bar = nv50_bar(base); - nvkm_gpuobj_del(&bar->bar1); - nvkm_vm_ref(NULL, &bar->bar1_vm, bar->pgd); - nvkm_gpuobj_del(&bar->bar3); - if (bar->bar3_vm) { - nvkm_memory_del(&bar->bar3_vm->pgt[0].mem[0]); - nvkm_vm_ref(NULL, &bar->bar3_vm, bar->pgd); + if (bar->mem) { + nvkm_gpuobj_del(&bar->bar1); + nvkm_vmm_part(bar->bar1_vmm, bar->mem->memory); + nvkm_vmm_unref(&bar->bar1_vmm); + nvkm_gpuobj_del(&bar->bar2); + nvkm_vmm_part(bar->bar2_vmm, bar->mem->memory); + nvkm_vmm_unref(&bar->bar2_vmm); + nvkm_gpuobj_del(&bar->pgd); + nvkm_gpuobj_del(&bar->pad); + nvkm_gpuobj_del(&bar->mem); } - nvkm_gpuobj_del(&bar->pgd); - nvkm_gpuobj_del(&bar->pad); - nvkm_gpuobj_del(&bar->mem); return bar; } @@ -204,8 +228,14 @@ nv50_bar_func = { .dtor = nv50_bar_dtor, .oneinit = nv50_bar_oneinit, .init = nv50_bar_init, - .kmap = nv50_bar_kmap, - .umap = nv50_bar_umap, + .bar1.init = 
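/*
 * The nvkm_wo32() stores above assemble the 24-byte DMA object through
 * which BAR2 accesses are translated on nv50.  Annotating the layout (the
 * field descriptions are inferred from the values written, not named in
 * the source):
 *
 *   0x00: 0x7fc00000                  ctxdma class/flags
 *   0x04: lower_32_bits(limit)        last addressable byte
 *   0x08: lower_32_bits(start)        first addressable byte
 *   0x0c: upper limit bits << 24 | upper start bits
 *   0x10, 0x14: 0x00000000
 */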
nv50_bar_bar1_init, + .bar1.fini = nv50_bar_bar1_fini, + .bar1.wait = nv50_bar_bar1_wait, + .bar1.vmm = nv50_bar_bar1_vmm, + .bar2.init = nv50_bar_bar2_init, + .bar2.fini = nv50_bar_bar2_fini, + .bar2.wait = nv50_bar_bar1_wait, + .bar2.vmm = nv50_bar_bar2_vmm, .flush = nv50_bar_flush, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.h b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.h index 1eb764f22a49..140b76f588b6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/nv50.h @@ -9,18 +9,20 @@ struct nv50_bar { struct nvkm_gpuobj *mem; struct nvkm_gpuobj *pad; struct nvkm_gpuobj *pgd; - struct nvkm_vm *bar1_vm; + struct nvkm_vmm *bar1_vmm; struct nvkm_gpuobj *bar1; - struct nvkm_vm *bar3_vm; - struct nvkm_gpuobj *bar3; + struct nvkm_vmm *bar2_vmm; + struct nvkm_gpuobj *bar2; }; int nv50_bar_new_(const struct nvkm_bar_func *, struct nvkm_device *, int, u32 pgd_addr, struct nvkm_bar **); void *nv50_bar_dtor(struct nvkm_bar *); int nv50_bar_oneinit(struct nvkm_bar *); -int nv50_bar_init(struct nvkm_bar *); -struct nvkm_vm *nv50_bar_kmap(struct nvkm_bar *); -int nv50_bar_umap(struct nvkm_bar *, u64, int, struct nvkm_vma *); -void nv50_bar_unmap(struct nvkm_bar *, struct nvkm_vma *); +void nv50_bar_init(struct nvkm_bar *); +void nv50_bar_bar1_init(struct nvkm_bar *); +void nv50_bar_bar1_wait(struct nvkm_bar *); +struct nvkm_vmm *nv50_bar_bar1_vmm(struct nvkm_bar *); +void nv50_bar_bar2_init(struct nvkm_bar *); +struct nvkm_vmm *nv50_bar_bar2_vmm(struct nvkm_bar *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/priv.h index d834ef20db5b..14398e2dbdf9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bar/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bar/priv.h @@ -9,11 +9,25 @@ void nvkm_bar_ctor(const struct nvkm_bar_func *, struct nvkm_device *, struct nvkm_bar_func { void *(*dtor)(struct nvkm_bar *); int (*oneinit)(struct nvkm_bar *); - int (*init)(struct nvkm_bar *); - struct nvkm_vm *(*kmap)(struct nvkm_bar *); - int (*umap)(struct nvkm_bar *, u64 size, int type, struct nvkm_vma *); + void (*init)(struct nvkm_bar *); + + struct { + void (*init)(struct nvkm_bar *); + void (*fini)(struct nvkm_bar *); + void (*wait)(struct nvkm_bar *); + struct nvkm_vmm *(*vmm)(struct nvkm_bar *); + } bar1, bar2; + void (*flush)(struct nvkm_bar *); }; +void nv50_bar_bar1_fini(struct nvkm_bar *); +void nv50_bar_bar2_fini(struct nvkm_bar *); + void g84_bar_flush(struct nvkm_bar *); + +void gf100_bar_bar1_fini(struct nvkm_bar *); +void gf100_bar_bar2_fini(struct nvkm_bar *); + +void gm107_bar_bar1_wait(struct nvkm_bar *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/iccsense.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/iccsense.c index 23caef8df17f..73e463ed55c3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/iccsense.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/iccsense.c @@ -99,7 +99,7 @@ nvbios_iccsense_parse(struct nvkm_bios *bios, struct nvbios_iccsense *iccsense) rail->extdev_id = nvbios_rd08(bios, entry + 0x1); res_start = 0x5; break; - }; + } if (nvbios_extdev_parse(bios, rail->extdev_id, &extdev)) continue; @@ -115,7 +115,7 @@ nvbios_iccsense_parse(struct nvkm_bios *bios, struct nvbios_iccsense *iccsense) default: rail->resistor_count = 0; break; - }; + } for (r = 0; r < rail->resistor_count; ++r) { rail->resistors[r].mohm = nvbios_rd08(bios, entry + res_start + r * 2); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/init.c 
b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/init.c index b58ee99f7bfc..9cc10e438b3d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/init.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/init.c @@ -36,6 +36,8 @@ #include <subdev/i2c.h> #include <subdev/vga.h> +#include <linux/kernel.h> + #define bioslog(lvl, fmt, args...) do { \ nvkm_printk(init->subdev, lvl, info, "0x%08x[%c]: "fmt, \ init->offset, init_exec(init) ? \ @@ -2271,8 +2273,6 @@ static struct nvbios_init_opcode { [0xaa] = { init_reserved }, }; -#define init_opcode_nr (sizeof(init_opcode) / sizeof(init_opcode[0])) - int nvbios_exec(struct nvbios_init *init) { @@ -2281,7 +2281,8 @@ nvbios_exec(struct nvbios_init *init) init->nested++; while (init->offset) { u8 opcode = nvbios_rd08(bios, init->offset); - if (opcode >= init_opcode_nr || !init_opcode[opcode].exec) { + if (opcode >= ARRAY_SIZE(init_opcode) || + !init_opcode[opcode].exec) { error("unknown opcode 0x%02x\n", opcode); return -EINVAL; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/timing.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/timing.c index 7e83c3985020..20ff5173cf8f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/timing.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/timing.c @@ -115,16 +115,21 @@ nvbios_timingEp(struct nvkm_bios *bios, int idx, switch (min_t(u8, *hdr, 25)) { case 25: p->timing_10_24 = nvbios_rd08(bios, data + 0x18); + /* fall through */ case 24: case 23: case 22: p->timing_10_21 = nvbios_rd08(bios, data + 0x15); + /* fall through */ case 21: p->timing_10_20 = nvbios_rd08(bios, data + 0x14); + /* fall through */ case 20: p->timing_10_CWL = nvbios_rd08(bios, data + 0x13); + /* fall through */ case 19: p->timing_10_18 = nvbios_rd08(bios, data + 0x12); + /* fall through */ case 18: case 17: p->timing_10_16 = nvbios_rd08(bios, data + 0x10); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/nv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/nv04.c index 158977f8a6e6..c3dae05348eb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/nv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/nv04.c @@ -119,11 +119,11 @@ powerctrl_1_shift(int chip_version, int reg) switch (reg) { case 0x680520: - shift += 4; + shift += 4; /* fall through */ case 0x680508: - shift += 4; + shift += 4; /* fall through */ case 0x680504: - shift += 4; + shift += 4; /* fall through */ case 0x680500: shift += 4; } @@ -245,11 +245,11 @@ setPLL_double_highregs(struct nvkm_devinit *init, u32 reg1, switch (reg1) { case 0x680504: - shift_c040 += 2; + shift_c040 += 2; /* fall through */ case 0x680500: - shift_c040 += 2; + shift_c040 += 2; /* fall through */ case 0x680520: - shift_c040 += 2; + shift_c040 += 2; /* fall through */ case 0x680508: shift_c040 += 2; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c index a7049c041594..73b5d46104bd 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c @@ -31,12 +31,6 @@ #include <engine/gr.h> #include <engine/mpeg.h> -bool -nvkm_fb_memtype_valid(struct nvkm_fb *fb, u32 memtype) -{ - return fb->func->memtype_valid(fb, memtype); -} - void nvkm_fb_tile_fini(struct nvkm_fb *fb, int region, struct nvkm_fb_tile *tile) { @@ -100,6 +94,7 @@ static int nvkm_fb_oneinit(struct nvkm_subdev *subdev) { struct nvkm_fb *fb = nvkm_fb(subdev); + u32 tags = 0; if (fb->func->ram_new) { int ret = fb->func->ram_new(fb, &fb->ram); @@ -115,7 +110,16 @@ nvkm_fb_oneinit(struct nvkm_subdev *subdev) return ret; } - return 0; + 
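/*
 * Two cleanup patterns recur in the bios/devinit hunks above: hand-rolled
 * element counts give way to ARRAY_SIZE() (hence the new <linux/kernel.h>
 * include in init.c), and deliberate switch fall-throughs get explicit
 * annotations for -Wimplicit-fallthrough.  The ARRAY_SIZE idiom in
 * miniature:
 */
#include <linux/kernel.h>

static const char *const opcode_names[] = { "nop", "copy", "jump" };

static const char *
opcode_name(unsigned int op)
{
	if (op >= ARRAY_SIZE(opcode_names))	/* count tracks the array */
		return NULL;
	return opcode_names[op];
}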
/* Initialise compression tag allocator. + * + * LTC oneinit() will override this on Fermi and newer. + */ + if (fb->func->tags) { + tags = fb->func->tags(fb); + nvkm_debug(subdev, "%d comptags\n", tags); + } + + return nvkm_mm_init(&fb->tags, 0, 0, tags, 1); } static int @@ -135,8 +139,13 @@ nvkm_fb_init(struct nvkm_subdev *subdev) if (fb->func->init) fb->func->init(fb); - if (fb->func->init_page) - fb->func->init_page(fb); + + if (fb->func->init_page) { + ret = fb->func->init_page(fb); + if (WARN_ON(ret)) + return ret; + } + if (fb->func->init_unkn) fb->func->init_unkn(fb); return 0; @@ -148,12 +157,13 @@ nvkm_fb_dtor(struct nvkm_subdev *subdev) struct nvkm_fb *fb = nvkm_fb(subdev); int i; - nvkm_memory_del(&fb->mmu_wr); - nvkm_memory_del(&fb->mmu_rd); + nvkm_memory_unref(&fb->mmu_wr); + nvkm_memory_unref(&fb->mmu_rd); for (i = 0; i < fb->tile.regions; i++) fb->func->tile.fini(fb, i, &fb->tile.region[i]); + nvkm_mm_fini(&fb->tags); nvkm_ram_del(&fb->ram); if (fb->func->dtor) @@ -176,7 +186,8 @@ nvkm_fb_ctor(const struct nvkm_fb_func *func, struct nvkm_device *device, nvkm_subdev_ctor(&nvkm_fb, device, index, &fb->subdev); fb->func = func; fb->tile.regions = fb->func->tile.regions; - fb->page = nvkm_longopt(device->cfgopt, "NvFbBigPage", 0); + fb->page = nvkm_longopt(device->cfgopt, "NvFbBigPage", + fb->func->default_bigpage); } int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/g84.c index 9c28392d07e4..06bf95c0c549 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/g84.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/g84.c @@ -27,6 +27,7 @@ static const struct nv50_fb_func g84_fb = { .ram_new = nv50_ram_new, + .tags = nv20_fb_tags, .trap = 0x001d07ff, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.c index a239e73562c8..47d28c279707 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.c @@ -27,15 +27,6 @@ #include <core/memory.h> #include <core/option.h> -extern const u8 gf100_pte_storage_type_map[256]; - -bool -gf100_fb_memtype_valid(struct nvkm_fb *fb, u32 tile_flags) -{ - u8 memtype = (tile_flags & 0x0000ff00) >> 8; - return likely((gf100_pte_storage_type_map[memtype] != 0xff)); -} - void gf100_fb_intr(struct nvkm_fb *base) { @@ -80,20 +71,17 @@ gf100_fb_oneinit(struct nvkm_fb *base) return 0; } -void +int gf100_fb_init_page(struct nvkm_fb *fb) { struct nvkm_device *device = fb->subdev.device; switch (fb->page) { - case 16: - nvkm_mask(device, 0x100c80, 0x00000001, 0x00000001); - break; - case 17: + case 16: nvkm_mask(device, 0x100c80, 0x00000001, 0x00000001); break; + case 17: nvkm_mask(device, 0x100c80, 0x00000001, 0x00000000); break; default: - nvkm_mask(device, 0x100c80, 0x00000001, 0x00000000); - fb->page = 17; - break; + return -EINVAL; } + return 0; } void @@ -143,7 +131,7 @@ gf100_fb = { .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, .ram_new = gf100_ram_new, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 17, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.h index 412eb89834e8..e3cf0515bb70 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf100.h @@ -17,7 +17,5 @@ void gf100_fb_intr(struct nvkm_fb *); void gp100_fb_init(struct nvkm_fb *); -void gm200_fb_init_page(struct nvkm_fb *fb); void gm200_fb_init(struct nvkm_fb *base); - #endif diff --git 
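/*
 * The compression-tag pool moves from nvkm_ram to the fb itself: oneinit
 * sizes it through the new per-chipset ->tags() hook and always initialises
 * the nvkm_mm, so on chipsets without tags later allocations simply fail.
 * Allocation and release, following the tile code further below (helper
 * names are illustrative):
 */
static int
alloc_comptags(struct nvkm_fb *fb, u32 count, struct nvkm_mm_node **ptag)
{
	/* heap 0, type 1, fixed size, alignment 1 -- as the tile code uses */
	return nvkm_mm_head(&fb->tags, 0, 1, count, count, 1, ptag);
}

static void
free_comptags(struct nvkm_fb *fb, struct nvkm_mm_node **ptag)
{
	nvkm_mm_free(&fb->tags, ptag);
}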
a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c index 56af84aa333b..4a9f463745b5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gf108.c @@ -32,7 +32,7 @@ gf108_fb = { .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, .ram_new = gf108_ram_new, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 17, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c index 4245e2e6e604..0a6e8eaad42c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk104.c @@ -32,7 +32,7 @@ gk104_fb = { .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, .ram_new = gk104_ram_new, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 17, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk20a.c index 5d34d6136616..a7e29b125094 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gk20a.c @@ -30,7 +30,7 @@ gk20a_fb = { .init = gf100_fb_init, .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 17, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c index db699025f546..69c876d5d1c1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm107.c @@ -32,7 +32,7 @@ gm107_fb = { .init_page = gf100_fb_init_page, .intr = gf100_fb_intr, .ram_new = gm107_ram_new, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 17, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm200.c index d83da5ddbc1e..8137e19d3292 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm200.c @@ -26,22 +26,18 @@ #include <core/memory.h> -void +int gm200_fb_init_page(struct nvkm_fb *fb) { struct nvkm_device *device = fb->subdev.device; switch (fb->page) { - case 16: - nvkm_mask(device, 0x100c80, 0x00000801, 0x00000001); - break; - case 17: - nvkm_mask(device, 0x100c80, 0x00000801, 0x00000000); - break; + case 16: nvkm_mask(device, 0x100c80, 0x00001801, 0x00001001); break; + case 17: nvkm_mask(device, 0x100c80, 0x00001801, 0x00000000); break; + case 0: nvkm_mask(device, 0x100c80, 0x00001800, 0x00001800); break; default: - nvkm_mask(device, 0x100c80, 0x00000800, 0x00000800); - fb->page = 0; - break; + return -EINVAL; } + return 0; } void @@ -69,7 +65,7 @@ gm200_fb = { .init_page = gm200_fb_init_page, .intr = gf100_fb_intr, .ram_new = gm200_ram_new, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 0 /* per-instance. */, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm20b.c index b87c233bcd6d..12db61e31128 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm20b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gm20b.c @@ -30,7 +30,7 @@ gm20b_fb = { .init = gm200_fb_init, .init_page = gm200_fb_init_page, .intr = gf100_fb_intr, - .memtype_valid = gf100_fb_memtype_valid, + .default_bigpage = 0 /* per-instance. 
*/, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp100.c index 98474aec1921..147f69b30cd8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp100.c @@ -59,7 +59,6 @@ gp100_fb = { .init_page = gm200_fb_init_page, .init_unkn = gp100_fb_init_unkn, .ram_new = gp100_ram_new, - .memtype_valid = gf100_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp102.c index 73b4ae1c73dc..b84b9861ef26 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp102.c @@ -33,7 +33,6 @@ gp102_fb = { .init = gp100_fb_init, .init_page = gm200_fb_init_page, .ram_new = gp100_ram_new, - .memtype_valid = gf100_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp10b.c index f2b1fbf428d5..af8e43979dc1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gp10b.c @@ -28,7 +28,6 @@ gp10b_fb = { .init = gm200_fb_init, .init_page = gm200_fb_init_page, .intr = gf100_fb_intr, - .memtype_valid = gf100_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gt215.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gt215.c index ebb30608d5ef..9266559b45f9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gt215.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gt215.c @@ -27,6 +27,7 @@ static const struct nv50_fb_func gt215_fb = { .ram_new = gt215_ram_new, + .tags = nv20_fb_tags, .trap = 0x000d0fff, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv04.c index 8ff2e5db4571..c886664533c8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv04.c @@ -25,14 +25,6 @@ #include "ram.h" #include "regsnv04.h" -bool -nv04_fb_memtype_valid(struct nvkm_fb *fb, u32 tile_flags) -{ - if (!(tile_flags & 0xff00)) - return true; - return false; -} - static void nv04_fb_init(struct nvkm_fb *fb) { @@ -49,7 +41,6 @@ static const struct nvkm_fb_func nv04_fb = { .init = nv04_fb_init, .ram_new = nv04_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv10.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv10.c index e8c44f5a3d84..c998b7e96aa3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv10.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv10.c @@ -61,7 +61,6 @@ nv10_fb = { .tile.fini = nv10_fb_tile_fini, .tile.prog = nv10_fb_tile_prog, .ram_new = nv10_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv1a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv1a.c index 2ae0beb87567..7b9f04f44af8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv1a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv1a.c @@ -33,7 +33,6 @@ nv1a_fb = { .tile.fini = nv10_fb_tile_fini, .tile.prog = nv10_fb_tile_prog, .ram_new = nv1a_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv20.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv20.c index 126865dfe777..a021d21ff153 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv20.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv20.c @@ -45,7 +45,7 @@ nv20_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, { u32 tiles = DIV_ROUND_UP(size, 0x40); u32 tags = round_up(tiles / 
fb->ram->parts, 0x40); - if (!nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + if (!nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { if (!(flags & 2)) tile->zcomp = 0x00000000; /* Z16 */ else tile->zcomp = 0x04000000; /* Z24S8 */ tile->zcomp |= tile->tag->offset; @@ -63,7 +63,7 @@ nv20_fb_tile_fini(struct nvkm_fb *fb, int i, struct nvkm_fb_tile *tile) tile->limit = 0; tile->pitch = 0; tile->zcomp = 0; - nvkm_mm_free(&fb->ram->tags, &tile->tag); + nvkm_mm_free(&fb->tags, &tile->tag); } void @@ -77,15 +77,22 @@ nv20_fb_tile_prog(struct nvkm_fb *fb, int i, struct nvkm_fb_tile *tile) nvkm_wr32(device, 0x100300 + (i * 0x04), tile->zcomp); } +u32 +nv20_fb_tags(struct nvkm_fb *fb) +{ + const u32 tags = nvkm_rd32(fb->subdev.device, 0x100320); + return tags ? tags + 1 : 0; +} + static const struct nvkm_fb_func nv20_fb = { + .tags = nv20_fb_tags, .tile.regions = 8, .tile.init = nv20_fb_tile_init, .tile.comp = nv20_fb_tile_comp, .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv20_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv25.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv25.c index c56746d2a502..7709f5fe9a45 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv25.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv25.c @@ -32,7 +32,7 @@ nv25_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, { u32 tiles = DIV_ROUND_UP(size, 0x40); u32 tags = round_up(tiles / fb->ram->parts, 0x40); - if (!nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + if (!nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { if (!(flags & 2)) tile->zcomp = 0x00100000; /* Z16 */ else tile->zcomp = 0x00200000; /* Z24S8 */ tile->zcomp |= tile->tag->offset; @@ -44,13 +44,13 @@ nv25_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, static const struct nvkm_fb_func nv25_fb = { + .tags = nv20_fb_tags, .tile.regions = 8, .tile.init = nv20_fb_tile_init, .tile.comp = nv25_fb_tile_comp, .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv20_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv30.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv30.c index 2a7c4831b821..8aa782666507 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv30.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv30.c @@ -51,7 +51,7 @@ nv30_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, { u32 tiles = DIV_ROUND_UP(size, 0x40); u32 tags = round_up(tiles / fb->ram->parts, 0x40); - if (!nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + if (!nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { if (flags & 2) tile->zcomp |= 0x01000000; /* Z16 */ else tile->zcomp |= 0x02000000; /* Z24S8 */ tile->zcomp |= ((tile->tag->offset ) >> 6); @@ -116,6 +116,7 @@ nv30_fb_init(struct nvkm_fb *fb) static const struct nvkm_fb_func nv30_fb = { + .tags = nv20_fb_tags, .init = nv30_fb_init, .tile.regions = 8, .tile.init = nv30_fb_tile_init, @@ -123,7 +124,6 @@ nv30_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv20_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv35.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv35.c index 1604b3789ad1..6e83dcff72e0 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv35.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv35.c @@ -32,7 +32,7 @@ nv35_fb_tile_comp(struct nvkm_fb *fb, int i, 
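/*
 * Worked example for the comptag sizing used in these tile_comp functions:
 * one tag covers a 0x40-byte tile and the pool is shared across memory
 * partitions.  For a hypothetical 1MiB tile region on a 2-partition board:
 *
 *   tiles = DIV_ROUND_UP(0x100000, 0x40) = 0x4000
 *   tags  = round_up(0x4000 / 2, 0x40)   = 0x2000
 *
 * so 8192 tags are reserved from fb->tags and the node's offset is folded
 * into the per-region zcomp value.
 */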
u32 size, u32 flags, { u32 tiles = DIV_ROUND_UP(size, 0x40); u32 tags = round_up(tiles / fb->ram->parts, 0x40); - if (!nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + if (!nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { if (flags & 2) tile->zcomp |= 0x04000000; /* Z16 */ else tile->zcomp |= 0x08000000; /* Z24S8 */ tile->zcomp |= ((tile->tag->offset ) >> 6); @@ -45,6 +45,7 @@ nv35_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, static const struct nvkm_fb_func nv35_fb = { + .tags = nv20_fb_tags, .init = nv30_fb_init, .tile.regions = 8, .tile.init = nv30_fb_tile_init, @@ -52,7 +53,6 @@ nv35_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv20_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv36.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv36.c index 80cc0a6e3416..2a07617bb44c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv36.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv36.c @@ -32,7 +32,7 @@ nv36_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, { u32 tiles = DIV_ROUND_UP(size, 0x40); u32 tags = round_up(tiles / fb->ram->parts, 0x40); - if (!nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + if (!nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { if (flags & 2) tile->zcomp |= 0x10000000; /* Z16 */ else tile->zcomp |= 0x20000000; /* Z24S8 */ tile->zcomp |= ((tile->tag->offset ) >> 6); @@ -45,6 +45,7 @@ nv36_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, static const struct nvkm_fb_func nv36_fb = { + .tags = nv20_fb_tags, .init = nv30_fb_init, .tile.regions = 8, .tile.init = nv30_fb_tile_init, @@ -52,7 +53,6 @@ nv36_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv20_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv40.c index deec46a310f8..955160778b5b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv40.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv40.c @@ -33,7 +33,7 @@ nv40_fb_tile_comp(struct nvkm_fb *fb, int i, u32 size, u32 flags, u32 tiles = DIV_ROUND_UP(size, 0x80); u32 tags = round_up(tiles / fb->ram->parts, 0x100); if ( (flags & 2) && - !nvkm_mm_head(&fb->ram->tags, 0, 1, tags, tags, 1, &tile->tag)) { + !nvkm_mm_head(&fb->tags, 0, 1, tags, tags, 1, &tile->tag)) { tile->zcomp = 0x28000000; /* Z24S8_SPLIT_GRAD */ tile->zcomp |= ((tile->tag->offset ) >> 8); tile->zcomp |= ((tile->tag->offset + tags - 1) >> 8) << 13; @@ -51,6 +51,7 @@ nv40_fb_init(struct nvkm_fb *fb) static const struct nvkm_fb_func nv40_fb = { + .tags = nv20_fb_tags, .init = nv40_fb_init, .tile.regions = 8, .tile.init = nv30_fb_tile_init, @@ -58,7 +59,6 @@ nv40_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv20_fb_tile_prog, .ram_new = nv40_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv41.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv41.c index 79e57dd5a00f..b77f08d34cc3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv41.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv41.c @@ -45,6 +45,7 @@ nv41_fb_init(struct nvkm_fb *fb) static const struct nvkm_fb_func nv41_fb = { + .tags = nv20_fb_tags, .init = nv41_fb_init, .tile.regions = 12, .tile.init = nv30_fb_tile_init, @@ -52,7 +53,6 @@ nv41_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv41_fb_tile_prog, .ram_new = nv41_ram_new, - .memtype_valid 
= nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv44.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv44.c index 06246cce5ec4..b59dc486083d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv44.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv44.c @@ -62,7 +62,6 @@ nv44_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv44_fb_tile_prog, .ram_new = nv44_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv46.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv46.c index 3598a1aa65be..cab7d20fa039 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv46.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv46.c @@ -48,7 +48,6 @@ nv46_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv44_fb_tile_prog, .ram_new = nv44_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv47.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv47.c index c505e4429314..a8b0ad4c871d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv47.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv47.c @@ -28,6 +28,7 @@ static const struct nvkm_fb_func nv47_fb = { + .tags = nv20_fb_tags, .init = nv41_fb_init, .tile.regions = 15, .tile.init = nv30_fb_tile_init, @@ -35,7 +36,6 @@ nv47_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv41_fb_tile_prog, .ram_new = nv41_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv49.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv49.c index 7b91b9f170e5..d0b317bb0252 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv49.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv49.c @@ -28,6 +28,7 @@ static const struct nvkm_fb_func nv49_fb = { + .tags = nv20_fb_tags, .init = nv41_fb_init, .tile.regions = 15, .tile.init = nv30_fb_tile_init, @@ -35,7 +36,6 @@ nv49_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv41_fb_tile_prog, .ram_new = nv49_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv4e.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv4e.c index 4e98210c1b1c..6a6f0c086071 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv4e.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv4e.c @@ -34,7 +34,6 @@ nv4e_fb = { .tile.fini = nv20_fb_tile_fini, .tile.prog = nv44_fb_tile_prog, .ram_new = nv44_ram_new, - .memtype_valid = nv04_fb_memtype_valid, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.c index 0595e0722bfc..b2f5bf8144ea 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.c @@ -28,18 +28,6 @@ #include <core/enum.h> #include <engine/fifo.h> -int -nv50_fb_memtype[0x80] = { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 1, 0, 2, 0, 1, 0, 2, 0, 1, 1, 2, 2, 1, 1, 0, 0 -}; - static int nv50_fb_ram_new(struct nvkm_fb *base, struct nvkm_ram **pram) { @@ -47,12 +35,6 @@ nv50_fb_ram_new(struct nvkm_fb *base, struct nvkm_ram **pram) return fb->func->ram_new(&fb->base, pram); } -static bool -nv50_fb_memtype_valid(struct nvkm_fb *fb, u32 memtype) -{ - return nv50_fb_memtype[(memtype & 0xff00) >> 8] != 0; -} - static const 
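The nv35/nv36/nv40 hunks above only change where compression tags are allocated from (fb->tags instead of fb->ram->tags); the sizing math is untouched. A minimal userspace sketch of that math, with the kernel's DIV_ROUND_UP()/round_up() expanded and the Z24S8 encoding taken from the nv35 hunk; the tag offset used here is hypothetical:

	#include <stdint.h>
	#include <stdio.h>

	/* One tag covers a 0x40-byte tile; the per-partition tag count is
	 * padded up to a 0x40 boundary before being carved out of tag RAM.
	 */
	static uint32_t tags_for_region(uint32_t size, uint32_t parts)
	{
		uint32_t tiles = (size + 0x3f) / 0x40;    /* DIV_ROUND_UP(size, 0x40) */
		return ((tiles / parts) + 0x3f) & ~0x3fu; /* round_up(.., 0x40) */
	}

	int main(void)
	{
		uint32_t tag_offset = 0x1000;             /* hypothetical mm offset */
		uint32_t zcomp = 0x08000000;              /* Z24S8 on nv35, per the hunk */
		zcomp |= tag_offset >> 6;                 /* start tag in 64-byte units */
		printf("tags=%u zcomp=%08x\n", tags_for_region(1u << 20, 2), zcomp);
		return 0;
	}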
struct nvkm_enum vm_dispatch_subclients[] = { { 0x00000000, "GRCTX" }, { 0x00000001, "NOTIFY" }, @@ -244,6 +226,15 @@ nv50_fb_init(struct nvkm_fb *base) nvkm_wr32(device, 0x100c90, fb->func->trap); } +static u32 +nv50_fb_tags(struct nvkm_fb *base) +{ + struct nv50_fb *fb = nv50_fb(base); + if (fb->func->tags) + return fb->func->tags(&fb->base); + return 0; +} + static void * nv50_fb_dtor(struct nvkm_fb *base) { @@ -262,11 +253,11 @@ nv50_fb_dtor(struct nvkm_fb *base) static const struct nvkm_fb_func nv50_fb_ = { .dtor = nv50_fb_dtor, + .tags = nv50_fb_tags, .oneinit = nv50_fb_oneinit, .init = nv50_fb_init, .intr = nv50_fb_intr, .ram_new = nv50_fb_ram_new, - .memtype_valid = nv50_fb_memtype_valid, }; int @@ -287,6 +278,7 @@ nv50_fb_new_(const struct nv50_fb_func *func, struct nvkm_device *device, static const struct nv50_fb_func nv50_fb = { .ram_new = nv50_ram_new, + .tags = nv20_fb_tags, .trap = 0x000707ff, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.h index faa88c8c66fe..13231d4b00d9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/nv50.h @@ -12,10 +12,10 @@ struct nv50_fb { struct nv50_fb_func { int (*ram_new)(struct nvkm_fb *, struct nvkm_ram **); + u32 (*tags)(struct nvkm_fb *); u32 trap; }; int nv50_fb_new_(const struct nv50_fb_func *, struct nvkm_device *, int index, struct nvkm_fb **pfb); -extern int nv50_fb_memtype[0x80]; #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h index e905d44fa1d5..e05d95240e85 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h @@ -6,9 +6,10 @@ struct nvkm_bios; struct nvkm_fb_func { void *(*dtor)(struct nvkm_fb *); + u32 (*tags)(struct nvkm_fb *); int (*oneinit)(struct nvkm_fb *); void (*init)(struct nvkm_fb *); - void (*init_page)(struct nvkm_fb *); + int (*init_page)(struct nvkm_fb *); void (*init_unkn)(struct nvkm_fb *); void (*intr)(struct nvkm_fb *); @@ -24,7 +25,7 @@ struct nvkm_fb_func { int (*ram_new)(struct nvkm_fb *, struct nvkm_ram **); - bool (*memtype_valid)(struct nvkm_fb *, u32 memtype); + u8 default_bigpage; }; void nvkm_fb_ctor(const struct nvkm_fb_func *, struct nvkm_device *device, @@ -33,13 +34,12 @@ int nvkm_fb_new_(const struct nvkm_fb_func *, struct nvkm_device *device, int index, struct nvkm_fb **); int nvkm_fb_bios_memtype(struct nvkm_bios *); -bool nv04_fb_memtype_valid(struct nvkm_fb *, u32 memtype); - void nv10_fb_tile_init(struct nvkm_fb *, int i, u32 addr, u32 size, u32 pitch, u32 flags, struct nvkm_fb_tile *); void nv10_fb_tile_fini(struct nvkm_fb *, int i, struct nvkm_fb_tile *); void nv10_fb_tile_prog(struct nvkm_fb *, int, struct nvkm_fb_tile *); +u32 nv20_fb_tags(struct nvkm_fb *); void nv20_fb_tile_init(struct nvkm_fb *, int i, u32 addr, u32 size, u32 pitch, u32 flags, struct nvkm_fb_tile *); void nv20_fb_tile_fini(struct nvkm_fb *, int i, struct nvkm_fb_tile *); @@ -62,8 +62,7 @@ void nv46_fb_tile_init(struct nvkm_fb *, int i, u32 addr, u32 size, u32 pitch, u32 flags, struct nvkm_fb_tile *); int gf100_fb_oneinit(struct nvkm_fb *); -void gf100_fb_init_page(struct nvkm_fb *); -bool gf100_fb_memtype_valid(struct nvkm_fb *, u32); +int gf100_fb_init_page(struct nvkm_fb *); -void gm200_fb_init_page(struct nvkm_fb *); +int gm200_fb_init_page(struct nvkm_fb *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.c index 
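A sketch of the optional ->tags hook that priv.h gains above: the base layer calls it if present, and nv50_fb_tags() forwards to the chip-specific nv50_fb_func table, defaulting to 0 when a chipset has no tag memory. Stand-in types, not the real nvkm structures:

	struct fb;
	struct fb_func {
		unsigned (*tags)(struct fb *);  /* optional, like nvkm_fb_func.tags */
	};
	struct fb {
		const struct fb_func *func;
	};

	/* Mirrors the delegation in nv50_fb_tags() above. */
	static unsigned fb_tags(struct fb *fb)
	{
		if (fb->func->tags)
			return fb->func->tags(fb);
		return 0; /* no compression tags on this chipset */
	}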
c17d559dbfbe..24c7bd505731 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.c @@ -21,8 +21,132 @@ * * Authors: Ben Skeggs <bskeggs@redhat.com> */ +#define nvkm_vram(p) container_of((p), struct nvkm_vram, memory) #include "ram.h" +#include <core/memory.h> +#include <subdev/mmu.h> + +struct nvkm_vram { + struct nvkm_memory memory; + struct nvkm_ram *ram; + u8 page; + struct nvkm_mm_node *mn; +}; + +static int +nvkm_vram_map(struct nvkm_memory *memory, u64 offset, struct nvkm_vmm *vmm, + struct nvkm_vma *vma, void *argv, u32 argc) +{ + struct nvkm_vram *vram = nvkm_vram(memory); + struct nvkm_vmm_map map = { + .memory = &vram->memory, + .offset = offset, + .mem = vram->mn, + }; + + return nvkm_vmm_map(vmm, vma, argv, argc, &map); +} + +static u64 +nvkm_vram_size(struct nvkm_memory *memory) +{ + return (u64)nvkm_mm_size(nvkm_vram(memory)->mn) << NVKM_RAM_MM_SHIFT; +} + +static u64 +nvkm_vram_addr(struct nvkm_memory *memory) +{ + struct nvkm_vram *vram = nvkm_vram(memory); + if (!nvkm_mm_contiguous(vram->mn)) + return ~0ULL; + return (u64)nvkm_mm_addr(vram->mn) << NVKM_RAM_MM_SHIFT; +} + +static u8 +nvkm_vram_page(struct nvkm_memory *memory) +{ + return nvkm_vram(memory)->page; +} + +static enum nvkm_memory_target +nvkm_vram_target(struct nvkm_memory *memory) +{ + return NVKM_MEM_TARGET_VRAM; +} + +static void * +nvkm_vram_dtor(struct nvkm_memory *memory) +{ + struct nvkm_vram *vram = nvkm_vram(memory); + struct nvkm_mm_node *next = vram->mn; + struct nvkm_mm_node *node; + mutex_lock(&vram->ram->fb->subdev.mutex); + while ((node = next)) { + next = node->next; + nvkm_mm_free(&vram->ram->vram, &node); + } + mutex_unlock(&vram->ram->fb->subdev.mutex); + return vram; +} + +static const struct nvkm_memory_func +nvkm_vram = { + .dtor = nvkm_vram_dtor, + .target = nvkm_vram_target, + .page = nvkm_vram_page, + .addr = nvkm_vram_addr, + .size = nvkm_vram_size, + .map = nvkm_vram_map, +}; + +int +nvkm_ram_get(struct nvkm_device *device, u8 heap, u8 type, u8 rpage, u64 size, + bool contig, bool back, struct nvkm_memory **pmemory) +{ + struct nvkm_ram *ram; + struct nvkm_mm *mm; + struct nvkm_mm_node **node, *r; + struct nvkm_vram *vram; + u8 page = max(rpage, (u8)NVKM_RAM_MM_SHIFT); + u32 align = (1 << page) >> NVKM_RAM_MM_SHIFT; + u32 max = ALIGN(size, 1 << page) >> NVKM_RAM_MM_SHIFT; + u32 min = contig ? 
max : align; + int ret; + + if (!device->fb || !(ram = device->fb->ram)) + return -ENODEV; + ram = device->fb->ram; + mm = &ram->vram; + + if (!(vram = kzalloc(sizeof(*vram), GFP_KERNEL))) + return -ENOMEM; + nvkm_memory_ctor(&nvkm_vram, &vram->memory); + vram->ram = ram; + vram->page = page; + *pmemory = &vram->memory; + + mutex_lock(&ram->fb->subdev.mutex); + node = &vram->mn; + do { + if (back) + ret = nvkm_mm_tail(mm, heap, type, max, min, align, &r); + else + ret = nvkm_mm_head(mm, heap, type, max, min, align, &r); + if (ret) { + mutex_unlock(&ram->fb->subdev.mutex); + nvkm_memory_unref(pmemory); + return ret; + } + + *node = r; + node = &r->next; + max -= r->length; + } while (max); + mutex_unlock(&ram->fb->subdev.mutex); + return 0; +} + int nvkm_ram_init(struct nvkm_ram *ram) { @@ -38,7 +162,6 @@ nvkm_ram_del(struct nvkm_ram **pram) if (ram && !WARN_ON(!ram->func)) { if (ram->func->dtor) *pram = ram->func->dtor(ram); - nvkm_mm_fini(&ram->tags); nvkm_mm_fini(&ram->vram); kfree(*pram); *pram = NULL; @@ -47,8 +170,7 @@ nvkm_ram_del(struct nvkm_ram **pram) int nvkm_ram_ctor(const struct nvkm_ram_func *func, struct nvkm_fb *fb, - enum nvkm_ram_type type, u64 size, u32 tags, - struct nvkm_ram *ram) + enum nvkm_ram_type type, u64 size, struct nvkm_ram *ram) { static const char *name[] = { [NVKM_RAM_TYPE_UNKNOWN] = "of unknown memory type", @@ -73,28 +195,20 @@ nvkm_ram_ctor(const struct nvkm_ram_func *func, struct nvkm_fb *fb, ram->size = size; if (!nvkm_mm_initialised(&ram->vram)) { - ret = nvkm_mm_init(&ram->vram, 0, size >> NVKM_RAM_MM_SHIFT, 1); + ret = nvkm_mm_init(&ram->vram, NVKM_RAM_MM_NORMAL, 0, + size >> NVKM_RAM_MM_SHIFT, 1); if (ret) return ret; } - if (!nvkm_mm_initialised(&ram->tags)) { - ret = nvkm_mm_init(&ram->tags, 0, tags ? ++tags : 0, 1); - if (ret) - return ret; - - nvkm_debug(subdev, "%d compression tags\n", tags); - } - return 0; } int nvkm_ram_new_(const struct nvkm_ram_func *func, struct nvkm_fb *fb, - enum nvkm_ram_type type, u64 size, u32 tags, - struct nvkm_ram **pram) + enum nvkm_ram_type type, u64 size, struct nvkm_ram **pram) { if (!(*pram = kzalloc(sizeof(**pram), GFP_KERNEL))) return -ENOMEM; - return nvkm_ram_ctor(func, fb, type, size, tags, *pram); + return nvkm_ram_ctor(func, fb, type, size, *pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h index fac7e73c3ddf..70fd59dcd06d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h @@ -3,11 +3,9 @@ #include "priv.h" int nvkm_ram_ctor(const struct nvkm_ram_func *, struct nvkm_fb *, - enum nvkm_ram_type, u64 size, u32 tags, - struct nvkm_ram *); + enum nvkm_ram_type, u64 size, struct nvkm_ram *); int nvkm_ram_new_(const struct nvkm_ram_func *, struct nvkm_fb *, - enum nvkm_ram_type, u64 size, u32 tags, - struct nvkm_ram **); + enum nvkm_ram_type, u64 size, struct nvkm_ram **); void nvkm_ram_del(struct nvkm_ram **); int nvkm_ram_init(struct nvkm_ram *); @@ -15,9 +13,6 @@ extern const struct nvkm_ram_func nv04_ram_func; int nv50_ram_ctor(const struct nvkm_ram_func *, struct nvkm_fb *, struct nvkm_ram *); -int nv50_ram_get(struct nvkm_ram *, u64, u32, u32, u32, struct nvkm_mem **); -void nv50_ram_put(struct nvkm_ram *, struct nvkm_mem **); -void __nv50_ram_put(struct nvkm_ram *, struct nvkm_mem *); int gf100_ram_new_(const struct nvkm_ram_func *, struct nvkm_fb *, struct nvkm_ram **); @@ -28,8 +23,6 @@ u32 gf100_ram_probe_fbp(const struct nvkm_ram_func *, u32 gf100_ram_probe_fbp_amount(const 
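The heart of the new nvkm_ram_get() above is its node-chaining loop: the request is satisfied by however many allocator spans it takes, linked through ->next, and a contiguous request sets min == max so the first span must cover everything. A self-contained sketch with a toy span allocator standing in for nvkm_mm_head()/nvkm_mm_tail():

	#include <stdio.h>
	#include <stdlib.h>

	struct node { unsigned length; struct node *next; };

	/* Toy allocator: grants at most 4 units per span. */
	static int alloc_span(unsigned max, unsigned min, struct node **r)
	{
		unsigned grant = max < 4 ? max : 4;
		if (grant < min)
			return -1;            /* cannot satisfy the minimum */
		*r = malloc(sizeof(**r));
		if (!*r)
			return -1;
		(*r)->length = grant;
		(*r)->next = NULL;
		return 0;
	}

	static int get_chunked(unsigned max, unsigned min, struct node **head)
	{
		struct node **link = head, *r;
		do {
			if (alloc_span(max, min, &r))
				return -1;    /* caller frees any partial chain */
			*link = r;            /* append span to the chain */
			link = &r->next;
			max -= r->length;
		} while (max);                /* loop until fully satisfied */
		return 0;
	}

	int main(void)
	{
		struct node *head = NULL, *n;
		if (get_chunked(10, 1, &head) == 0)
			for (n = head; n; n = n->next)
				printf("span of %u units\n", n->length);
		return 0;
	}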
struct nvkm_ram_func *, u32, struct nvkm_device *, int, int *); u32 gf100_ram_probe_fbpa_amount(struct nvkm_device *, int); -int gf100_ram_get(struct nvkm_ram *, u64, u32, u32, u32, struct nvkm_mem **); -void gf100_ram_put(struct nvkm_ram *, struct nvkm_mem **); int gf100_ram_init(struct nvkm_ram *); int gf100_ram_calc(struct nvkm_ram *, u32); int gf100_ram_prog(struct nvkm_ram *); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf100.c index 4a9bd4f1cb93..ac87a3b6b7c9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf100.c @@ -32,7 +32,6 @@ #include <subdev/bios/timing.h> #include <subdev/clk.h> #include <subdev/clk/pll.h> -#include <subdev/ltc.h> struct gf100_ramfuc { struct ramfuc base; @@ -420,86 +419,6 @@ gf100_ram_tidy(struct nvkm_ram *base) ram_exec(&ram->fuc, false); } -void -gf100_ram_put(struct nvkm_ram *ram, struct nvkm_mem **pmem) -{ - struct nvkm_ltc *ltc = ram->fb->subdev.device->ltc; - struct nvkm_mem *mem = *pmem; - - *pmem = NULL; - if (unlikely(mem == NULL)) - return; - - mutex_lock(&ram->fb->subdev.mutex); - if (mem->tag) - nvkm_ltc_tags_free(ltc, &mem->tag); - __nv50_ram_put(ram, mem); - mutex_unlock(&ram->fb->subdev.mutex); - - kfree(mem); -} - -int -gf100_ram_get(struct nvkm_ram *ram, u64 size, u32 align, u32 ncmin, - u32 memtype, struct nvkm_mem **pmem) -{ - struct nvkm_ltc *ltc = ram->fb->subdev.device->ltc; - struct nvkm_mm *mm = &ram->vram; - struct nvkm_mm_node **node, *r; - struct nvkm_mem *mem; - int type = (memtype & 0x0ff); - int back = (memtype & 0x800); - const bool comp = gf100_pte_storage_type_map[type] != type; - int ret; - - size >>= NVKM_RAM_MM_SHIFT; - align >>= NVKM_RAM_MM_SHIFT; - ncmin >>= NVKM_RAM_MM_SHIFT; - if (!ncmin) - ncmin = size; - - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) - return -ENOMEM; - - mem->size = size; - - mutex_lock(&ram->fb->subdev.mutex); - if (comp) { - /* compression only works with lpages */ - if (align == (1 << (17 - NVKM_RAM_MM_SHIFT))) { - int n = size >> 5; - nvkm_ltc_tags_alloc(ltc, n, &mem->tag); - } - - if (unlikely(!mem->tag)) - type = gf100_pte_storage_type_map[type]; - } - mem->memtype = type; - - node = &mem->mem; - do { - if (back) - ret = nvkm_mm_tail(mm, 0, 1, size, ncmin, align, &r); - else - ret = nvkm_mm_head(mm, 0, 1, size, ncmin, align, &r); - if (ret) { - mutex_unlock(&ram->fb->subdev.mutex); - ram->func->put(ram, &mem); - return ret; - } - - *node = r; - node = &r->next; - size -= r->length; - } while (size); - mutex_unlock(&ram->fb->subdev.mutex); - - mem->offset = (u64)mem->mem->offset << NVKM_RAM_MM_SHIFT; - *pmem = mem; - return 0; -} - int gf100_ram_init(struct nvkm_ram *base) { @@ -604,7 +523,7 @@ gf100_ram_ctor(const struct nvkm_ram_func *func, struct nvkm_fb *fb, nvkm_debug(subdev, "Upper: %4lld MiB @ %010llx\n", usize >> 20, ubase); nvkm_debug(subdev, "Total: %4lld MiB\n", total >> 20); - ret = nvkm_ram_ctor(func, fb, type, total, 0, ram); + ret = nvkm_ram_ctor(func, fb, type, total, ram); if (ret) return ret; @@ -617,7 +536,8 @@ gf100_ram_ctor(const struct nvkm_ram_func *func, struct nvkm_fb *fb, */ if (lower != total) { /* The common memory amount is addressed normally. 
*/ - ret = nvkm_mm_init(&ram->vram, rsvd_head >> NVKM_RAM_MM_SHIFT, + ret = nvkm_mm_init(&ram->vram, NVKM_RAM_MM_NORMAL, + rsvd_head >> NVKM_RAM_MM_SHIFT, (lower - rsvd_head) >> NVKM_RAM_MM_SHIFT, 1); if (ret) return ret; @@ -625,13 +545,15 @@ gf100_ram_ctor(const struct nvkm_ram_func *func, struct nvkm_fb *fb, /* And the rest is much higher in the physical address * space, and may not be usable for certain operations. */ - ret = nvkm_mm_init(&ram->vram, ubase >> NVKM_RAM_MM_SHIFT, + ret = nvkm_mm_init(&ram->vram, NVKM_RAM_MM_MIXED, + ubase >> NVKM_RAM_MM_SHIFT, (usize - rsvd_tail) >> NVKM_RAM_MM_SHIFT, 1); if (ret) return ret; } else { /* GPUs without mixed-memory are a lot nicer... */ - ret = nvkm_mm_init(&ram->vram, rsvd_head >> NVKM_RAM_MM_SHIFT, + ret = nvkm_mm_init(&ram->vram, NVKM_RAM_MM_NORMAL, + rsvd_head >> NVKM_RAM_MM_SHIFT, (total - rsvd_head - rsvd_tail) >> NVKM_RAM_MM_SHIFT, 1); if (ret) @@ -738,8 +660,6 @@ gf100_ram = { .probe_fbp_amount = gf100_ram_probe_fbp_amount, .probe_fbpa_amount = gf100_ram_probe_fbpa_amount, .init = gf100_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, .calc = gf100_ram_calc, .prog = gf100_ram_prog, .tidy = gf100_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf108.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf108.c index 985ec64cf369..70a06e3cd55a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf108.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgf108.c @@ -48,8 +48,6 @@ gf108_ram = { .probe_fbp_amount = gf108_ram_probe_fbp_amount, .probe_fbpa_amount = gf100_ram_probe_fbpa_amount, .init = gf100_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, .calc = gf100_ram_calc, .prog = gf100_ram_prog, .tidy = gf100_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgk104.c index 75814f15eb53..8bcb7e79a0cb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgk104.c @@ -1704,8 +1704,6 @@ gk104_ram = { .probe_fbpa_amount = gf100_ram_probe_fbpa_amount, .dtor = gk104_ram_dtor, .init = gk104_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, .calc = gk104_ram_calc, .prog = gk104_ram_prog, .tidy = gk104_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm107.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm107.c index 3f0b56347291..27c68e3f9772 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm107.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm107.c @@ -39,8 +39,6 @@ gm107_ram = { .probe_fbpa_amount = gf100_ram_probe_fbpa_amount, .dtor = gk104_ram_dtor, .init = gk104_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, .calc = gk104_ram_calc, .prog = gk104_ram_prog, .tidy = gk104_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm200.c index fd8facf90476..6b0cac1fe7b4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgm200.c @@ -54,8 +54,6 @@ gm200_ram = { .probe_fbpa_amount = gf100_ram_probe_fbpa_amount, .dtor = gk104_ram_dtor, .init = gk104_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, .calc = gk104_ram_calc, .prog = gk104_ram_prog, .tidy = gk104_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c index df8a87333b67..adb62a6beb63 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgp100.c 
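A compact restatement of the heap split gf100_ram_ctor() now performs above, with the address arithmetic pulled out. NVKM_RAM_MM_SHIFT is 12 in nouveau, i.e. the allocator works in 4KiB units; types are simplified:

	#include <stdint.h>

	#define SHIFT 12 /* NVKM_RAM_MM_SHIFT: allocator units of 4KiB */

	struct span { uint64_t offset, length; }; /* in allocator units */

	/* lower/total/ubase/usize and the reserved head/tail are byte
	 * amounts, as in the constructor above. When lower != total the
	 * board has a mixed-memory layout: the low range is registered as
	 * the NORMAL heap and the high range as the MIXED heap.
	 */
	static void split_heaps(uint64_t lower, uint64_t total, uint64_t ubase,
				uint64_t usize, uint64_t rsvd_head,
				uint64_t rsvd_tail,
				struct span *normal, struct span *mixed)
	{
		if (lower != total) {
			normal->offset = rsvd_head >> SHIFT;
			normal->length = (lower - rsvd_head) >> SHIFT;
			mixed->offset  = ubase >> SHIFT;
			mixed->length  = (usize - rsvd_tail) >> SHIFT;
		} else {
			normal->offset = rsvd_head >> SHIFT;
			normal->length = (total - rsvd_head - rsvd_tail) >> SHIFT;
			mixed->offset  = mixed->length = 0; /* no upper window */
		}
	}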
@@ -84,8 +84,6 @@ gp100_ram = { .probe_fbp_amount = gm200_ram_probe_fbp_amount, .probe_fbpa_amount = gp100_ram_probe_fbpa, .init = gp100_ram_init, - .get = gf100_ram_get, - .put = gf100_ram_put, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgt215.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgt215.c index f10664372161..920b3d347803 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgt215.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramgt215.c @@ -26,6 +26,7 @@ #include "ram.h" #include "ramfuc.h" +#include <core/memory.h> #include <core/option.h> #include <subdev/bios.h> #include <subdev/bios/M0205.h> @@ -86,7 +87,7 @@ struct gt215_ltrain { u32 r_100720; u32 r_1111e0; u32 r_111400; - struct nvkm_mem *mem; + struct nvkm_memory *memory; }; struct gt215_ram { @@ -279,10 +280,10 @@ gt215_link_train_init(struct gt215_ram *ram) struct gt215_ltrain *train = &ram->ltrain; struct nvkm_device *device = ram->base.fb->subdev.device; struct nvkm_bios *bios = device->bios; - struct nvkm_mem *mem; struct nvbios_M0205E M0205E; u8 ver, hdr, cnt, len; u32 r001700; + u64 addr; int ret, i = 0; train->state = NVA3_TRAIN_UNSUPPORTED; @@ -297,14 +298,14 @@ gt215_link_train_init(struct gt215_ram *ram) train->state = NVA3_TRAIN_ONCE; - ret = ram->base.func->get(&ram->base, 0x8000, 0x10000, 0, 0x800, - &ram->ltrain.mem); + ret = nvkm_ram_get(device, NVKM_RAM_MM_NORMAL, 0x01, 16, 0x8000, + true, true, &ram->ltrain.memory); if (ret) return ret; - mem = ram->ltrain.mem; + addr = nvkm_memory_addr(ram->ltrain.memory); - nvkm_wr32(device, 0x100538, 0x10000000 | (mem->offset >> 16)); + nvkm_wr32(device, 0x100538, 0x10000000 | (addr >> 16)); nvkm_wr32(device, 0x1005a8, 0x0000ffff); nvkm_mask(device, 0x10f800, 0x00000001, 0x00000001); @@ -320,7 +321,7 @@ gt215_link_train_init(struct gt215_ram *ram) /* And upload the pattern */ r001700 = nvkm_rd32(device, 0x1700); - nvkm_wr32(device, 0x1700, mem->offset >> 16); + nvkm_wr32(device, 0x1700, addr >> 16); for (i = 0; i < 16; i++) nvkm_wr32(device, 0x700000 + (i << 2), pattern[i]); for (i = 0; i < 16; i++) @@ -336,8 +337,7 @@ gt215_link_train_init(struct gt215_ram *ram) static void gt215_link_train_fini(struct gt215_ram *ram) { - if (ram->ltrain.mem) - ram->base.func->put(&ram->base, &ram->ltrain.mem); + nvkm_memory_unref(&ram->ltrain.memory); } /* @@ -931,8 +931,6 @@ static const struct nvkm_ram_func gt215_ram_func = { .dtor = gt215_ram_dtor, .init = gt215_ram_init, - .get = nv50_ram_get, - .put = nv50_ram_put, .calc = gt215_ram_calc, .prog = gt215_ram_prog, .tidy = gt215_ram_tidy, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/rammcp77.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/rammcp77.c index 017a91de74a0..7de18e53ef45 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/rammcp77.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/rammcp77.c @@ -53,8 +53,6 @@ mcp77_ram_init(struct nvkm_ram *base) static const struct nvkm_ram_func mcp77_ram_func = { .init = mcp77_ram_init, - .get = nv50_ram_get, - .put = nv50_ram_put, }; int @@ -73,7 +71,7 @@ mcp77_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) *pram = &ram->base; ret = nvkm_ram_ctor(&mcp77_ram_func, fb, NVKM_RAM_TYPE_STOLEN, - size, 0, &ram->base); + size, &ram->base); if (ret) return ret; @@ -81,7 +79,8 @@ mcp77_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) ram->base.stolen = base; nvkm_mm_fini(&ram->base.vram); - return nvkm_mm_init(&ram->base.vram, rsvd_head >> NVKM_RAM_MM_SHIFT, + return nvkm_mm_init(&ram->base.vram, NVKM_RAM_MM_NORMAL, + rsvd_head >> NVKM_RAM_MM_SHIFT, (size - rsvd_head - 
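Reading the gt215 hunk above against the new nvkm_ram_get() signature (device, heap, type, page, size, contig, back, pmemory): the link-training buffer is a contiguous 0x8000-byte allocation with page shift 16 (64KiB alignment), taken from the back of the NVKM_RAM_MM_NORMAL heap. Note that nvkm_ram_get() rounds the size up to the page size, so the 32KiB request actually consumes 64KiB of VRAM:

	/* As in the diff; repeated here only to label the arguments. */
	ret = nvkm_ram_get(device, NVKM_RAM_MM_NORMAL, /* heap */
			   0x01,                       /* memory type */
			   16,                         /* page shift: 64KiB */
			   0x8000,                     /* size: 32KiB */
			   true,                       /* contiguous */
			   true,                       /* allocate from top */
			   &ram->ltrain.memory);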
rsvd_tail) >> NVKM_RAM_MM_SHIFT, 1); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv04.c index 6f053a03d61c..cc764a93f1a3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv04.c @@ -61,5 +61,5 @@ nv04_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) else type = NVKM_RAM_TYPE_SDRAM; - return nvkm_ram_new_(&nv04_ram_func, fb, type, size, 0, pram); + return nvkm_ram_new_(&nv04_ram_func, fb, type, size, pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv10.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv10.c index dfd155c98dbb..afe54e323b18 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv10.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv10.c @@ -36,5 +36,5 @@ nv10_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) else type = NVKM_RAM_TYPE_SDRAM; - return nvkm_ram_new_(&nv04_ram_func, fb, type, size, 0, pram); + return nvkm_ram_new_(&nv04_ram_func, fb, type, size, pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv1a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv1a.c index 3c6a8710e812..4c07d10bb976 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv1a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv1a.c @@ -44,5 +44,5 @@ nv1a_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) } return nvkm_ram_new_(&nv04_ram_func, fb, NVKM_RAM_TYPE_STOLEN, - mib * 1024 * 1024, 0, pram); + mib * 1024 * 1024, pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv20.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv20.c index 747e47c10cc7..71d63d7daa75 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv20.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv20.c @@ -29,7 +29,6 @@ nv20_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) struct nvkm_device *device = fb->subdev.device; u32 pbus1218 = nvkm_rd32(device, 0x001218); u32 size = (nvkm_rd32(device, 0x10020c) & 0xff000000); - u32 tags = nvkm_rd32(device, 0x100320); enum nvkm_ram_type type = NVKM_RAM_TYPE_UNKNOWN; int ret; @@ -40,7 +39,7 @@ nv20_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) case 0x00000300: type = NVKM_RAM_TYPE_GDDR2; break; } - ret = nvkm_ram_new_(&nv04_ram_func, fb, type, size, tags, pram); + ret = nvkm_ram_new_(&nv04_ram_func, fb, type, size, pram); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.c index 70c63535d56b..2b12e388f47a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.c @@ -187,13 +187,13 @@ nv40_ram_func = { int nv40_ram_new_(struct nvkm_fb *fb, enum nvkm_ram_type type, u64 size, - u32 tags, struct nvkm_ram **pram) + struct nvkm_ram **pram) { struct nv40_ram *ram; if (!(ram = kzalloc(sizeof(*ram), GFP_KERNEL))) return -ENOMEM; *pram = &ram->base; - return nvkm_ram_ctor(&nv40_ram_func, fb, type, size, tags, &ram->base); + return nvkm_ram_ctor(&nv40_ram_func, fb, type, size, &ram->base); } int @@ -202,7 +202,6 @@ nv40_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) struct nvkm_device *device = fb->subdev.device; u32 pbus1218 = nvkm_rd32(device, 0x001218); u32 size = nvkm_rd32(device, 0x10020c) & 0xff000000; - u32 tags = nvkm_rd32(device, 0x100320); enum nvkm_ram_type type = NVKM_RAM_TYPE_UNKNOWN; int ret; @@ -213,7 +212,7 @@ nv40_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) case 0x00000300: type = NVKM_RAM_TYPE_DDR2 ; break; } - ret = nv40_ram_new_(fb, type, size, 
tags, pram); + ret = nv40_ram_new_(fb, type, size, pram); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.h index 8a0524566b48..ec5dcbfcaea8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv40.h @@ -9,6 +9,6 @@ struct nv40_ram { u32 coef; }; -int nv40_ram_new_(struct nvkm_fb *fb, enum nvkm_ram_type, u64, u32, +int nv40_ram_new_(struct nvkm_fb *fb, enum nvkm_ram_type, u64, struct nvkm_ram **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv41.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv41.c index 114828be292e..d3fea3726461 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv41.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv41.c @@ -28,7 +28,6 @@ nv41_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) { struct nvkm_device *device = fb->subdev.device; u32 size = nvkm_rd32(device, 0x10020c) & 0xff000000; - u32 tags = nvkm_rd32(device, 0x100320); u32 fb474 = nvkm_rd32(device, 0x100474); enum nvkm_ram_type type = NVKM_RAM_TYPE_UNKNOWN; int ret; @@ -40,7 +39,7 @@ nv41_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) if (fb474 & 0x00000001) type = NVKM_RAM_TYPE_DDR1; - ret = nv40_ram_new_(fb, type, size, tags, pram); + ret = nv40_ram_new_(fb, type, size, pram); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv44.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv44.c index bc56fbf1c788..ab2630e5e6fb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv44.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv44.c @@ -38,5 +38,5 @@ nv44_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) if (fb474 & 0x00000001) type = NVKM_RAM_TYPE_DDR1; - return nv40_ram_new_(fb, type, size, 0, pram); + return nv40_ram_new_(fb, type, size, pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv49.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv49.c index c01f4b1022b8..946ca7c2e0b6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv49.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv49.c @@ -28,7 +28,6 @@ nv49_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) { struct nvkm_device *device = fb->subdev.device; u32 size = nvkm_rd32(device, 0x10020c) & 0xff000000; - u32 tags = nvkm_rd32(device, 0x100320); u32 fb914 = nvkm_rd32(device, 0x100914); enum nvkm_ram_type type = NVKM_RAM_TYPE_UNKNOWN; int ret; @@ -40,7 +39,7 @@ nv49_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) case 0x00000003: break; } - ret = nv40_ram_new_(fb, type, size, tags, pram); + ret = nv40_ram_new_(fb, type, size, pram); if (ret) return ret; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv4e.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv4e.c index fa3c2e06203d..02b8bdbc819f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv4e.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv4e.c @@ -29,5 +29,5 @@ nv4e_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) struct nvkm_device *device = fb->subdev.device; u32 size = nvkm_rd32(device, 0x10020c) & 0xff000000; return nvkm_ram_new_(&nv04_ram_func, fb, NVKM_RAM_TYPE_UNKNOWN, - size, 0, pram); + size, pram); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv50.c index 6549b0588309..2ccb4b6be153 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramnv50.c @@ -493,100 +493,8 @@ nv50_ram_tidy(struct nvkm_ram *base) ram_exec(&ram->hwsq, false); } 
-void -__nv50_ram_put(struct nvkm_ram *ram, struct nvkm_mem *mem) -{ - struct nvkm_mm_node *next = mem->mem; - struct nvkm_mm_node *node; - while ((node = next)) { - next = node->next; - nvkm_mm_free(&ram->vram, &node); - } - nvkm_mm_free(&ram->tags, &mem->tag); -} - -void -nv50_ram_put(struct nvkm_ram *ram, struct nvkm_mem **pmem) -{ - struct nvkm_mem *mem = *pmem; - - *pmem = NULL; - if (unlikely(mem == NULL)) - return; - - mutex_lock(&ram->fb->subdev.mutex); - __nv50_ram_put(ram, mem); - mutex_unlock(&ram->fb->subdev.mutex); - - kfree(mem); -} - -int -nv50_ram_get(struct nvkm_ram *ram, u64 size, u32 align, u32 ncmin, - u32 memtype, struct nvkm_mem **pmem) -{ - struct nvkm_mm *heap = &ram->vram; - struct nvkm_mm *tags = &ram->tags; - struct nvkm_mm_node **node, *r; - struct nvkm_mem *mem; - int comp = (memtype & 0x300) >> 8; - int type = (memtype & 0x07f); - int back = (memtype & 0x800); - int min, max, ret; - - max = (size >> NVKM_RAM_MM_SHIFT); - min = ncmin ? (ncmin >> NVKM_RAM_MM_SHIFT) : max; - align >>= NVKM_RAM_MM_SHIFT; - - mem = kzalloc(sizeof(*mem), GFP_KERNEL); - if (!mem) - return -ENOMEM; - - mutex_lock(&ram->fb->subdev.mutex); - if (comp) { - if (align == (1 << (16 - NVKM_RAM_MM_SHIFT))) { - int n = (max >> 4) * comp; - - ret = nvkm_mm_head(tags, 0, 1, n, n, 1, &mem->tag); - if (ret) - mem->tag = NULL; - } - - if (unlikely(!mem->tag)) - comp = 0; - } - - mem->memtype = (comp << 7) | type; - mem->size = max; - - type = nv50_fb_memtype[type]; - node = &mem->mem; - do { - if (back) - ret = nvkm_mm_tail(heap, 0, type, max, min, align, &r); - else - ret = nvkm_mm_head(heap, 0, type, max, min, align, &r); - if (ret) { - mutex_unlock(&ram->fb->subdev.mutex); - ram->func->put(ram, &mem); - return ret; - } - - *node = r; - node = &r->next; - max -= r->length; - } while (max); - mutex_unlock(&ram->fb->subdev.mutex); - - mem->offset = (u64)mem->mem->offset << NVKM_RAM_MM_SHIFT; - *pmem = mem; - return 0; -} - static const struct nvkm_ram_func nv50_ram_func = { - .get = nv50_ram_get, - .put = nv50_ram_put, .calc = nv50_ram_calc, .prog = nv50_ram_prog, .tidy = nv50_ram_tidy, @@ -639,7 +547,6 @@ nv50_ram_ctor(const struct nvkm_ram_func *func, const u32 rsvd_head = ( 256 * 1024); /* vga memory */ const u32 rsvd_tail = (1024 * 1024); /* vbios etc */ u64 size = nvkm_rd32(device, 0x10020c); - u32 tags = nvkm_rd32(device, 0x100320); enum nvkm_ram_type type = NVKM_RAM_TYPE_UNKNOWN; int ret; @@ -660,7 +567,7 @@ nv50_ram_ctor(const struct nvkm_ram_func *func, size = (size & 0x000000ff) << 32 | (size & 0xffffff00); - ret = nvkm_ram_ctor(func, fb, type, size, tags, ram); + ret = nvkm_ram_ctor(func, fb, type, size, ram); if (ret) return ret; @@ -669,7 +576,8 @@ nv50_ram_ctor(const struct nvkm_ram_func *func, ram->ranks = (nvkm_rd32(device, 0x100200) & 0x4) ? 
2 : 1; nvkm_mm_fini(&ram->vram); - return nvkm_mm_init(&ram->vram, rsvd_head >> NVKM_RAM_MM_SHIFT, + return nvkm_mm_init(&ram->vram, NVKM_RAM_MM_NORMAL, + rsvd_head >> NVKM_RAM_MM_SHIFT, (size - rsvd_head - rsvd_tail) >> NVKM_RAM_MM_SHIFT, nv50_fb_vram_rblock(ram) >> NVKM_RAM_MM_SHIFT); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/base.c index 10c987a654ec..364ea4492acc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/base.c @@ -23,181 +23,90 @@ */ #include "priv.h" -#include <core/memory.h> #include <subdev/bar.h> /****************************************************************************** * instmem object base implementation *****************************************************************************/ -#define nvkm_instobj(p) container_of((p), struct nvkm_instobj, memory) - -struct nvkm_instobj { - struct nvkm_memory memory; - struct nvkm_memory *parent; - struct nvkm_instmem *imem; - struct list_head head; - u32 *suspend; - void __iomem *map; -}; - -static enum nvkm_memory_target -nvkm_instobj_target(struct nvkm_memory *memory) -{ - memory = nvkm_instobj(memory)->parent; - return nvkm_memory_target(memory); -} - -static u64 -nvkm_instobj_addr(struct nvkm_memory *memory) -{ - memory = nvkm_instobj(memory)->parent; - return nvkm_memory_addr(memory); -} - -static u64 -nvkm_instobj_size(struct nvkm_memory *memory) -{ - memory = nvkm_instobj(memory)->parent; - return nvkm_memory_size(memory); -} - static void -nvkm_instobj_release(struct nvkm_memory *memory) +nvkm_instobj_load(struct nvkm_instobj *iobj) { - struct nvkm_instobj *iobj = nvkm_instobj(memory); - nvkm_bar_flush(iobj->imem->subdev.device->bar); -} - -static void __iomem * -nvkm_instobj_acquire(struct nvkm_memory *memory) -{ - return nvkm_instobj(memory)->map; -} - -static u32 -nvkm_instobj_rd32(struct nvkm_memory *memory, u64 offset) -{ - return ioread32_native(nvkm_instobj(memory)->map + offset); -} - -static void -nvkm_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) -{ - iowrite32_native(data, nvkm_instobj(memory)->map + offset); -} + struct nvkm_memory *memory = &iobj->memory; + const u64 size = nvkm_memory_size(memory); + void __iomem *map; + int i; -static void -nvkm_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset) -{ - memory = nvkm_instobj(memory)->parent; - nvkm_memory_map(memory, vma, offset); -} + if (!(map = nvkm_kmap(memory))) { + for (i = 0; i < size; i += 4) + nvkm_wo32(memory, i, iobj->suspend[i / 4]); + } else { + memcpy_toio(map, iobj->suspend, size); + } + nvkm_done(memory); -static void * -nvkm_instobj_dtor(struct nvkm_memory *memory) -{ - struct nvkm_instobj *iobj = nvkm_instobj(memory); - spin_lock(&iobj->imem->lock); - list_del(&iobj->head); - spin_unlock(&iobj->imem->lock); - nvkm_memory_del(&iobj->parent); - return iobj; + kvfree(iobj->suspend); + iobj->suspend = NULL; } -static const struct nvkm_memory_func -nvkm_instobj_func = { - .dtor = nvkm_instobj_dtor, - .target = nvkm_instobj_target, - .addr = nvkm_instobj_addr, - .size = nvkm_instobj_size, - .acquire = nvkm_instobj_acquire, - .release = nvkm_instobj_release, - .rd32 = nvkm_instobj_rd32, - .wr32 = nvkm_instobj_wr32, - .map = nvkm_instobj_map, -}; - -static void -nvkm_instobj_boot(struct nvkm_memory *memory, struct nvkm_vm *vm) +static int +nvkm_instobj_save(struct nvkm_instobj *iobj) { - memory = nvkm_instobj(memory)->parent; - nvkm_memory_boot(memory, vm); -} + struct 
nvkm_memory *memory = &iobj->memory; + const u64 size = nvkm_memory_size(memory); + void __iomem *map; + int i; -static void -nvkm_instobj_release_slow(struct nvkm_memory *memory) -{ - struct nvkm_instobj *iobj = nvkm_instobj(memory); - nvkm_instobj_release(memory); - nvkm_done(iobj->parent); -} + iobj->suspend = kvmalloc(size, GFP_KERNEL); + if (!iobj->suspend) + return -ENOMEM; -static void __iomem * -nvkm_instobj_acquire_slow(struct nvkm_memory *memory) -{ - struct nvkm_instobj *iobj = nvkm_instobj(memory); - iobj->map = nvkm_kmap(iobj->parent); - if (iobj->map) - memory->func = &nvkm_instobj_func; - return iobj->map; + if (!(map = nvkm_kmap(memory))) { + for (i = 0; i < size; i += 4) + iobj->suspend[i / 4] = nvkm_ro32(memory, i); + } else { + memcpy_fromio(iobj->suspend, map, size); + } + nvkm_done(memory); + return 0; } -static u32 -nvkm_instobj_rd32_slow(struct nvkm_memory *memory, u64 offset) +void +nvkm_instobj_dtor(struct nvkm_instmem *imem, struct nvkm_instobj *iobj) { - struct nvkm_instobj *iobj = nvkm_instobj(memory); - return nvkm_ro32(iobj->parent, offset); + spin_lock(&imem->lock); + list_del(&iobj->head); + spin_unlock(&imem->lock); } -static void -nvkm_instobj_wr32_slow(struct nvkm_memory *memory, u64 offset, u32 data) +void +nvkm_instobj_ctor(const struct nvkm_memory_func *func, + struct nvkm_instmem *imem, struct nvkm_instobj *iobj) { - struct nvkm_instobj *iobj = nvkm_instobj(memory); - return nvkm_wo32(iobj->parent, offset, data); + nvkm_memory_ctor(func, &iobj->memory); + iobj->suspend = NULL; + spin_lock(&imem->lock); + list_add_tail(&iobj->head, &imem->list); + spin_unlock(&imem->lock); } -static const struct nvkm_memory_func -nvkm_instobj_func_slow = { - .dtor = nvkm_instobj_dtor, - .target = nvkm_instobj_target, - .addr = nvkm_instobj_addr, - .size = nvkm_instobj_size, - .boot = nvkm_instobj_boot, - .acquire = nvkm_instobj_acquire_slow, - .release = nvkm_instobj_release_slow, - .rd32 = nvkm_instobj_rd32_slow, - .wr32 = nvkm_instobj_wr32_slow, - .map = nvkm_instobj_map, -}; - int nvkm_instobj_new(struct nvkm_instmem *imem, u32 size, u32 align, bool zero, struct nvkm_memory **pmemory) { + struct nvkm_subdev *subdev = &imem->subdev; struct nvkm_memory *memory = NULL; - struct nvkm_instobj *iobj; u32 offset; int ret; ret = imem->func->memory_new(imem, size, align, zero, &memory); - if (ret) + if (ret) { + nvkm_error(subdev, "OOM: %08x %08x %d\n", size, align, ret); goto done; - - if (!imem->func->persistent) { - if (!(iobj = kzalloc(sizeof(*iobj), GFP_KERNEL))) { - ret = -ENOMEM; - goto done; - } - - nvkm_memory_ctor(&nvkm_instobj_func_slow, &iobj->memory); - iobj->parent = memory; - iobj->imem = imem; - spin_lock(&iobj->imem->lock); - list_add_tail(&iobj->head, &imem->list); - spin_unlock(&iobj->imem->lock); - memory = &iobj->memory; } + nvkm_trace(subdev, "new %08x %08x %d: %010llx %010llx\n", size, align, + zero, nvkm_memory_addr(memory), nvkm_memory_size(memory)); + if (!imem->func->zero && zero) { void __iomem *map = nvkm_kmap(memory); if (unlikely(!map)) { @@ -211,7 +120,7 @@ nvkm_instobj_new(struct nvkm_instmem *imem, u32 size, u32 align, bool zero, done: if (ret) - nvkm_memory_del(&memory); + nvkm_memory_unref(&memory); *pmemory = memory; return ret; } @@ -232,39 +141,46 @@ nvkm_instmem_wr32(struct nvkm_instmem *imem, u32 addr, u32 data) return imem->func->wr32(imem, addr, data); } +void +nvkm_instmem_boot(struct nvkm_instmem *imem) +{ + /* Separate bootstrapped objects from normal list, as we need + * to make sure they're accessed with the slowpath on suspend 
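A userspace sketch of the save path in nvkm_instobj_save() above: back the object up into an allocated buffer, preferring a bulk copy through a direct mapping and falling back to 32-bit accessor reads when no mapping is available (the load path mirrors this with writes). The mapping and ro32 callback are stand-ins for nvkm_kmap()/nvkm_ro32():

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	struct obj {
		uint64_t size;                            /* bytes, multiple of 4 */
		void *map;                                /* direct mapping, or NULL */
		uint32_t (*ro32)(struct obj *, uint64_t); /* slow word accessor */
		uint32_t *suspend;                        /* backup buffer */
	};

	static int obj_save(struct obj *o)
	{
		uint64_t i;

		o->suspend = malloc(o->size);             /* kvmalloc() in the diff */
		if (!o->suspend)
			return -1;

		if (o->map)                               /* fast path: bulk copy */
			memcpy(o->suspend, o->map, o->size);
		else                                      /* slow path: word reads */
			for (i = 0; i < o->size; i += 4)
				o->suspend[i / 4] = o->ro32(o, i);
		return 0;
	}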
+ * and resume. + */ + struct nvkm_instobj *iobj, *itmp; + spin_lock(&imem->lock); + list_for_each_entry_safe(iobj, itmp, &imem->list, head) { + list_move_tail(&iobj->head, &imem->boot); + } + spin_unlock(&imem->lock); +} + static int nvkm_instmem_fini(struct nvkm_subdev *subdev, bool suspend) { struct nvkm_instmem *imem = nvkm_instmem(subdev); struct nvkm_instobj *iobj; - int i; - - if (imem->func->fini) - imem->func->fini(imem); if (suspend) { list_for_each_entry(iobj, &imem->list, head) { - struct nvkm_memory *memory = iobj->parent; - u64 size = nvkm_memory_size(memory); + int ret = nvkm_instobj_save(iobj); + if (ret) + return ret; + } - iobj->suspend = vmalloc(size); - if (!iobj->suspend) - return -ENOMEM; + nvkm_bar_bar2_fini(subdev->device); - for (i = 0; i < size; i += 4) - iobj->suspend[i / 4] = nvkm_ro32(memory, i); + list_for_each_entry(iobj, &imem->boot, head) { + int ret = nvkm_instobj_save(iobj); + if (ret) + return ret; } } - return 0; -} + if (imem->func->fini) + imem->func->fini(imem); -static int -nvkm_instmem_oneinit(struct nvkm_subdev *subdev) -{ - struct nvkm_instmem *imem = nvkm_instmem(subdev); - if (imem->func->oneinit) - return imem->func->oneinit(imem); return 0; } @@ -273,22 +189,31 @@ nvkm_instmem_init(struct nvkm_subdev *subdev) { struct nvkm_instmem *imem = nvkm_instmem(subdev); struct nvkm_instobj *iobj; - int i; + + list_for_each_entry(iobj, &imem->boot, head) { + if (iobj->suspend) + nvkm_instobj_load(iobj); + } + + nvkm_bar_bar2_init(subdev->device); list_for_each_entry(iobj, &imem->list, head) { - if (iobj->suspend) { - struct nvkm_memory *memory = iobj->parent; - u64 size = nvkm_memory_size(memory); - for (i = 0; i < size; i += 4) - nvkm_wo32(memory, i, iobj->suspend[i / 4]); - vfree(iobj->suspend); - iobj->suspend = NULL; - } + if (iobj->suspend) + nvkm_instobj_load(iobj); } return 0; } +static int +nvkm_instmem_oneinit(struct nvkm_subdev *subdev) +{ + struct nvkm_instmem *imem = nvkm_instmem(subdev); + if (imem->func->oneinit) + return imem->func->oneinit(imem); + return 0; +} + static void * nvkm_instmem_dtor(struct nvkm_subdev *subdev) { @@ -315,4 +240,5 @@ nvkm_instmem_ctor(const struct nvkm_instmem_func *func, imem->func = func; spin_lock_init(&imem->lock); INIT_LIST_HEAD(&imem->list); + INIT_LIST_HEAD(&imem->boot); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c index cd5adbec5e57..985f2990ab0d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c @@ -44,14 +44,13 @@ #include "priv.h" #include <core/memory.h> -#include <core/mm.h> #include <core/tegra.h> -#include <subdev/fb.h> #include <subdev/ltc.h> +#include <subdev/mmu.h> struct gk20a_instobj { struct nvkm_memory memory; - struct nvkm_mem mem; + struct nvkm_mm_node *mn; struct gk20a_instmem *imem; /* CPU mapping */ @@ -119,16 +118,22 @@ gk20a_instobj_target(struct nvkm_memory *memory) return NVKM_MEM_TARGET_NCOH; } +static u8 +gk20a_instobj_page(struct nvkm_memory *memory) +{ + return 12; +} + static u64 gk20a_instobj_addr(struct nvkm_memory *memory) { - return gk20a_instobj(memory)->mem.offset; + return (u64)gk20a_instobj(memory)->mn->offset << 12; } static u64 gk20a_instobj_size(struct nvkm_memory *memory) { - return (u64)gk20a_instobj(memory)->mem.size << 12; + return (u64)gk20a_instobj(memory)->mn->length << 12; } /* @@ -272,12 +277,18 @@ gk20a_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) node->vaddr[offset / 4] = data; } -static 
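Both nvkm_vram_map() earlier and gk20a_instobj_map() below follow the same shape: the memory object packs itself, the caller's offset, and its allocator nodes into a struct nvkm_vmm_map descriptor and defers all page-table work to the VMM. A sketch of that shape with stand-in types; vmm_map() is an assumed analogue of nvkm_vmm_map():

	#include <stdint.h>

	struct mm_node;              /* allocator node chain, as in the diff */
	struct vmm;
	struct vma;
	struct vmm_map {
		void *memory;        /* backing object, for lifetime tracking */
		uint64_t offset;     /* byte offset into the backing memory */
		struct mm_node *mem; /* VRAM nodes to map */
	};

	extern int vmm_map(struct vmm *, struct vma *, void *argv, uint32_t argc,
			   struct vmm_map *); /* assumed nvkm_vmm_map() analogue */

	static int memory_map(void *memory, struct mm_node *nodes, uint64_t offset,
			      struct vmm *vmm, struct vma *vma,
			      void *argv, uint32_t argc)
	{
		struct vmm_map map = {
			.memory = memory,
			.offset = offset,
			.mem    = nodes,
		};
		return vmm_map(vmm, vma, argv, argc, &map);
	}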
void -gk20a_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset) +static int +gk20a_instobj_map(struct nvkm_memory *memory, u64 offset, struct nvkm_vmm *vmm, + struct nvkm_vma *vma, void *argv, u32 argc) { struct gk20a_instobj *node = gk20a_instobj(memory); + struct nvkm_vmm_map map = { + .memory = &node->memory, + .offset = offset, + .mem = node->mn, + }; - nvkm_vm_map_at(vma, offset, &node->mem); + return nvkm_vmm_map(vmm, vma, argv, argc, &map); } static void * @@ -290,8 +301,8 @@ gk20a_instobj_dtor_dma(struct nvkm_memory *memory) if (unlikely(!node->base.vaddr)) goto out; - dma_free_attrs(dev, node->base.mem.size << PAGE_SHIFT, node->base.vaddr, - node->handle, imem->attrs); + dma_free_attrs(dev, (u64)node->base.mn->length << PAGE_SHIFT, + node->base.vaddr, node->handle, imem->attrs); out: return node; @@ -303,7 +314,7 @@ gk20a_instobj_dtor_iommu(struct nvkm_memory *memory) struct gk20a_instobj_iommu *node = gk20a_instobj_iommu(memory); struct gk20a_instmem *imem = node->base.imem; struct device *dev = imem->base.subdev.device->dev; - struct nvkm_mm_node *r = node->base.mem.mem; + struct nvkm_mm_node *r = node->base.mn; int i; if (unlikely(!r)) @@ -321,7 +332,7 @@ gk20a_instobj_dtor_iommu(struct nvkm_memory *memory) r->offset &= ~BIT(imem->iommu_bit - imem->iommu_pgshift); /* Unmap pages from GPU address space and free them */ - for (i = 0; i < node->base.mem.size; i++) { + for (i = 0; i < node->base.mn->length; i++) { iommu_unmap(imem->domain, (r->offset + i) << imem->iommu_pgshift, PAGE_SIZE); dma_unmap_page(dev, node->dma_addrs[i], PAGE_SIZE, @@ -342,12 +353,11 @@ static const struct nvkm_memory_func gk20a_instobj_func_dma = { .dtor = gk20a_instobj_dtor_dma, .target = gk20a_instobj_target, + .page = gk20a_instobj_page, .addr = gk20a_instobj_addr, .size = gk20a_instobj_size, .acquire = gk20a_instobj_acquire_dma, .release = gk20a_instobj_release_dma, - .rd32 = gk20a_instobj_rd32, - .wr32 = gk20a_instobj_wr32, .map = gk20a_instobj_map, }; @@ -355,13 +365,18 @@ static const struct nvkm_memory_func gk20a_instobj_func_iommu = { .dtor = gk20a_instobj_dtor_iommu, .target = gk20a_instobj_target, + .page = gk20a_instobj_page, .addr = gk20a_instobj_addr, .size = gk20a_instobj_size, .acquire = gk20a_instobj_acquire_iommu, .release = gk20a_instobj_release_iommu, + .map = gk20a_instobj_map, +}; + +static const struct nvkm_memory_ptrs +gk20a_instobj_ptrs = { .rd32 = gk20a_instobj_rd32, .wr32 = gk20a_instobj_wr32, - .map = gk20a_instobj_map, }; static int @@ -377,6 +392,7 @@ gk20a_instobj_ctor_dma(struct gk20a_instmem *imem, u32 npages, u32 align, *_node = &node->base; nvkm_memory_ctor(&gk20a_instobj_func_dma, &node->base.memory); + node->base.memory.ptrs = &gk20a_instobj_ptrs; node->base.vaddr = dma_alloc_attrs(dev, npages << PAGE_SHIFT, &node->handle, GFP_KERNEL, @@ -397,8 +413,7 @@ gk20a_instobj_ctor_dma(struct gk20a_instmem *imem, u32 npages, u32 align, node->r.offset = node->handle >> 12; node->r.length = (npages << PAGE_SHIFT) >> 12; - node->base.mem.offset = node->handle; - node->base.mem.mem = &node->r; + node->base.mn = &node->r; return 0; } @@ -424,6 +439,7 @@ gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align, node->dma_addrs = (void *)(node->pages + npages); nvkm_memory_ctor(&gk20a_instobj_func_iommu, &node->base.memory); + node->base.memory.ptrs = &gk20a_instobj_ptrs; /* Allocate backing memory */ for (i = 0; i < npages; i++) { @@ -474,8 +490,7 @@ gk20a_instobj_ctor_iommu(struct gk20a_instmem *imem, u32 npages, u32 align, /* IOMMU bit tells 
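The gk20a hunks above show the other half of this rework: rd32/wr32 move out of nvkm_memory_func into a small nvkm_memory_ptrs table attached per object, so accessors can be attached (or later swapped) independently of the object's main function table. A self-contained sketch of the pattern:

	#include <stdint.h>
	#include <stdio.h>

	struct mem;
	struct mem_ptrs {            /* like nvkm_memory_ptrs */
		uint32_t (*rd32)(struct mem *, uint64_t);
		void (*wr32)(struct mem *, uint64_t, uint32_t);
	};
	struct mem {
		const struct mem_ptrs *ptrs;
		uint32_t backing[64]; /* stand-in for the real storage */
	};

	static uint32_t direct_rd32(struct mem *m, uint64_t off)
	{
		return m->backing[off / 4];
	}

	static void direct_wr32(struct mem *m, uint64_t off, uint32_t data)
	{
		m->backing[off / 4] = data;
	}

	static const struct mem_ptrs direct_ptrs = { direct_rd32, direct_wr32 };

	int main(void)
	{
		struct mem m = { .ptrs = &direct_ptrs };
		m.ptrs->wr32(&m, 8, 0xcafe);           /* via the accessor table */
		printf("%08x\n", m.ptrs->rd32(&m, 8));
		return 0;
	}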
that an address is to be resolved through the IOMMU */ r->offset |= BIT(imem->iommu_bit - imem->iommu_pgshift); - node->base.mem.offset = ((u64)r->offset) << imem->iommu_pgshift; - node->base.mem.mem = r; + node->base.mn = r; return 0; release_area: @@ -523,13 +538,8 @@ gk20a_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, node->imem = imem; - /* present memory for being mapped using small pages */ - node->mem.size = size >> 12; - node->mem.memtype = 0; - node->mem.page_shift = 12; - nvkm_debug(subdev, "alloc size: 0x%x, align: 0x%x, gaddr: 0x%llx\n", - size, align, node->mem.offset); + size, align, (u64)node->mn->offset << 12); return 0; } @@ -554,7 +564,6 @@ static const struct nvkm_instmem_func gk20a_instmem = { .dtor = gk20a_instmem_dtor, .memory_new = gk20a_instobj_new, - .persistent = true, .zero = false, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv04.c index 6133c8bb2d42..6bf0dad46919 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv04.c @@ -24,7 +24,6 @@ #define nv04_instmem(p) container_of((p), struct nv04_instmem, base) #include "priv.h" -#include <core/memory.h> #include <core/ramht.h> struct nv04_instmem { @@ -35,30 +34,39 @@ struct nv04_instmem { /****************************************************************************** * instmem object implementation *****************************************************************************/ -#define nv04_instobj(p) container_of((p), struct nv04_instobj, memory) +#define nv04_instobj(p) container_of((p), struct nv04_instobj, base.memory) struct nv04_instobj { - struct nvkm_memory memory; + struct nvkm_instobj base; struct nv04_instmem *imem; struct nvkm_mm_node *node; }; -static enum nvkm_memory_target -nv04_instobj_target(struct nvkm_memory *memory) +static void +nv04_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) { - return NVKM_MEM_TARGET_INST; + struct nv04_instobj *iobj = nv04_instobj(memory); + struct nvkm_device *device = iobj->imem->base.subdev.device; + nvkm_wr32(device, 0x700000 + iobj->node->offset + offset, data); } -static u64 -nv04_instobj_addr(struct nvkm_memory *memory) +static u32 +nv04_instobj_rd32(struct nvkm_memory *memory, u64 offset) { - return nv04_instobj(memory)->node->offset; + struct nv04_instobj *iobj = nv04_instobj(memory); + struct nvkm_device *device = iobj->imem->base.subdev.device; + return nvkm_rd32(device, 0x700000 + iobj->node->offset + offset); } -static u64 -nv04_instobj_size(struct nvkm_memory *memory) +static const struct nvkm_memory_ptrs +nv04_instobj_ptrs = { + .rd32 = nv04_instobj_rd32, + .wr32 = nv04_instobj_wr32, +}; + +static void +nv04_instobj_release(struct nvkm_memory *memory) { - return nv04_instobj(memory)->node->length; } static void __iomem * @@ -69,25 +77,22 @@ nv04_instobj_acquire(struct nvkm_memory *memory) return device->pri + 0x700000 + iobj->node->offset; } -static void -nv04_instobj_release(struct nvkm_memory *memory) +static u64 +nv04_instobj_size(struct nvkm_memory *memory) { + return nv04_instobj(memory)->node->length; } -static u32 -nv04_instobj_rd32(struct nvkm_memory *memory, u64 offset) +static u64 +nv04_instobj_addr(struct nvkm_memory *memory) { - struct nv04_instobj *iobj = nv04_instobj(memory); - struct nvkm_device *device = iobj->imem->base.subdev.device; - return nvkm_rd32(device, 0x700000 + iobj->node->offset + offset); + return nv04_instobj(memory)->node->offset; } -static void 
-nv04_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) +static enum nvkm_memory_target +nv04_instobj_target(struct nvkm_memory *memory) { - struct nv04_instobj *iobj = nv04_instobj(memory); - struct nvkm_device *device = iobj->imem->base.subdev.device; - nvkm_wr32(device, 0x700000 + iobj->node->offset + offset, data); + return NVKM_MEM_TARGET_INST; } static void * @@ -97,6 +102,7 @@ nv04_instobj_dtor(struct nvkm_memory *memory) mutex_lock(&iobj->imem->base.subdev.mutex); nvkm_mm_free(&iobj->imem->heap, &iobj->node); mutex_unlock(&iobj->imem->base.subdev.mutex); + nvkm_instobj_dtor(&iobj->imem->base, &iobj->base); return iobj; } @@ -108,8 +114,6 @@ nv04_instobj_func = { .addr = nv04_instobj_addr, .acquire = nv04_instobj_acquire, .release = nv04_instobj_release, - .rd32 = nv04_instobj_rd32, - .wr32 = nv04_instobj_wr32, }; static int @@ -122,9 +126,10 @@ nv04_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, if (!(iobj = kzalloc(sizeof(*iobj), GFP_KERNEL))) return -ENOMEM; - *pmemory = &iobj->memory; + *pmemory = &iobj->base.memory; - nvkm_memory_ctor(&nv04_instobj_func, &iobj->memory); + nvkm_instobj_ctor(&nv04_instobj_func, &imem->base, &iobj->base); + iobj->base.memory.ptrs = &nv04_instobj_ptrs; iobj->imem = imem; mutex_lock(&imem->base.subdev.mutex); @@ -160,7 +165,7 @@ nv04_instmem_oneinit(struct nvkm_instmem *base) /* PRAMIN aperture maps over the end of VRAM, reserve it */ imem->base.reserved = 512 * 1024; - ret = nvkm_mm_init(&imem->heap, 0, imem->base.reserved, 1); + ret = nvkm_mm_init(&imem->heap, 0, 0, imem->base.reserved, 1); if (ret) return ret; @@ -194,10 +199,10 @@ static void * nv04_instmem_dtor(struct nvkm_instmem *base) { struct nv04_instmem *imem = nv04_instmem(base); - nvkm_memory_del(&imem->base.ramfc); - nvkm_memory_del(&imem->base.ramro); + nvkm_memory_unref(&imem->base.ramfc); + nvkm_memory_unref(&imem->base.ramro); nvkm_ramht_del(&imem->base.ramht); - nvkm_memory_del(&imem->base.vbios); + nvkm_memory_unref(&imem->base.vbios); nvkm_mm_fini(&imem->heap); return imem; } @@ -209,7 +214,6 @@ nv04_instmem = { .rd32 = nv04_instmem_rd32, .wr32 = nv04_instmem_wr32, .memory_new = nv04_instobj_new, - .persistent = false, .zero = false, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv40.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv40.c index c0543875e490..086c118488ef 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv40.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv40.c @@ -24,7 +24,6 @@ #define nv40_instmem(p) container_of((p), struct nv40_instmem, base) #include "priv.h" -#include <core/memory.h> #include <core/ramht.h> #include <engine/gr/nv40.h> @@ -37,30 +36,38 @@ struct nv40_instmem { /****************************************************************************** * instmem object implementation *****************************************************************************/ -#define nv40_instobj(p) container_of((p), struct nv40_instobj, memory) +#define nv40_instobj(p) container_of((p), struct nv40_instobj, base.memory) struct nv40_instobj { - struct nvkm_memory memory; + struct nvkm_instobj base; struct nv40_instmem *imem; struct nvkm_mm_node *node; }; -static enum nvkm_memory_target -nv40_instobj_target(struct nvkm_memory *memory) +static void +nv40_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) { - return NVKM_MEM_TARGET_INST; + struct nv40_instobj *iobj = nv40_instobj(memory); + iowrite32_native(data, iobj->imem->iomem + iobj->node->offset + offset); } -static u64 
-nv40_instobj_addr(struct nvkm_memory *memory) +static u32 +nv40_instobj_rd32(struct nvkm_memory *memory, u64 offset) { - return nv40_instobj(memory)->node->offset; + struct nv40_instobj *iobj = nv40_instobj(memory); + return ioread32_native(iobj->imem->iomem + iobj->node->offset + offset); } -static u64 -nv40_instobj_size(struct nvkm_memory *memory) +static const struct nvkm_memory_ptrs +nv40_instobj_ptrs = { + .rd32 = nv40_instobj_rd32, + .wr32 = nv40_instobj_wr32, +}; + +static void +nv40_instobj_release(struct nvkm_memory *memory) { - return nv40_instobj(memory)->node->length; + wmb(); } static void __iomem * @@ -70,23 +77,22 @@ nv40_instobj_acquire(struct nvkm_memory *memory) return iobj->imem->iomem + iobj->node->offset; } -static void -nv40_instobj_release(struct nvkm_memory *memory) +static u64 +nv40_instobj_size(struct nvkm_memory *memory) { + return nv40_instobj(memory)->node->length; } -static u32 -nv40_instobj_rd32(struct nvkm_memory *memory, u64 offset) +static u64 +nv40_instobj_addr(struct nvkm_memory *memory) { - struct nv40_instobj *iobj = nv40_instobj(memory); - return ioread32_native(iobj->imem->iomem + iobj->node->offset + offset); + return nv40_instobj(memory)->node->offset; } -static void -nv40_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) +static enum nvkm_memory_target +nv40_instobj_target(struct nvkm_memory *memory) { - struct nv40_instobj *iobj = nv40_instobj(memory); - iowrite32_native(data, iobj->imem->iomem + iobj->node->offset + offset); + return NVKM_MEM_TARGET_INST; } static void * @@ -96,6 +102,7 @@ nv40_instobj_dtor(struct nvkm_memory *memory) mutex_lock(&iobj->imem->base.subdev.mutex); nvkm_mm_free(&iobj->imem->heap, &iobj->node); mutex_unlock(&iobj->imem->base.subdev.mutex); + nvkm_instobj_dtor(&iobj->imem->base, &iobj->base); return iobj; } @@ -107,8 +114,6 @@ nv40_instobj_func = { .addr = nv40_instobj_addr, .acquire = nv40_instobj_acquire, .release = nv40_instobj_release, - .rd32 = nv40_instobj_rd32, - .wr32 = nv40_instobj_wr32, }; static int @@ -121,9 +126,10 @@ nv40_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, if (!(iobj = kzalloc(sizeof(*iobj), GFP_KERNEL))) return -ENOMEM; - *pmemory = &iobj->memory; + *pmemory = &iobj->base.memory; - nvkm_memory_ctor(&nv40_instobj_func, &iobj->memory); + nvkm_instobj_ctor(&nv40_instobj_func, &imem->base, &iobj->base); + iobj->base.memory.ptrs = &nv40_instobj_ptrs; iobj->imem = imem; mutex_lock(&imem->base.subdev.mutex); @@ -171,7 +177,7 @@ nv40_instmem_oneinit(struct nvkm_instmem *base) imem->base.reserved += 512 * 1024; /* object storage */ imem->base.reserved = round_up(imem->base.reserved, 4096); - ret = nvkm_mm_init(&imem->heap, 0, imem->base.reserved, 1); + ret = nvkm_mm_init(&imem->heap, 0, 0, imem->base.reserved, 1); if (ret) return ret; @@ -209,10 +215,10 @@ static void * nv40_instmem_dtor(struct nvkm_instmem *base) { struct nv40_instmem *imem = nv40_instmem(base); - nvkm_memory_del(&imem->base.ramfc); - nvkm_memory_del(&imem->base.ramro); + nvkm_memory_unref(&imem->base.ramfc); + nvkm_memory_unref(&imem->base.ramro); nvkm_ramht_del(&imem->base.ramht); - nvkm_memory_del(&imem->base.vbios); + nvkm_memory_unref(&imem->base.vbios); nvkm_mm_fini(&imem->heap); if (imem->iomem) iounmap(imem->iomem); @@ -226,7 +232,6 @@ nv40_instmem = { .rd32 = nv40_instmem_rd32, .wr32 = nv40_instmem_wr32, .memory_new = nv40_instobj_new, - .persistent = false, .zero = false, }; @@ -248,8 +253,8 @@ nv40_instmem_new(struct nvkm_device *device, int index, else bar = 3; - imem->iomem = 
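Two related nv40 changes meet here: ->release() gains a wmb(), and (just below) the PRAMIN BAR mapping switches from ioremap() to ioremap_wc(). Writes through a write-combined mapping may be posted and reordered, so a write barrier is needed before anything consumes the object's contents. A minimal sketch of the pairing, assuming a kernel build context:

	/* assuming kernel headers providing wmb() */
	static void instobj_release(void)
	{
		wmb(); /* order WC PRAMIN writes before the GPU sees them */
	}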
ioremap(device->func->resource_addr(device, bar), - device->func->resource_size(device, bar)); + imem->iomem = ioremap_wc(device->func->resource_addr(device, bar), + device->func->resource_size(device, bar)); if (!imem->iomem) { nvkm_error(&imem->base.subdev, "unable to map PRAMIN BAR\n"); return -EFAULT; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c index 6d512c062ae3..1ba7289684aa 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c @@ -31,147 +31,293 @@ struct nv50_instmem { struct nvkm_instmem base; - unsigned long lock_flags; - spinlock_t lock; u64 addr; + + /* Mappings that can be evicted when BAR2 space has been exhausted. */ + struct list_head lru; }; /****************************************************************************** * instmem object implementation *****************************************************************************/ -#define nv50_instobj(p) container_of((p), struct nv50_instobj, memory) +#define nv50_instobj(p) container_of((p), struct nv50_instobj, base.memory) struct nv50_instobj { - struct nvkm_memory memory; + struct nvkm_instobj base; struct nv50_instmem *imem; - struct nvkm_mem *mem; - struct nvkm_vma bar; + struct nvkm_memory *ram; + struct nvkm_vma *bar; + refcount_t maps; void *map; + struct list_head lru; }; -static enum nvkm_memory_target -nv50_instobj_target(struct nvkm_memory *memory) +static void +nv50_instobj_wr32_slow(struct nvkm_memory *memory, u64 offset, u32 data) { - return NVKM_MEM_TARGET_VRAM; + struct nv50_instobj *iobj = nv50_instobj(memory); + struct nv50_instmem *imem = iobj->imem; + struct nvkm_device *device = imem->base.subdev.device; + u64 base = (nvkm_memory_addr(iobj->ram) + offset) & 0xffffff00000ULL; + u64 addr = (nvkm_memory_addr(iobj->ram) + offset) & 0x000000fffffULL; + unsigned long flags; + + spin_lock_irqsave(&imem->base.lock, flags); + if (unlikely(imem->addr != base)) { + nvkm_wr32(device, 0x001700, base >> 16); + imem->addr = base; + } + nvkm_wr32(device, 0x700000 + addr, data); + spin_unlock_irqrestore(&imem->base.lock, flags); } -static u64 -nv50_instobj_addr(struct nvkm_memory *memory) +static u32 +nv50_instobj_rd32_slow(struct nvkm_memory *memory, u64 offset) { - return nv50_instobj(memory)->mem->offset; + struct nv50_instobj *iobj = nv50_instobj(memory); + struct nv50_instmem *imem = iobj->imem; + struct nvkm_device *device = imem->base.subdev.device; + u64 base = (nvkm_memory_addr(iobj->ram) + offset) & 0xffffff00000ULL; + u64 addr = (nvkm_memory_addr(iobj->ram) + offset) & 0x000000fffffULL; + u32 data; + unsigned long flags; + + spin_lock_irqsave(&imem->base.lock, flags); + if (unlikely(imem->addr != base)) { + nvkm_wr32(device, 0x001700, base >> 16); + imem->addr = base; + } + data = nvkm_rd32(device, 0x700000 + addr); + spin_unlock_irqrestore(&imem->base.lock, flags); + return data; } -static u64 -nv50_instobj_size(struct nvkm_memory *memory) +static const struct nvkm_memory_ptrs +nv50_instobj_slow = { + .rd32 = nv50_instobj_rd32_slow, + .wr32 = nv50_instobj_wr32_slow, +}; + +static void +nv50_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) { - return (u64)nv50_instobj(memory)->mem->size << NVKM_RAM_MM_SHIFT; + iowrite32_native(data, nv50_instobj(memory)->map + offset); } +static u32 +nv50_instobj_rd32(struct nvkm_memory *memory, u64 offset) +{ + return ioread32_native(nv50_instobj(memory)->map + offset); +} + +static const struct nvkm_memory_ptrs 
+nv50_instobj_fast = { + .rd32 = nv50_instobj_rd32, + .wr32 = nv50_instobj_wr32, +}; + static void -nv50_instobj_boot(struct nvkm_memory *memory, struct nvkm_vm *vm) +nv50_instobj_kmap(struct nv50_instobj *iobj, struct nvkm_vmm *vmm) { - struct nv50_instobj *iobj = nv50_instobj(memory); - struct nvkm_subdev *subdev = &iobj->imem->base.subdev; + struct nv50_instmem *imem = iobj->imem; + struct nv50_instobj *eobj; + struct nvkm_memory *memory = &iobj->base.memory; + struct nvkm_subdev *subdev = &imem->base.subdev; struct nvkm_device *device = subdev->device; + struct nvkm_vma *bar = NULL, *ebar; u64 size = nvkm_memory_size(memory); - void __iomem *map; + void *emap; int ret; - iobj->map = ERR_PTR(-ENOMEM); - - ret = nvkm_vm_get(vm, size, 12, NV_MEM_ACCESS_RW, &iobj->bar); - if (ret == 0) { - map = ioremap(device->func->resource_addr(device, 3) + - (u32)iobj->bar.offset, size); - if (map) { - nvkm_memory_map(memory, &iobj->bar, 0); - iobj->map = map; - } else { - nvkm_warn(subdev, "PRAMIN ioremap failed\n"); - nvkm_vm_put(&iobj->bar); + /* Attempt to allocate BAR2 address-space and map the object + * into it. The lock has to be dropped while doing this due + * to the possibility of recursion for page table allocation. + */ + mutex_unlock(&subdev->mutex); + while ((ret = nvkm_vmm_get(vmm, 12, size, &bar))) { + /* Evict unused mappings, and keep retrying until we either + * succeed, or there are no more objects left on the LRU. + */ + mutex_lock(&subdev->mutex); + eobj = list_first_entry_or_null(&imem->lru, typeof(*eobj), lru); + if (eobj) { + nvkm_debug(subdev, "evict %016llx %016llx @ %016llx\n", + nvkm_memory_addr(&eobj->base.memory), + nvkm_memory_size(&eobj->base.memory), + eobj->bar->addr); + list_del_init(&eobj->lru); + ebar = eobj->bar; + eobj->bar = NULL; + emap = eobj->map; + eobj->map = NULL; } - } else { - nvkm_warn(subdev, "PRAMIN exhausted\n"); + mutex_unlock(&subdev->mutex); + if (!eobj) + break; + iounmap(emap); + nvkm_vmm_put(vmm, &ebar); } + + if (ret == 0) + ret = nvkm_memory_map(memory, 0, vmm, bar, NULL, 0); + mutex_lock(&subdev->mutex); + if (ret || iobj->bar) { + /* We either failed, or another thread beat us. */ + mutex_unlock(&subdev->mutex); + nvkm_vmm_put(vmm, &bar); + mutex_lock(&subdev->mutex); + return; + } + + /* Make the mapping visible to the host. */ + iobj->bar = bar; + iobj->map = ioremap_wc(device->func->resource_addr(device, 3) + + (u32)iobj->bar->addr, size); + if (!iobj->map) { + nvkm_warn(subdev, "PRAMIN ioremap failed\n"); + nvkm_vmm_put(vmm, &iobj->bar); + } +} + +static int +nv50_instobj_map(struct nvkm_memory *memory, u64 offset, struct nvkm_vmm *vmm, + struct nvkm_vma *vma, void *argv, u32 argc) +{ + memory = nv50_instobj(memory)->ram; + return nvkm_memory_map(memory, offset, vmm, vma, argv, argc); } static void nv50_instobj_release(struct nvkm_memory *memory) { - struct nv50_instmem *imem = nv50_instobj(memory)->imem; - spin_unlock_irqrestore(&imem->lock, imem->lock_flags); + struct nv50_instobj *iobj = nv50_instobj(memory); + struct nv50_instmem *imem = iobj->imem; + struct nvkm_subdev *subdev = &imem->base.subdev; + + wmb(); + nvkm_bar_flush(subdev->device->bar); + + if (refcount_dec_and_mutex_lock(&iobj->maps, &subdev->mutex)) { + /* Add the now-unused mapping to the LRU instead of directly + * unmapping it here, in case we need to map it again later. + */ + if (likely(iobj->lru.next) && iobj->map) { + BUG_ON(!list_empty(&iobj->lru)); + list_add_tail(&iobj->lru, &imem->lru); + } + + /* Switch back to NULL accessors when last map is gone. 
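+ * Any access after this point has to come back through + * acquire(), which re-installs either the fast (BAR2) or the + * slow (PRAMIN window) accessors for the new mapping state.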
*/ + iobj->base.memory.ptrs = NULL; + mutex_unlock(&subdev->mutex); + } } static void __iomem * nv50_instobj_acquire(struct nvkm_memory *memory) { struct nv50_instobj *iobj = nv50_instobj(memory); - struct nv50_instmem *imem = iobj->imem; - struct nvkm_bar *bar = imem->base.subdev.device->bar; - struct nvkm_vm *vm; - unsigned long flags; + struct nvkm_instmem *imem = &iobj->imem->base; + struct nvkm_vmm *vmm; + void __iomem *map = NULL; - if (!iobj->map && (vm = nvkm_bar_kmap(bar))) - nvkm_memory_boot(memory, vm); - if (!IS_ERR_OR_NULL(iobj->map)) + /* Already mapped? */ + if (refcount_inc_not_zero(&iobj->maps)) return iobj->map; - spin_lock_irqsave(&imem->lock, flags); - imem->lock_flags = flags; - return NULL; -} + /* Take the lock, and re-check that another thread hasn't + * already mapped the object in the meantime. + */ + mutex_lock(&imem->subdev.mutex); + if (refcount_inc_not_zero(&iobj->maps)) { + mutex_unlock(&imem->subdev.mutex); + return iobj->map; + } -static u32 -nv50_instobj_rd32(struct nvkm_memory *memory, u64 offset) -{ - struct nv50_instobj *iobj = nv50_instobj(memory); - struct nv50_instmem *imem = iobj->imem; - struct nvkm_device *device = imem->base.subdev.device; - u64 base = (iobj->mem->offset + offset) & 0xffffff00000ULL; - u64 addr = (iobj->mem->offset + offset) & 0x000000fffffULL; - u32 data; + /* Attempt to get a direct CPU mapping of the object. */ + if ((vmm = nvkm_bar_bar2_vmm(imem->subdev.device))) { + if (!iobj->map) + nv50_instobj_kmap(iobj, vmm); + map = iobj->map; + } - if (unlikely(imem->addr != base)) { - nvkm_wr32(device, 0x001700, base >> 16); - imem->addr = base; + if (!refcount_inc_not_zero(&iobj->maps)) { + /* Exclude object from eviction while it's being accessed. */ + if (likely(iobj->lru.next)) + list_del_init(&iobj->lru); + + if (map) + iobj->base.memory.ptrs = &nv50_instobj_fast; + else + iobj->base.memory.ptrs = &nv50_instobj_slow; + refcount_inc(&iobj->maps); } - data = nvkm_rd32(device, 0x700000 + addr); - return data; + + mutex_unlock(&imem->subdev.mutex); + return map; } static void -nv50_instobj_wr32(struct nvkm_memory *memory, u64 offset, u32 data) +nv50_instobj_boot(struct nvkm_memory *memory, struct nvkm_vmm *vmm) { struct nv50_instobj *iobj = nv50_instobj(memory); - struct nv50_instmem *imem = iobj->imem; - struct nvkm_device *device = imem->base.subdev.device; - u64 base = (iobj->mem->offset + offset) & 0xffffff00000ULL; - u64 addr = (iobj->mem->offset + offset) & 0x000000fffffULL; - - if (unlikely(imem->addr != base)) { - nvkm_wr32(device, 0x001700, base >> 16); - imem->addr = base; + struct nvkm_instmem *imem = &iobj->imem->base; + + /* Exclude bootstrapped objects (ie. the page tables for the + * instmem BAR itself) from eviction. 
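+ * Clearing lru.next below acts as a sentinel: acquire(), + * release() and the destructor all test it before touching + * the LRU, so a booted object can never be picked for eviction.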
+ */ + mutex_lock(&imem->subdev.mutex); + if (likely(iobj->lru.next)) { + list_del_init(&iobj->lru); + iobj->lru.next = NULL; } - nvkm_wr32(device, 0x700000 + addr, data); + + nv50_instobj_kmap(iobj, vmm); + nvkm_instmem_boot(imem); + mutex_unlock(&imem->subdev.mutex); } -static void -nv50_instobj_map(struct nvkm_memory *memory, struct nvkm_vma *vma, u64 offset) +static u64 +nv50_instobj_size(struct nvkm_memory *memory) { - struct nv50_instobj *iobj = nv50_instobj(memory); - nvkm_vm_map_at(vma, offset, iobj->mem); + return nvkm_memory_size(nv50_instobj(memory)->ram); +} + +static u64 +nv50_instobj_addr(struct nvkm_memory *memory) +{ + return nvkm_memory_addr(nv50_instobj(memory)->ram); +} + +static enum nvkm_memory_target +nv50_instobj_target(struct nvkm_memory *memory) +{ + return nvkm_memory_target(nv50_instobj(memory)->ram); } static void * nv50_instobj_dtor(struct nvkm_memory *memory) { struct nv50_instobj *iobj = nv50_instobj(memory); - struct nvkm_ram *ram = iobj->imem->base.subdev.device->fb->ram; - if (!IS_ERR_OR_NULL(iobj->map)) { - nvkm_vm_put(&iobj->bar); - iounmap(iobj->map); + struct nvkm_instmem *imem = &iobj->imem->base; + struct nvkm_vma *bar; + void *map = map; + + mutex_lock(&imem->subdev.mutex); + if (likely(iobj->lru.next)) + list_del(&iobj->lru); + map = iobj->map; + bar = iobj->bar; + mutex_unlock(&imem->subdev.mutex); + + if (map) { + struct nvkm_vmm *vmm = nvkm_bar_bar2_vmm(imem->subdev.device); + iounmap(map); + if (likely(vmm)) /* Can be NULL during BAR destructor. */ + nvkm_vmm_put(vmm, &bar); } - ram->func->put(ram, &iobj->mem); + + nvkm_memory_unref(&iobj->ram); + nvkm_instobj_dtor(imem, &iobj->base); return iobj; } @@ -184,8 +330,6 @@ nv50_instobj_func = { .boot = nv50_instobj_boot, .acquire = nv50_instobj_acquire, .release = nv50_instobj_release, - .rd32 = nv50_instobj_rd32, - .wr32 = nv50_instobj_wr32, .map = nv50_instobj_map, }; @@ -195,25 +339,19 @@ nv50_instobj_new(struct nvkm_instmem *base, u32 size, u32 align, bool zero, { struct nv50_instmem *imem = nv50_instmem(base); struct nv50_instobj *iobj; - struct nvkm_ram *ram = imem->base.subdev.device->fb->ram; - int ret; + struct nvkm_device *device = imem->base.subdev.device; + u8 page = max(order_base_2(align), 12); if (!(iobj = kzalloc(sizeof(*iobj), GFP_KERNEL))) return -ENOMEM; - *pmemory = &iobj->memory; + *pmemory = &iobj->base.memory; - nvkm_memory_ctor(&nv50_instobj_func, &iobj->memory); + nvkm_instobj_ctor(&nv50_instobj_func, &imem->base, &iobj->base); iobj->imem = imem; + refcount_set(&iobj->maps, 0); + INIT_LIST_HEAD(&iobj->lru); - size = max((size + 4095) & ~4095, (u32)4096); - align = max((align + 4095) & ~4095, (u32)4096); - - ret = ram->func->get(ram, size, align, 0, 0x800, &iobj->mem); - if (ret) - return ret; - - iobj->mem->page_shift = 12; - return 0; + return nvkm_ram_get(device, 0, 1, page, size, true, true, &iobj->ram); } /****************************************************************************** @@ -230,7 +368,6 @@ static const struct nvkm_instmem_func nv50_instmem = { .fini = nv50_instmem_fini, .memory_new = nv50_instobj_new, - .persistent = false, .zero = false, }; @@ -243,7 +380,7 @@ nv50_instmem_new(struct nvkm_device *device, int index, if (!(imem = kzalloc(sizeof(*imem), GFP_KERNEL))) return -ENOMEM; nvkm_instmem_ctor(&nv50_instmem, device, index, &imem->base); - spin_lock_init(&imem->lock); + INIT_LIST_HEAD(&imem->lru); *pimem = &imem->base; return 0; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/priv.h index 
ace4471864a3..44651ca42d52 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/priv.h @@ -11,10 +11,22 @@ struct nvkm_instmem_func { void (*wr32)(struct nvkm_instmem *, u32 addr, u32 data); int (*memory_new)(struct nvkm_instmem *, u32 size, u32 align, bool zero, struct nvkm_memory **); - bool persistent; bool zero; }; void nvkm_instmem_ctor(const struct nvkm_instmem_func *, struct nvkm_device *, int index, struct nvkm_instmem *); +void nvkm_instmem_boot(struct nvkm_instmem *); + +#include <core/memory.h> + +struct nvkm_instobj { + struct nvkm_memory memory; + struct list_head head; + u32 *suspend; +}; + +void nvkm_instobj_ctor(const struct nvkm_memory_func *func, + struct nvkm_instmem *, struct nvkm_instobj *); +void nvkm_instobj_dtor(struct nvkm_instmem *, struct nvkm_instobj *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/base.c index 0c7ef250dcaf..1f185274d3e6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/base.c @@ -23,26 +23,12 @@ */ #include "priv.h" -#include <subdev/fb.h> - -int -nvkm_ltc_tags_alloc(struct nvkm_ltc *ltc, u32 n, struct nvkm_mm_node **pnode) -{ - int ret = nvkm_mm_head(<c->tags, 0, 1, n, n, 1, pnode); - if (ret) - *pnode = NULL; - return ret; -} - -void -nvkm_ltc_tags_free(struct nvkm_ltc *ltc, struct nvkm_mm_node **pnode) -{ - nvkm_mm_free(<c->tags, pnode); -} +#include <core/memory.h> void -nvkm_ltc_tags_clear(struct nvkm_ltc *ltc, u32 first, u32 count) +nvkm_ltc_tags_clear(struct nvkm_device *device, u32 first, u32 count) { + struct nvkm_ltc *ltc = device->ltc; const u32 limit = first + count - 1; BUG_ON((first > limit) || (limit >= ltc->num_tags)); @@ -116,10 +102,7 @@ static void * nvkm_ltc_dtor(struct nvkm_subdev *subdev) { struct nvkm_ltc *ltc = nvkm_ltc(subdev); - struct nvkm_ram *ram = ltc->subdev.device->fb->ram; - nvkm_mm_fini(<c->tags); - if (ram) - nvkm_mm_free(&ram->vram, <c->tag_ram); + nvkm_memory_unref(<c->tag_ram); return ltc; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gf100.c index 4a0fa0a9b802..a21ef45b8572 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gf100.c @@ -23,6 +23,7 @@ */ #include "priv.h" +#include <core/memory.h> #include <subdev/fb.h> #include <subdev/timer.h> @@ -152,7 +153,10 @@ gf100_ltc_flush(struct nvkm_ltc *ltc) int gf100_ltc_oneinit_tag_ram(struct nvkm_ltc *ltc) { - struct nvkm_ram *ram = ltc->subdev.device->fb->ram; + struct nvkm_device *device = ltc->subdev.device; + struct nvkm_fb *fb = device->fb; + struct nvkm_ram *ram = fb->ram; + u32 bits = (nvkm_rd32(device, 0x100c80) & 0x00001000) ? 
16 : 17; u32 tag_size, tag_margin, tag_align; int ret; @@ -164,8 +168,8 @@ gf100_ltc_oneinit_tag_ram(struct nvkm_ltc *ltc) /* tags for 1/4 of VRAM should be enough (8192/4 per GiB of VRAM) */ ltc->num_tags = (ram->size >> 17) / 4; - if (ltc->num_tags > (1 << 17)) - ltc->num_tags = 1 << 17; /* we have 17 bits in PTE */ + if (ltc->num_tags > (1 << bits)) + ltc->num_tags = 1 << bits; /* we have 16/17 bits in PTE */ ltc->num_tags = (ltc->num_tags + 63) & ~63; /* round up to 64 */ tag_align = ltc->ltc_nr * 0x800; @@ -181,14 +185,13 @@ gf100_ltc_oneinit_tag_ram(struct nvkm_ltc *ltc) */ tag_size = (ltc->num_tags / 64) * 0x6000 + tag_margin; tag_size += tag_align; - tag_size = (tag_size + 0xfff) >> 12; /* round up */ - ret = nvkm_mm_tail(&ram->vram, 1, 1, tag_size, tag_size, 1, - <c->tag_ram); + ret = nvkm_ram_get(device, NVKM_RAM_MM_NORMAL, 0x01, 12, tag_size, + true, true, <c->tag_ram); if (ret) { ltc->num_tags = 0; } else { - u64 tag_base = ((u64)ltc->tag_ram->offset << 12) + tag_margin; + u64 tag_base = nvkm_memory_addr(ltc->tag_ram) + tag_margin; tag_base += tag_align - 1; do_div(tag_base, tag_align); @@ -197,7 +200,8 @@ gf100_ltc_oneinit_tag_ram(struct nvkm_ltc *ltc) } mm_init: - return nvkm_mm_init(<c->tags, 0, ltc->num_tags, 1); + nvkm_mm_fini(&fb->tags); + return nvkm_mm_init(&fb->tags, 0, 0, ltc->num_tags, 1); } int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp100.c index 0bdfb2f40266..e34d42108019 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ltc/gp100.c @@ -45,7 +45,7 @@ gp100_ltc_oneinit(struct nvkm_ltc *ltc) ltc->ltc_nr = nvkm_rd32(device, 0x12006c); ltc->lts_nr = nvkm_rd32(device, 0x17e280) >> 28; /*XXX: tagram allocation - TBD */ - return nvkm_mm_init(<c->tags, 0, 0, 1); + return 0; } static void diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild index 012c9db687b2..352a65f9371c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/Kbuild @@ -3,4 +3,33 @@ nvkm-y += nvkm/subdev/mmu/nv04.o nvkm-y += nvkm/subdev/mmu/nv41.o nvkm-y += nvkm/subdev/mmu/nv44.o nvkm-y += nvkm/subdev/mmu/nv50.o +nvkm-y += nvkm/subdev/mmu/g84.o nvkm-y += nvkm/subdev/mmu/gf100.o +nvkm-y += nvkm/subdev/mmu/gk104.o +nvkm-y += nvkm/subdev/mmu/gk20a.o +nvkm-y += nvkm/subdev/mmu/gm200.o +nvkm-y += nvkm/subdev/mmu/gm20b.o +nvkm-y += nvkm/subdev/mmu/gp100.o +nvkm-y += nvkm/subdev/mmu/gp10b.o + +nvkm-y += nvkm/subdev/mmu/mem.o +nvkm-y += nvkm/subdev/mmu/memnv04.o +nvkm-y += nvkm/subdev/mmu/memnv50.o +nvkm-y += nvkm/subdev/mmu/memgf100.o + +nvkm-y += nvkm/subdev/mmu/vmm.o +nvkm-y += nvkm/subdev/mmu/vmmnv04.o +nvkm-y += nvkm/subdev/mmu/vmmnv41.o +nvkm-y += nvkm/subdev/mmu/vmmnv44.o +nvkm-y += nvkm/subdev/mmu/vmmnv50.o +nvkm-y += nvkm/subdev/mmu/vmmgf100.o +nvkm-y += nvkm/subdev/mmu/vmmgk104.o +nvkm-y += nvkm/subdev/mmu/vmmgk20a.o +nvkm-y += nvkm/subdev/mmu/vmmgm200.o +nvkm-y += nvkm/subdev/mmu/vmmgm20b.o +nvkm-y += nvkm/subdev/mmu/vmmgp100.o +nvkm-y += nvkm/subdev/mmu/vmmgp10b.o + +nvkm-y += nvkm/subdev/mmu/umem.o +nvkm-y += nvkm/subdev/mmu/ummu.o +nvkm-y += nvkm/subdev/mmu/uvmm.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c index d06ad2c372bf..ee11ccaf0563 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c @@ -21,478 +21,367 @@ * * Authors: Ben Skeggs */ -#include "priv.h" 
+#include "ummu.h" +#include "vmm.h" -#include <core/gpuobj.h> +#include <subdev/bar.h> #include <subdev/fb.h> -void -nvkm_vm_map_at(struct nvkm_vma *vma, u64 delta, struct nvkm_mem *node) -{ - struct nvkm_vm *vm = vma->vm; - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_mm_node *r = node->mem; - int big = vma->node->type != mmu->func->spg_shift; - u32 offset = vma->node->offset + (delta >> 12); - u32 bits = vma->node->type - 12; - u32 pde = (offset >> mmu->func->pgt_bits) - vm->fpde; - u32 pte = (offset & ((1 << mmu->func->pgt_bits) - 1)) >> bits; - u32 max = 1 << (mmu->func->pgt_bits - bits); - u32 end, len; - - delta = 0; - while (r) { - u64 phys = (u64)r->offset << 12; - u32 num = r->length >> bits; - - while (num) { - struct nvkm_memory *pgt = vm->pgt[pde].mem[big]; - - end = (pte + num); - if (unlikely(end >= max)) - end = max; - len = end - pte; - - mmu->func->map(vma, pgt, node, pte, len, phys, delta); - - num -= len; - pte += len; - if (unlikely(end >= max)) { - phys += len << (bits + 12); - pde++; - pte = 0; - } - - delta += (u64)len << vma->node->type; - } - r = r->next; - }; +#include <nvif/if500d.h> +#include <nvif/if900d.h> - mmu->func->flush(vm); -} +struct nvkm_mmu_ptp { + struct nvkm_mmu_pt *pt; + struct list_head head; + u8 shift; + u16 mask; + u16 free; +}; static void -nvkm_vm_map_sg_table(struct nvkm_vma *vma, u64 delta, u64 length, - struct nvkm_mem *mem) +nvkm_mmu_ptp_put(struct nvkm_mmu *mmu, bool force, struct nvkm_mmu_pt *pt) { - struct nvkm_vm *vm = vma->vm; - struct nvkm_mmu *mmu = vm->mmu; - int big = vma->node->type != mmu->func->spg_shift; - u32 offset = vma->node->offset + (delta >> 12); - u32 bits = vma->node->type - 12; - u32 num = length >> vma->node->type; - u32 pde = (offset >> mmu->func->pgt_bits) - vm->fpde; - u32 pte = (offset & ((1 << mmu->func->pgt_bits) - 1)) >> bits; - u32 max = 1 << (mmu->func->pgt_bits - bits); - unsigned m, sglen; - u32 end, len; - int i; - struct scatterlist *sg; - - for_each_sg(mem->sg->sgl, sg, mem->sg->nents, i) { - struct nvkm_memory *pgt = vm->pgt[pde].mem[big]; - sglen = sg_dma_len(sg) >> PAGE_SHIFT; - - end = pte + sglen; - if (unlikely(end >= max)) - end = max; - len = end - pte; - - for (m = 0; m < len; m++) { - dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT); - - mmu->func->map_sg(vma, pgt, mem, pte, 1, &addr); - num--; - pte++; - - if (num == 0) - goto finish; - } - if (unlikely(end >= max)) { - pde++; - pte = 0; - } - if (m < sglen) { - for (; m < sglen; m++) { - dma_addr_t addr = sg_dma_address(sg) + (m << PAGE_SHIFT); - - mmu->func->map_sg(vma, pgt, mem, pte, 1, &addr); - num--; - pte++; - if (num == 0) - goto finish; - } - } - + const int slot = pt->base >> pt->ptp->shift; + struct nvkm_mmu_ptp *ptp = pt->ptp; + + /* If there were no free slots in the parent allocation before, + * there will be now, so return PTP to the cache. + */ + if (!ptp->free) + list_add(&ptp->head, &mmu->ptp.list); + ptp->free |= BIT(slot); + + /* If there's no more sub-allocations, destroy PTP. 
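+ * Once every slot's bit is set again (free == mask) the parent + * page has no users left, so it goes back to the page-table + * cache and the tracking structure is freed.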
*/ + if (ptp->free == ptp->mask) { + nvkm_mmu_ptc_put(mmu, force, &ptp->pt); + list_del(&ptp->head); + kfree(ptp); } -finish: - mmu->func->flush(vm); + + kfree(pt); } -static void -nvkm_vm_map_sg(struct nvkm_vma *vma, u64 delta, u64 length, - struct nvkm_mem *mem) +struct nvkm_mmu_pt * +nvkm_mmu_ptp_get(struct nvkm_mmu *mmu, u32 size, bool zero) { - struct nvkm_vm *vm = vma->vm; - struct nvkm_mmu *mmu = vm->mmu; - dma_addr_t *list = mem->pages; - int big = vma->node->type != mmu->func->spg_shift; - u32 offset = vma->node->offset + (delta >> 12); - u32 bits = vma->node->type - 12; - u32 num = length >> vma->node->type; - u32 pde = (offset >> mmu->func->pgt_bits) - vm->fpde; - u32 pte = (offset & ((1 << mmu->func->pgt_bits) - 1)) >> bits; - u32 max = 1 << (mmu->func->pgt_bits - bits); - u32 end, len; - - while (num) { - struct nvkm_memory *pgt = vm->pgt[pde].mem[big]; - - end = (pte + num); - if (unlikely(end >= max)) - end = max; - len = end - pte; - - mmu->func->map_sg(vma, pgt, mem, pte, len, list); - - num -= len; - pte += len; - list += len; - if (unlikely(end >= max)) { - pde++; - pte = 0; + struct nvkm_mmu_pt *pt; + struct nvkm_mmu_ptp *ptp; + int slot; + + if (!(pt = kzalloc(sizeof(*pt), GFP_KERNEL))) + return NULL; + + ptp = list_first_entry_or_null(&mmu->ptp.list, typeof(*ptp), head); + if (!ptp) { + /* Need to allocate a new parent to sub-allocate from. */ + if (!(ptp = kmalloc(sizeof(*ptp), GFP_KERNEL))) { + kfree(pt); + return NULL; } - } - mmu->func->flush(vm); -} + ptp->pt = nvkm_mmu_ptc_get(mmu, 0x1000, 0x1000, false); + if (!ptp->pt) { + kfree(ptp); + kfree(pt); + return NULL; + } -void -nvkm_vm_map(struct nvkm_vma *vma, struct nvkm_mem *node) -{ - if (node->sg) - nvkm_vm_map_sg_table(vma, 0, node->size << 12, node); - else - if (node->pages) - nvkm_vm_map_sg(vma, 0, node->size << 12, node); - else - nvkm_vm_map_at(vma, 0, node); + ptp->shift = order_base_2(size); + slot = nvkm_memory_size(ptp->pt->memory) >> ptp->shift; + ptp->mask = (1 << slot) - 1; + ptp->free = ptp->mask; + list_add(&ptp->head, &mmu->ptp.list); + } + pt->ptp = ptp; + pt->sub = true; + + /* Sub-allocate from parent object, removing PTP from cache + * if there's no more free slots left. 
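+ * Free slots are tracked as a bitmask, so the lowest available + * slot is simply the first set bit, and its byte offset within + * the parent page is the slot number shifted by the PTP's shift.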
+ */ + slot = __ffs(ptp->free); + ptp->free &= ~BIT(slot); + if (!ptp->free) + list_del(&ptp->head); + + pt->memory = pt->ptp->pt->memory; + pt->base = slot << ptp->shift; + pt->addr = pt->ptp->pt->addr + pt->base; + return pt; } -void -nvkm_vm_unmap_at(struct nvkm_vma *vma, u64 delta, u64 length) +struct nvkm_mmu_ptc { + struct list_head head; + struct list_head item; + u32 size; + u32 refs; +}; + +static inline struct nvkm_mmu_ptc * +nvkm_mmu_ptc_find(struct nvkm_mmu *mmu, u32 size) { - struct nvkm_vm *vm = vma->vm; - struct nvkm_mmu *mmu = vm->mmu; - int big = vma->node->type != mmu->func->spg_shift; - u32 offset = vma->node->offset + (delta >> 12); - u32 bits = vma->node->type - 12; - u32 num = length >> vma->node->type; - u32 pde = (offset >> mmu->func->pgt_bits) - vm->fpde; - u32 pte = (offset & ((1 << mmu->func->pgt_bits) - 1)) >> bits; - u32 max = 1 << (mmu->func->pgt_bits - bits); - u32 end, len; - - while (num) { - struct nvkm_memory *pgt = vm->pgt[pde].mem[big]; - - end = (pte + num); - if (unlikely(end >= max)) - end = max; - len = end - pte; - - mmu->func->unmap(vma, pgt, pte, len); - - num -= len; - pte += len; - if (unlikely(end >= max)) { - pde++; - pte = 0; - } + struct nvkm_mmu_ptc *ptc; + + list_for_each_entry(ptc, &mmu->ptc.list, head) { + if (ptc->size == size) + return ptc; } - mmu->func->flush(vm); -} + ptc = kmalloc(sizeof(*ptc), GFP_KERNEL); + if (ptc) { + INIT_LIST_HEAD(&ptc->item); + ptc->size = size; + ptc->refs = 0; + list_add(&ptc->head, &mmu->ptc.list); + } -void -nvkm_vm_unmap(struct nvkm_vma *vma) -{ - nvkm_vm_unmap_at(vma, 0, (u64)vma->node->length << 12); + return ptc; } -static void -nvkm_vm_unmap_pgt(struct nvkm_vm *vm, int big, u32 fpde, u32 lpde) +void +nvkm_mmu_ptc_put(struct nvkm_mmu *mmu, bool force, struct nvkm_mmu_pt **ppt) { - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_vm_pgd *vpgd; - struct nvkm_vm_pgt *vpgt; - struct nvkm_memory *pgt; - u32 pde; - - for (pde = fpde; pde <= lpde; pde++) { - vpgt = &vm->pgt[pde - vm->fpde]; - if (--vpgt->refcount[big]) - continue; - - pgt = vpgt->mem[big]; - vpgt->mem[big] = NULL; - - list_for_each_entry(vpgd, &vm->pgd_list, head) { - mmu->func->map_pgt(vpgd->obj, pde, vpgt->mem); + struct nvkm_mmu_pt *pt = *ppt; + if (pt) { + /* Handle sub-allocated page tables. */ + if (pt->sub) { + mutex_lock(&mmu->ptp.mutex); + nvkm_mmu_ptp_put(mmu, force, pt); + mutex_unlock(&mmu->ptp.mutex); + return; } - nvkm_memory_del(&pgt); + /* Either cache or free the object. */ + mutex_lock(&mmu->ptc.mutex); + if (pt->ptc->refs < 8 /* Heuristic. */ && !force) { + list_add_tail(&pt->head, &pt->ptc->item); + pt->ptc->refs++; + } else { + nvkm_memory_unref(&pt->memory); + kfree(pt); + } + mutex_unlock(&mmu->ptc.mutex); } } -static int -nvkm_vm_map_pgt(struct nvkm_vm *vm, u32 pde, u32 type) +struct nvkm_mmu_pt * +nvkm_mmu_ptc_get(struct nvkm_mmu *mmu, u32 size, u32 align, bool zero) { - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_vm_pgt *vpgt = &vm->pgt[pde - vm->fpde]; - struct nvkm_vm_pgd *vpgd; - int big = (type != mmu->func->spg_shift); - u32 pgt_size; + struct nvkm_mmu_ptc *ptc; + struct nvkm_mmu_pt *pt; int ret; - pgt_size = (1 << (mmu->func->pgt_bits + 12)) >> type; - pgt_size *= 8; - - ret = nvkm_memory_new(mmu->subdev.device, NVKM_MEM_TARGET_INST, - pgt_size, 0x1000, true, &vpgt->mem[big]); - if (unlikely(ret)) - return ret; - - list_for_each_entry(vpgd, &vm->pgd_list, head) { - mmu->func->map_pgt(vpgd->obj, pde, vpgt->mem); + /* Sub-allocated page table (ie. GP100 LPT). 
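+ * Requests aligned below the minimum 0x1000-byte allocation + * are carved out of a shared parent page by the PTP allocator + * above, rather than each consuming a full allocation.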
*/ + if (align < 0x1000) { + mutex_lock(&mmu->ptp.mutex); + pt = nvkm_mmu_ptp_get(mmu, align, zero); + mutex_unlock(&mmu->ptp.mutex); + return pt; } - vpgt->refcount[big]++; - return 0; -} - -int -nvkm_vm_get(struct nvkm_vm *vm, u64 size, u32 page_shift, u32 access, - struct nvkm_vma *vma) -{ - struct nvkm_mmu *mmu = vm->mmu; - u32 align = (1 << page_shift) >> 12; - u32 msize = size >> 12; - u32 fpde, lpde, pde; - int ret; - - mutex_lock(&vm->mutex); - ret = nvkm_mm_head(&vm->mm, 0, page_shift, msize, msize, align, - &vma->node); - if (unlikely(ret != 0)) { - mutex_unlock(&vm->mutex); - return ret; + /* Lookup cache for this page table size. */ + mutex_lock(&mmu->ptc.mutex); + ptc = nvkm_mmu_ptc_find(mmu, size); + if (!ptc) { + mutex_unlock(&mmu->ptc.mutex); + return NULL; } - fpde = (vma->node->offset >> mmu->func->pgt_bits); - lpde = (vma->node->offset + vma->node->length - 1) >> mmu->func->pgt_bits; - - for (pde = fpde; pde <= lpde; pde++) { - struct nvkm_vm_pgt *vpgt = &vm->pgt[pde - vm->fpde]; - int big = (vma->node->type != mmu->func->spg_shift); + /* If there's a free PT in the cache, reuse it. */ + pt = list_first_entry_or_null(&ptc->item, typeof(*pt), head); + if (pt) { + if (zero) + nvkm_fo64(pt->memory, 0, 0, size >> 3); + list_del(&pt->head); + ptc->refs--; + mutex_unlock(&mmu->ptc.mutex); + return pt; + } + mutex_unlock(&mmu->ptc.mutex); - if (likely(vpgt->refcount[big])) { - vpgt->refcount[big]++; - continue; - } + /* No such luck, we need to allocate. */ + if (!(pt = kmalloc(sizeof(*pt), GFP_KERNEL))) + return NULL; + pt->ptc = ptc; + pt->sub = false; - ret = nvkm_vm_map_pgt(vm, pde, vma->node->type); - if (ret) { - if (pde != fpde) - nvkm_vm_unmap_pgt(vm, big, fpde, pde - 1); - nvkm_mm_free(&vm->mm, &vma->node); - mutex_unlock(&vm->mutex); - return ret; - } + ret = nvkm_memory_new(mmu->subdev.device, NVKM_MEM_TARGET_INST, + size, align, zero, &pt->memory); + if (ret) { + kfree(pt); + return NULL; } - mutex_unlock(&vm->mutex); - vma->vm = NULL; - nvkm_vm_ref(vm, &vma->vm, NULL); - vma->offset = (u64)vma->node->offset << 12; - vma->access = access; - return 0; + pt->base = 0; + pt->addr = nvkm_memory_addr(pt->memory); + return pt; } void -nvkm_vm_put(struct nvkm_vma *vma) +nvkm_mmu_ptc_dump(struct nvkm_mmu *mmu) { - struct nvkm_mmu *mmu; - struct nvkm_vm *vm; - u32 fpde, lpde; - - if (unlikely(vma->node == NULL)) - return; - vm = vma->vm; - mmu = vm->mmu; - - fpde = (vma->node->offset >> mmu->func->pgt_bits); - lpde = (vma->node->offset + vma->node->length - 1) >> mmu->func->pgt_bits; - - mutex_lock(&vm->mutex); - nvkm_vm_unmap_pgt(vm, vma->node->type != mmu->func->spg_shift, fpde, lpde); - nvkm_mm_free(&vm->mm, &vma->node); - mutex_unlock(&vm->mutex); - - nvkm_vm_ref(NULL, &vma->vm, NULL); -} - -int -nvkm_vm_boot(struct nvkm_vm *vm, u64 size) -{ - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_memory *pgt; - int ret; - - ret = nvkm_memory_new(mmu->subdev.device, NVKM_MEM_TARGET_INST, - (size >> mmu->func->spg_shift) * 8, 0x1000, true, &pgt); - if (ret == 0) { - vm->pgt[0].refcount[0] = 1; - vm->pgt[0].mem[0] = pgt; - nvkm_memory_boot(pgt, vm); + struct nvkm_mmu_ptc *ptc; + list_for_each_entry(ptc, &mmu->ptc.list, head) { + struct nvkm_mmu_pt *pt, *tt; + list_for_each_entry_safe(pt, tt, &ptc->item, head) { + nvkm_memory_unref(&pt->memory); + list_del(&pt->head); + kfree(pt); + } } - - return ret; } -int -nvkm_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, - u32 block, struct lock_class_key *key, struct nvkm_vm **pvm) +static void 
+nvkm_mmu_ptc_fini(struct nvkm_mmu *mmu) { - static struct lock_class_key _key; - struct nvkm_vm *vm; - u64 mm_length = (offset + length) - mm_offset; - int ret; - - vm = kzalloc(sizeof(*vm), GFP_KERNEL); - if (!vm) - return -ENOMEM; - - __mutex_init(&vm->mutex, "&vm->mutex", key ? key : &_key); - INIT_LIST_HEAD(&vm->pgd_list); - vm->mmu = mmu; - kref_init(&vm->refcount); - vm->fpde = offset >> (mmu->func->pgt_bits + 12); - vm->lpde = (offset + length - 1) >> (mmu->func->pgt_bits + 12); - - vm->pgt = vzalloc((vm->lpde - vm->fpde + 1) * sizeof(*vm->pgt)); - if (!vm->pgt) { - kfree(vm); - return -ENOMEM; - } + struct nvkm_mmu_ptc *ptc, *ptct; - ret = nvkm_mm_init(&vm->mm, mm_offset >> 12, mm_length >> 12, - block >> 12); - if (ret) { - vfree(vm->pgt); - kfree(vm); - return ret; + list_for_each_entry_safe(ptc, ptct, &mmu->ptc.list, head) { + WARN_ON(!list_empty(&ptc->item)); + list_del(&ptc->head); + kfree(ptc); } - - *pvm = vm; - - return 0; } -int -nvkm_vm_new(struct nvkm_device *device, u64 offset, u64 length, u64 mm_offset, - struct lock_class_key *key, struct nvkm_vm **pvm) +static void +nvkm_mmu_ptc_init(struct nvkm_mmu *mmu) { - struct nvkm_mmu *mmu = device->mmu; - if (!mmu->func->create) - return -EINVAL; - return mmu->func->create(mmu, offset, length, mm_offset, key, pvm); + mutex_init(&mmu->ptc.mutex); + INIT_LIST_HEAD(&mmu->ptc.list); + mutex_init(&mmu->ptp.mutex); + INIT_LIST_HEAD(&mmu->ptp.list); } -static int -nvkm_vm_link(struct nvkm_vm *vm, struct nvkm_gpuobj *pgd) +static void +nvkm_mmu_type(struct nvkm_mmu *mmu, int heap, u8 type) { - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_vm_pgd *vpgd; - int i; - - if (!pgd) - return 0; - - vpgd = kzalloc(sizeof(*vpgd), GFP_KERNEL); - if (!vpgd) - return -ENOMEM; - - vpgd->obj = pgd; - - mutex_lock(&vm->mutex); - for (i = vm->fpde; i <= vm->lpde; i++) - mmu->func->map_pgt(pgd, i, vm->pgt[i - vm->fpde].mem); - list_add(&vpgd->head, &vm->pgd_list); - mutex_unlock(&vm->mutex); - return 0; + if (heap >= 0 && !WARN_ON(mmu->type_nr == ARRAY_SIZE(mmu->type))) { + mmu->type[mmu->type_nr].type = type | mmu->heap[heap].type; + mmu->type[mmu->type_nr].heap = heap; + mmu->type_nr++; + } } -static void -nvkm_vm_unlink(struct nvkm_vm *vm, struct nvkm_gpuobj *mpgd) +static int +nvkm_mmu_heap(struct nvkm_mmu *mmu, u8 type, u64 size) { - struct nvkm_vm_pgd *vpgd, *tmp; - - if (!mpgd) - return; - - mutex_lock(&vm->mutex); - list_for_each_entry_safe(vpgd, tmp, &vm->pgd_list, head) { - if (vpgd->obj == mpgd) { - list_del(&vpgd->head); - kfree(vpgd); - break; + if (size) { + if (!WARN_ON(mmu->heap_nr == ARRAY_SIZE(mmu->heap))) { + mmu->heap[mmu->heap_nr].type = type; + mmu->heap[mmu->heap_nr].size = size; + return mmu->heap_nr++; } } - mutex_unlock(&vm->mutex); + return -EINVAL; } static void -nvkm_vm_del(struct kref *kref) +nvkm_mmu_host(struct nvkm_mmu *mmu) { - struct nvkm_vm *vm = container_of(kref, typeof(*vm), refcount); - struct nvkm_vm_pgd *vpgd, *tmp; - - list_for_each_entry_safe(vpgd, tmp, &vm->pgd_list, head) { - nvkm_vm_unlink(vm, vpgd->obj); - } - - nvkm_mm_fini(&vm->mm); - vfree(vm->pgt); - kfree(vm); + struct nvkm_device *device = mmu->subdev.device; + u8 type = NVKM_MEM_KIND * !!mmu->func->kind_sys; + int heap; + + /* Non-mappable system memory. */ + heap = nvkm_mmu_heap(mmu, NVKM_MEM_HOST, ~0ULL); + nvkm_mmu_type(mmu, heap, type); + + /* Non-coherent, cached, system memory. 
+ * + * Block-linear mappings of system memory must be done through + * BAR1, and cannot be supported on systems where we're unable + * to map BAR1 with write-combining. + */ + type |= NVKM_MEM_MAPPABLE; + if (!device->bar || device->bar->iomap_uncached) + nvkm_mmu_type(mmu, heap, type & ~NVKM_MEM_KIND); + else + nvkm_mmu_type(mmu, heap, type); + + /* Coherent, cached, system memory. + * + * Unsupported on systems that aren't able to support snooped + * mappings, and also for block-linear mappings which must be + * done through BAR1. + */ + type |= NVKM_MEM_COHERENT; + if (device->func->cpu_coherent) + nvkm_mmu_type(mmu, heap, type & ~NVKM_MEM_KIND); + + /* Uncached system memory. */ + nvkm_mmu_type(mmu, heap, type |= NVKM_MEM_UNCACHED); } -int -nvkm_vm_ref(struct nvkm_vm *ref, struct nvkm_vm **ptr, struct nvkm_gpuobj *pgd) +static void +nvkm_mmu_vram(struct nvkm_mmu *mmu) { - if (ref) { - int ret = nvkm_vm_link(ref, pgd); - if (ret) - return ret; - - kref_get(&ref->refcount); - } + struct nvkm_device *device = mmu->subdev.device; + struct nvkm_mm *mm = &device->fb->ram->vram; + const u32 sizeN = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NORMAL); + const u32 sizeU = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NOMAP); + const u32 sizeM = nvkm_mm_heap_size(mm, NVKM_RAM_MM_MIXED); + u8 type = NVKM_MEM_KIND * !!mmu->func->kind; + u8 heap = NVKM_MEM_VRAM; + int heapM, heapN, heapU; + + /* Mixed-memory doesn't support compression or display. */ + heapM = nvkm_mmu_heap(mmu, heap, sizeM << NVKM_RAM_MM_SHIFT); + + heap |= NVKM_MEM_COMP; + heap |= NVKM_MEM_DISP; + heapN = nvkm_mmu_heap(mmu, heap, sizeN << NVKM_RAM_MM_SHIFT); + heapU = nvkm_mmu_heap(mmu, heap, sizeU << NVKM_RAM_MM_SHIFT); + + /* Add non-mappable VRAM types first so that they're preferred + * over anything else. Mixed-memory will be slower than other + * heaps, it's prioritised last. + */ + nvkm_mmu_type(mmu, heapU, type); + nvkm_mmu_type(mmu, heapN, type); + nvkm_mmu_type(mmu, heapM, type); + + /* Add host memory types next, under the assumption that users + * wanting mappable memory want to use them as staging buffers + * or the like. + */ + nvkm_mmu_host(mmu); + + /* Mappable VRAM types go last, as they're basically the worst + * possible type to ask for unless there's no other choice. + */ + if (device->bar) { + /* Write-combined BAR1 access. */ + type |= NVKM_MEM_MAPPABLE; + if (!device->bar->iomap_uncached) { + nvkm_mmu_type(mmu, heapN, type); + nvkm_mmu_type(mmu, heapM, type); + } - if (*ptr) { - nvkm_vm_unlink(*ptr, pgd); - kref_put(&(*ptr)->refcount, nvkm_vm_del); + /* Uncached BAR1 access. */ + type |= NVKM_MEM_COHERENT; + type |= NVKM_MEM_UNCACHED; + nvkm_mmu_type(mmu, heapN, type); + nvkm_mmu_type(mmu, heapM, type); } - - *ptr = ref; - return 0; } static int nvkm_mmu_oneinit(struct nvkm_subdev *subdev) { struct nvkm_mmu *mmu = nvkm_mmu(subdev); - if (mmu->func->oneinit) - return mmu->func->oneinit(mmu); + + /* Determine available memory types. 
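+ * Boards with VRAM (fb->ram) register both VRAM and host heaps; + * boards without it expose host memory types only.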
*/ + if (mmu->subdev.device->fb && mmu->subdev.device->fb->ram) + nvkm_mmu_vram(mmu); + else + nvkm_mmu_host(mmu); + + if (mmu->func->vmm.global) { + int ret = nvkm_vmm_new(subdev->device, 0, 0, NULL, 0, NULL, + "gart", &mmu->vmm); + if (ret) + return ret; + } + return 0; } @@ -509,8 +398,10 @@ static void * nvkm_mmu_dtor(struct nvkm_subdev *subdev) { struct nvkm_mmu *mmu = nvkm_mmu(subdev); - if (mmu->func->dtor) - return mmu->func->dtor(mmu); + + nvkm_vmm_unref(&mmu->vmm); + + nvkm_mmu_ptc_fini(mmu); return mmu; } @@ -527,9 +418,10 @@ nvkm_mmu_ctor(const struct nvkm_mmu_func *func, struct nvkm_device *device, { nvkm_subdev_ctor(&nvkm_mmu, device, index, &mmu->subdev); mmu->func = func; - mmu->limit = func->limit; mmu->dma_bits = func->dma_bits; - mmu->lpg_shift = func->lpg_shift; + nvkm_mmu_ptc_init(mmu); + mmu->user.ctor = nvkm_ummu_new; + mmu->user.base = func->mmu.user; } int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/g84.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/g84.c new file mode 100644 index 000000000000..8accda5a772b --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/g84.c @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" +#include "vmm.h" + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +g84_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV50}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_NV50}, nv50_mem_new, nv50_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV50}, nv50_vmm_new, false, 0x0200 }, + .kind = nv50_mmu_kind, + .kind_sys = true, +}; + +int +g84_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + return nvkm_mmu_new_(&g84_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gf100.c index 7ac507c927bb..2d075246dc46 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gf100.c @@ -21,197 +21,65 @@ * * Authors: Ben Skeggs */ -#include "priv.h" +#include "mem.h" +#include "vmm.h" -#include <subdev/fb.h> -#include <subdev/ltc.h> -#include <subdev/timer.h> - -#include <core/gpuobj.h> +#include <nvif/class.h> /* Map from compressed to corresponding uncompressed storage type. * The value 0xff represents an invalid storage type. 
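 * Indexing the table with a kind value yields its uncompressed  * equivalent, used (eg.) for system memory, where compressed  * storage types are invalid.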
*/ -const u8 gf100_pte_storage_type_map[256] = -{ - 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0x01, /* 0x00 */ - 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, 0x11, /* 0x10 */ - 0x11, 0x11, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x26, 0x27, /* 0x20 */ - 0x28, 0x29, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30 */ - 0xff, 0xff, 0x26, 0x27, 0x28, 0x29, 0x26, 0x27, - 0x28, 0x29, 0xff, 0xff, 0xff, 0xff, 0x46, 0xff, /* 0x40 */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x46, 0x46, 0x46, 0x46, 0xff, 0xff, 0xff, /* 0x50 */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60 */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70 */ - 0xff, 0xff, 0xff, 0x7b, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7b, 0x7b, /* 0x80 */ - 0x7b, 0x7b, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90 */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xa7, /* 0xa0 */ - 0xa8, 0xa9, 0xaa, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0 */ - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xa7, - 0xa8, 0xa9, 0xaa, 0xc3, 0xff, 0xff, 0xff, 0xff, /* 0xc0 */ - 0xff, 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xc3, 0xc3, - 0xc3, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0 */ - 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, - 0xfe, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff, /* 0xe0 */ - 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, 0xfe, 0xff, - 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0 */ - 0xfe, 0xfe, 0xfe, 0xfe, 0xff, 0xfd, 0xfe, 0xff -}; - - -static void -gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, struct nvkm_memory *pgt[2]) -{ - u32 pde[2] = { 0, 0 }; - - if (pgt[0]) - pde[1] = 0x00000001 | (nvkm_memory_addr(pgt[0]) >> 8); - if (pgt[1]) - pde[0] = 0x00000001 | (nvkm_memory_addr(pgt[1]) >> 8); - - nvkm_kmap(pgd); - nvkm_wo32(pgd, (index * 8) + 0, pde[0]); - nvkm_wo32(pgd, (index * 8) + 4, pde[1]); - nvkm_done(pgd); -} - -static inline u64 -gf100_vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) -{ - phys >>= 8; - - phys |= 0x00000001; /* present */ - if (vma->access & NV_MEM_ACCESS_SYS) - phys |= 0x00000002; - - phys |= ((u64)target << 32); - phys |= ((u64)memtype << 36); - return phys; -} - -static void -gf100_vm_map(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta) -{ - u64 next = 1 << (vma->node->type - 8); - - phys = gf100_vm_addr(vma, phys, mem->memtype, 0); - pte <<= 3; - - if (mem->tag) { - struct nvkm_ltc *ltc = vma->vm->mmu->subdev.device->ltc; - u32 tag = mem->tag->offset + (delta >> 17); - phys |= (u64)tag << (32 + 12); - next |= (u64)1 << (32 + 12); - nvkm_ltc_tags_clear(ltc, tag, cnt); - } - - nvkm_kmap(pgt); - while (cnt--) { - nvkm_wo32(pgt, pte + 0, lower_32_bits(phys)); - nvkm_wo32(pgt, pte + 4, upper_32_bits(phys)); - phys += next; - pte += 8; - } - nvkm_done(pgt); -} - -static void -gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) -{ - u32 target = (vma->access & NV_MEM_ACCESS_NOSNOOP) ? 
7 : 5; - /* compressed storage types are invalid for system memory */ - u32 memtype = gf100_pte_storage_type_map[mem->memtype & 0xff]; - - nvkm_kmap(pgt); - pte <<= 3; - while (cnt--) { - u64 phys = gf100_vm_addr(vma, *list++, memtype, target); - nvkm_wo32(pgt, pte + 0, lower_32_bits(phys)); - nvkm_wo32(pgt, pte + 4, upper_32_bits(phys)); - pte += 8; - } - nvkm_done(pgt); -} - -static void -gf100_vm_unmap(struct nvkm_vma *vma, struct nvkm_memory *pgt, u32 pte, u32 cnt) -{ - nvkm_kmap(pgt); - pte <<= 3; - while (cnt--) { - nvkm_wo32(pgt, pte + 0, 0x00000000); - nvkm_wo32(pgt, pte + 4, 0x00000000); - pte += 8; - } - nvkm_done(pgt); -} - -static void -gf100_vm_flush(struct nvkm_vm *vm) -{ - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_device *device = mmu->subdev.device; - struct nvkm_vm_pgd *vpgd; - u32 type; - - type = 0x00000001; /* PAGE_ALL */ - if (atomic_read(&vm->engref[NVKM_SUBDEV_BAR])) - type |= 0x00000004; /* HUB_ONLY */ - - mutex_lock(&mmu->subdev.mutex); - list_for_each_entry(vpgd, &vm->pgd_list, head) { - /* looks like maybe a "free flush slots" counter, the - * faster you write to 0x100cbc to more it decreases - */ - nvkm_msec(device, 2000, - if (nvkm_rd32(device, 0x100c80) & 0x00ff0000) - break; - ); - - nvkm_wr32(device, 0x100cb8, vpgd->obj->addr >> 8); - nvkm_wr32(device, 0x100cbc, 0x80000000 | type); - - /* wait for flush to be queued? */ - nvkm_msec(device, 2000, - if (nvkm_rd32(device, 0x100c80) & 0x00008000) - break; - ); - } - mutex_unlock(&mmu->subdev.mutex); -} - -static int -gf100_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, - struct lock_class_key *key, struct nvkm_vm **pvm) +const u8 * +gf100_mmu_kind(struct nvkm_mmu *mmu, int *count) { - return nvkm_vm_create(mmu, offset, length, mm_offset, 4096, key, pvm); + static const u8 + kind[256] = { + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0x01, /* 0x00 */ + 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, 0x11, /* 0x10 */ + 0x11, 0x11, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x26, 0x27, /* 0x20 */ + 0x28, 0x29, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30 */ + 0xff, 0xff, 0x26, 0x27, 0x28, 0x29, 0x26, 0x27, + 0x28, 0x29, 0xff, 0xff, 0xff, 0xff, 0x46, 0xff, /* 0x40 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x46, 0x46, 0x46, 0x46, 0xff, 0xff, 0xff, /* 0x50 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70 */ + 0xff, 0xff, 0xff, 0x7b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7b, 0x7b, /* 0x80 */ + 0x7b, 0x7b, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xa7, /* 0xa0 */ + 0xa8, 0xa9, 0xaa, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xa7, + 0xa8, 0xa9, 0xaa, 0xc3, 0xff, 0xff, 0xff, 0xff, /* 0xc0 */ + 0xff, 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xc3, 0xc3, + 0xc3, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0 */ + 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, + 0xfe, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff, /* 0xe0 */ + 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, 0xfe, 0xff, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0 */ + 
0xfe, 0xfe, 0xfe, 0xfe, 0xff, 0xfd, 0xfe, 0xff + }; + + *count = ARRAY_SIZE(kind); + return kind; } static const struct nvkm_mmu_func gf100_mmu = { - .limit = (1ULL << 40), .dma_bits = 40, - .pgt_bits = 27 - 12, - .spg_shift = 12, - .lpg_shift = 17, - .create = gf100_vm_create, - .map_pgt = gf100_vm_map_pgt, - .map = gf100_vm_map, - .map_sg = gf100_vm_map_sg, - .unmap = gf100_vm_unmap, - .flush = gf100_vm_flush, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_GF100}, gf100_mem_new, gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GF100}, gf100_vmm_new }, + .kind = gf100_mmu_kind, + .kind_sys = true, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk104.c new file mode 100644 index 000000000000..3d7d1eb1cff9 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk104.c @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" +#include "vmm.h" + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +gk104_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_GF100}, gf100_mem_new, gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GF100}, gk104_vmm_new }, + .kind = gf100_mmu_kind, + .kind_sys = true, +}; + +int +gk104_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + return nvkm_mmu_new_(&gk104_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk20a.c new file mode 100644 index 000000000000..ac74965a60d4 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gk20a.c @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" +#include "vmm.h" + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +gk20a_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_GF100}, .umap = gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GF100}, gk20a_vmm_new }, + .kind = gf100_mmu_kind, + .kind_sys = true, +}; + +int +gk20a_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + return nvkm_mmu_new_(&gk20a_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm200.c new file mode 100644 index 000000000000..dbf644ebac97 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm200.c @@ -0,0 +1,97 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "mem.h" +#include "vmm.h" + +#include <subdev/fb.h> + +#include <nvif/class.h> + +const u8 * +gm200_mmu_kind(struct nvkm_mmu *mmu, int *count) +{ + static const u8 + kind[256] = { + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0x01, /* 0x00 */ + 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, 0x11, /* 0x10 */ + 0x11, 0x11, 0x11, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x26, 0x27, /* 0x20 */ + 0x28, 0x29, 0x2a, 0x2b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30 */ + 0xff, 0xff, 0x26, 0x27, 0x28, 0x29, 0x26, 0x27, + 0x28, 0x29, 0xff, 0xff, 0xff, 0xff, 0x46, 0xff, /* 0x40 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x46, 0x46, 0x46, 0x46, 0xff, 0xff, 0xff, /* 0x50 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70 */ + 0xff, 0xff, 0xff, 0x7b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7b, 0x7b, /* 0x80 */ + 0x7b, 0x7b, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x8b, 0x8c, 0x8d, 0x8e, 0xa7, /* 0xa0 */ + 0xa8, 0xa9, 0xaa, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xa7, + 0xa8, 0xa9, 0xaa, 0xc3, 0xff, 0xff, 0xff, 0xff, /* 0xc0 */ + 0xff, 0xff, 0xff, 0xff, 0xfe, 0xfe, 0xc3, 0xc3, + 0xc3, 0xc3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0 */ + 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, + 0xfe, 0xff, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff, /* 0xe0 */ + 0xff, 0xfe, 0xff, 0xfe, 0xff, 0xfe, 0xfe, 0xff, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, /* 0xf0 */ + 0xfe, 0xfe, 0xfe, 0xfe, 0xff, 0xfd, 0xfe, 0xff + }; + *count = ARRAY_SIZE(kind); + return kind; +} + +static const struct nvkm_mmu_func +gm200_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_GF100}, gf100_mem_new, gf100_mem_map }, + .vmm = {{ -1, 0, NVIF_CLASS_VMM_GM200}, gm200_vmm_new }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; + +static const struct nvkm_mmu_func +gm200_mmu_fixed = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_GF100}, gf100_mem_new, gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GM200}, gm200_vmm_new_fixed }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; + +int +gm200_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + if (device->fb->page) + return nvkm_mmu_new_(&gm200_mmu_fixed, device, index, pmmu); + return nvkm_mmu_new_(&gm200_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm20b.c new file mode 100644 index 000000000000..7353a94b4091 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gm20b.c @@ -0,0 +1,55 @@ +/* + * Copyright 2017 Red Hat Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" +#include "vmm.h" + +#include <subdev/fb.h> + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +gm20b_mmu = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_GF100}, .umap = gf100_mem_map }, + .vmm = {{ -1, 0, NVIF_CLASS_VMM_GM200}, gm20b_vmm_new }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; + +static const struct nvkm_mmu_func +gm20b_mmu_fixed = { + .dma_bits = 40, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_GF100}, .umap = gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GM200}, gm20b_vmm_new_fixed }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; + +int +gm20b_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + if (device->fb->page) + return nvkm_mmu_new_(&gm20b_mmu_fixed, device, index, pmmu); + return nvkm_mmu_new_(&gm20b_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/radeon/radeon_kfd.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp100.c index 9df1fea8e971..651b8805c67c 100644 --- a/drivers/gpu/drm/radeon/radeon_kfd.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp100.c @@ -1,5 +1,5 @@ /* - * Copyright 2014 Advanced Micro Devices, Inc. + * Copyright 2017 Red Hat Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -19,29 +19,27 @@ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ +#include "mem.h" +#include "vmm.h" -/* - * radeon_kfd.h defines the private interface between the - * AMD kernel graphics drivers and the AMD KFD. 
- */ - -#ifndef RADEON_KFD_H_INCLUDED -#define RADEON_KFD_H_INCLUDED - -#include <linux/types.h> -#include "kgd_kfd_interface.h" - -struct radeon_device; +#include <core/option.h> -int radeon_kfd_init(void); -void radeon_kfd_fini(void); +#include <nvif/class.h> -void radeon_kfd_suspend(struct radeon_device *rdev); -int radeon_kfd_resume(struct radeon_device *rdev); -void radeon_kfd_interrupt(struct radeon_device *rdev, - const void *ih_ring_entry); -void radeon_kfd_device_probe(struct radeon_device *rdev); -void radeon_kfd_device_init(struct radeon_device *rdev); -void radeon_kfd_device_fini(struct radeon_device *rdev); +static const struct nvkm_mmu_func +gp100_mmu = { + .dma_bits = 47, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_GF100}, gf100_mem_new, gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GP100}, gp100_vmm_new }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; -#endif /* RADEON_KFD_H_INCLUDED */ +int +gp100_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + if (!nvkm_boolopt(device->cfgopt, "GP100MmuLayout", true)) + return gm200_mmu_new(device, index, pmmu); + return nvkm_mmu_new_(&gp100_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b.c new file mode 100644 index 000000000000..3bd3db31e0bb --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b.c @@ -0,0 +1,45 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "mem.h" +#include "vmm.h" + +#include <core/option.h> + +#include <nvif/class.h> + +static const struct nvkm_mmu_func +gp10b_mmu = { + .dma_bits = 47, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_GF100}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_GF100}, .umap = gf100_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_GP100}, gp10b_vmm_new }, + .kind = gm200_mmu_kind, + .kind_sys = true, +}; + +int +gp10b_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) +{ + if (!nvkm_boolopt(device->cfgopt, "GP100MmuLayout", true)) + return gm20b_mmu_new(device, index, pmmu); + return nvkm_mmu_new_(&gp10b_mmu, device, index, pmmu); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c new file mode 100644 index 000000000000..39808489f21d --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c @@ -0,0 +1,242 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#define nvkm_mem(p) container_of((p), struct nvkm_mem, memory) +#include "mem.h" + +#include <core/memory.h> + +#include <nvif/if000a.h> +#include <nvif/unpack.h> + +struct nvkm_mem { + struct nvkm_memory memory; + enum nvkm_memory_target target; + struct nvkm_mmu *mmu; + u64 pages; + struct page **mem; + union { + struct scatterlist *sgl; + dma_addr_t *dma; + }; +}; + +static enum nvkm_memory_target +nvkm_mem_target(struct nvkm_memory *memory) +{ + return nvkm_mem(memory)->target; +} + +static u8 +nvkm_mem_page(struct nvkm_memory *memory) +{ + return PAGE_SHIFT; +} + +static u64 +nvkm_mem_addr(struct nvkm_memory *memory) +{ + struct nvkm_mem *mem = nvkm_mem(memory); + if (mem->pages == 1 && mem->mem) + return mem->dma[0]; + return ~0ULL; +} + +static u64 +nvkm_mem_size(struct nvkm_memory *memory) +{ + return nvkm_mem(memory)->pages << PAGE_SHIFT; +} + +static int +nvkm_mem_map_dma(struct nvkm_memory *memory, u64 offset, struct nvkm_vmm *vmm, + struct nvkm_vma *vma, void *argv, u32 argc) +{ + struct nvkm_mem *mem = nvkm_mem(memory); + struct nvkm_vmm_map map = { + .memory = &mem->memory, + .offset = offset, + .dma = mem->dma, + }; + return nvkm_vmm_map(vmm, vma, argv, argc, &map); +} + +static void * +nvkm_mem_dtor(struct nvkm_memory *memory) +{ + struct nvkm_mem *mem = nvkm_mem(memory); + if (mem->mem) { + while (mem->pages--) { + dma_unmap_page(mem->mmu->subdev.device->dev, + mem->dma[mem->pages], PAGE_SIZE, + DMA_BIDIRECTIONAL); + __free_page(mem->mem[mem->pages]); + } + kvfree(mem->dma); + kvfree(mem->mem); + } + return mem; +} + +static const struct nvkm_memory_func +nvkm_mem_dma = { + .dtor = nvkm_mem_dtor, + .target = nvkm_mem_target, + .page = nvkm_mem_page, + .addr = nvkm_mem_addr, + .size = nvkm_mem_size, + .map = nvkm_mem_map_dma, +}; + +static int +nvkm_mem_map_sgl(struct nvkm_memory *memory, u64 offset, struct nvkm_vmm *vmm, + struct nvkm_vma *vma, void *argv, u32 argc) +{ + struct nvkm_mem *mem = nvkm_mem(memory); + struct nvkm_vmm_map map = { + .memory = &mem->memory, + .offset = offset, + .sgl = mem->sgl, + }; + return nvkm_vmm_map(vmm, vma, argv, argc, &map); +} + +static const struct nvkm_memory_func +nvkm_mem_sgl = { + .dtor = nvkm_mem_dtor, + .target = nvkm_mem_target, + .page = nvkm_mem_page, + .addr = nvkm_mem_addr, + .size = nvkm_mem_size, + .map = nvkm_mem_map_sgl, +}; + +int +nvkm_mem_map_host(struct nvkm_memory *memory, void **pmap) +{ + struct nvkm_mem *mem = nvkm_mem(memory); + if (mem->mem) { + *pmap = vmap(mem->mem, mem->pages, VM_MAP, PAGE_KERNEL); + return *pmap ? 
0 : -EFAULT; + } + return -EINVAL; +} + +static int +nvkm_mem_new_host(struct nvkm_mmu *mmu, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **pmemory) +{ + struct device *dev = mmu->subdev.device->dev; + union { + struct nvif_mem_ram_vn vn; + struct nvif_mem_ram_v0 v0; + } *args = argv; + int ret = -ENOSYS; + enum nvkm_memory_target target; + struct nvkm_mem *mem; + gfp_t gfp = GFP_USER | __GFP_ZERO; + + if ( (mmu->type[type].type & NVKM_MEM_COHERENT) && + !(mmu->type[type].type & NVKM_MEM_UNCACHED)) + target = NVKM_MEM_TARGET_HOST; + else + target = NVKM_MEM_TARGET_NCOH; + + if (page != PAGE_SHIFT) + return -EINVAL; + + if (!(mem = kzalloc(sizeof(*mem), GFP_KERNEL))) + return -ENOMEM; + mem->target = target; + mem->mmu = mmu; + *pmemory = &mem->memory; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + if (args->v0.dma) { + nvkm_memory_ctor(&nvkm_mem_dma, &mem->memory); + mem->dma = args->v0.dma; + } else { + nvkm_memory_ctor(&nvkm_mem_sgl, &mem->memory); + mem->sgl = args->v0.sgl; + } + + if (!IS_ALIGNED(size, PAGE_SIZE)) + return -EINVAL; + mem->pages = size >> PAGE_SHIFT; + return 0; + } else + if ( (ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + kfree(mem); + return ret; + } + + nvkm_memory_ctor(&nvkm_mem_dma, &mem->memory); + size = ALIGN(size, PAGE_SIZE) >> PAGE_SHIFT; + + if (!(mem->mem = kvmalloc(sizeof(*mem->mem) * size, GFP_KERNEL))) + return -ENOMEM; + if (!(mem->dma = kvmalloc(sizeof(*mem->dma) * size, GFP_KERNEL))) + return -ENOMEM; + + if (mmu->dma_bits > 32) + gfp |= GFP_HIGHUSER; + else + gfp |= GFP_DMA32; + + for (mem->pages = 0; size; size--, mem->pages++) { + struct page *p = alloc_page(gfp); + if (!p) + return -ENOMEM; + + mem->dma[mem->pages] = dma_map_page(mmu->subdev.device->dev, + p, 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, mem->dma[mem->pages])) { + __free_page(p); + return -ENOMEM; + } + + mem->mem[mem->pages] = p; + } + + return 0; +} + +int +nvkm_mem_new_type(struct nvkm_mmu *mmu, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **pmemory) +{ + struct nvkm_memory *memory = NULL; + int ret; + + if (mmu->type[type].type & NVKM_MEM_VRAM) { + ret = mmu->func->mem.vram(mmu, type, page, size, + argv, argc, &memory); + } else { + ret = nvkm_mem_new_host(mmu, type, page, size, + argv, argc, &memory); + } + + if (ret) + nvkm_memory_unref(&memory); + *pmemory = memory; + return ret; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.h new file mode 100644 index 000000000000..234267e1b215 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.h @@ -0,0 +1,23 @@ +#ifndef __NVKM_MEM_H__ +#define __NVKM_MEM_H__ +#include "priv.h" + +int nvkm_mem_new_type(struct nvkm_mmu *, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **); +int nvkm_mem_map_host(struct nvkm_memory *, void **pmap); + +int nv04_mem_new(struct nvkm_mmu *, int, u8, u64, void *, u32, + struct nvkm_memory **); +int nv04_mem_map(struct nvkm_mmu *, struct nvkm_memory *, void *, u32, + u64 *, u64 *, struct nvkm_vma **); + +int nv50_mem_new(struct nvkm_mmu *, int, u8, u64, void *, u32, + struct nvkm_memory **); +int nv50_mem_map(struct nvkm_mmu *, struct nvkm_memory *, void *, u32, + u64 *, u64 *, struct nvkm_vma **); + +int gf100_mem_new(struct nvkm_mmu *, int, u8, u64, void *, u32, + struct nvkm_memory **); +int gf100_mem_map(struct nvkm_mmu *, struct nvkm_memory *, void *, u32, + u64 *, u64 *, struct nvkm_vma **); +#endif 
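
The host-backed memory path added in mem.c above allocates backing store one page at a time: nvkm_mem_new_host() pairs alloc_page() with dma_map_page() for every page, and nvkm_mem_dtor() later walks the arrays backwards, unmapping and freeing each page. A minimal stand-alone sketch of that pattern follows, with the failure unwind done inline for clarity; the helper name demo_alloc_dma_pages and its signature are hypothetical and not part of this patch.

	/*
	 * Illustrative sketch only -- not part of the patch.  Mirrors the
	 * alloc_page()/dma_map_page()/dma_mapping_error() pattern used by
	 * nvkm_mem_new_host(), with the page-by-page unwind that the real
	 * code defers to nvkm_mem_dtor().
	 */
	#include <linux/dma-mapping.h>
	#include <linux/gfp.h>
	#include <linux/mm.h>

	static int
	demo_alloc_dma_pages(struct device *dev, unsigned long npages,
			     struct page **pages, dma_addr_t *dma)
	{
		unsigned long i;

		for (i = 0; i < npages; i++) {
			/* zeroed user pages, as in nvkm_mem_new_host() */
			struct page *p = alloc_page(GFP_USER | __GFP_ZERO);
			if (!p)
				goto undo;

			dma[i] = dma_map_page(dev, p, 0, PAGE_SIZE,
					      DMA_BIDIRECTIONAL);
			if (dma_mapping_error(dev, dma[i])) {
				__free_page(p);
				goto undo;
			}
			pages[i] = p;
		}
		return 0;

	undo:
		/* release whatever was mapped so far, newest first */
		while (i--) {
			dma_unmap_page(dev, dma[i], PAGE_SIZE,
				       DMA_BIDIRECTIONAL);
			__free_page(pages[i]);
		}
		return -ENOMEM;
	}

In the patch itself the unwind is not open-coded like this: nvkm_mem_new_host() can return -ENOMEM mid-loop because nvkm_mem_new_type() calls nvkm_memory_unref() on failure, which runs nvkm_mem_dtor() and performs exactly this unmap/free walk. The real allocator also ORs GFP_HIGHUSER or GFP_DMA32 into the gfp mask depending on mmu->dma_bits, steering allocations toward memory the device can actually address.
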
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memgf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memgf100.c new file mode 100644 index 000000000000..d9c9bee45222 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memgf100.c @@ -0,0 +1,94 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" + +#include <core/memory.h> +#include <subdev/bar.h> +#include <subdev/fb.h> + +#include <nvif/class.h> +#include <nvif/if900b.h> +#include <nvif/if900d.h> +#include <nvif/unpack.h> + +int +gf100_mem_map(struct nvkm_mmu *mmu, struct nvkm_memory *memory, void *argv, + u32 argc, u64 *paddr, u64 *psize, struct nvkm_vma **pvma) +{ + struct gf100_vmm_map_v0 uvmm = {}; + union { + struct gf100_mem_map_vn vn; + struct gf100_mem_map_v0 v0; + } *args = argv; + struct nvkm_device *device = mmu->subdev.device; + struct nvkm_vmm *bar = nvkm_bar_bar1_vmm(device); + int ret = -ENOSYS; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + uvmm.ro = args->v0.ro; + uvmm.kind = args->v0.kind; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + } else + return ret; + + ret = nvkm_vmm_get(bar, nvkm_memory_page(memory), + nvkm_memory_size(memory), pvma); + if (ret) + return ret; + + ret = nvkm_memory_map(memory, 0, bar, *pvma, &uvmm, sizeof(uvmm)); + if (ret) + return ret; + + *paddr = device->func->resource_addr(device, 1) + (*pvma)->addr; + *psize = (*pvma)->size; + return 0; +} + +int +gf100_mem_new(struct nvkm_mmu *mmu, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **pmemory) +{ + union { + struct gf100_mem_vn vn; + struct gf100_mem_v0 v0; + } *args = argv; + int ret = -ENOSYS; + bool contig; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + contig = args->v0.contig; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + contig = false; + } else + return ret; + + if (mmu->type[type].type & (NVKM_MEM_DISP | NVKM_MEM_COMP)) + type = NVKM_RAM_MM_NORMAL; + else + type = NVKM_RAM_MM_MIXED; + + return nvkm_ram_get(mmu->subdev.device, type, 0x01, page, + size, contig, false, pmemory); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv04.c new file mode 100644 index 000000000000..79a3b0cc9f5b --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv04.c @@ -0,0 +1,69 @@ +/* + * Copyright 2017 Red Hat Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" + +#include <core/memory.h> +#include <subdev/fb.h> + +#include <nvif/if000b.h> +#include <nvif/unpack.h> + +int +nv04_mem_map(struct nvkm_mmu *mmu, struct nvkm_memory *memory, void *argv, + u32 argc, u64 *paddr, u64 *psize, struct nvkm_vma **pvma) +{ + union { + struct nv04_mem_map_vn vn; + } *args = argv; + struct nvkm_device *device = mmu->subdev.device; + const u64 addr = nvkm_memory_addr(memory); + int ret = -ENOSYS; + + if ((ret = nvif_unvers(ret, &argv, &argc, args->vn))) + return ret; + + *paddr = device->func->resource_addr(device, 1) + addr; + *psize = nvkm_memory_size(memory); + *pvma = ERR_PTR(-ENODEV); + return 0; +} + +int +nv04_mem_new(struct nvkm_mmu *mmu, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **pmemory) +{ + union { + struct nv04_mem_vn vn; + } *args = argv; + int ret = -ENOSYS; + + if ((ret = nvif_unvers(ret, &argv, &argc, args->vn))) + return ret; + + if (mmu->type[type].type & NVKM_MEM_MAPPABLE) + type = NVKM_RAM_MM_NORMAL; + else + type = NVKM_RAM_MM_NOMAP; + + return nvkm_ram_get(mmu->subdev.device, type, 0x01, page, + size, true, false, pmemory); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv50.c new file mode 100644 index 000000000000..46759b89fc1f --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/memnv50.c @@ -0,0 +1,88 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "mem.h" + +#include <core/memory.h> +#include <subdev/bar.h> +#include <subdev/fb.h> + +#include <nvif/class.h> +#include <nvif/if500b.h> +#include <nvif/if500d.h> +#include <nvif/unpack.h> + +int +nv50_mem_map(struct nvkm_mmu *mmu, struct nvkm_memory *memory, void *argv, + u32 argc, u64 *paddr, u64 *psize, struct nvkm_vma **pvma) +{ + struct nv50_vmm_map_v0 uvmm = {}; + union { + struct nv50_mem_map_vn vn; + struct nv50_mem_map_v0 v0; + } *args = argv; + struct nvkm_device *device = mmu->subdev.device; + struct nvkm_vmm *bar = nvkm_bar_bar1_vmm(device); + u64 size = nvkm_memory_size(memory); + int ret = -ENOSYS; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + uvmm.ro = args->v0.ro; + uvmm.kind = args->v0.kind; + uvmm.comp = args->v0.comp; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + } else + return ret; + + ret = nvkm_vmm_get(bar, 12, size, pvma); + if (ret) + return ret; + + *paddr = device->func->resource_addr(device, 1) + (*pvma)->addr; + *psize = (*pvma)->size; + return nvkm_memory_map(memory, 0, bar, *pvma, &uvmm, sizeof(uvmm)); +} + +int +nv50_mem_new(struct nvkm_mmu *mmu, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **pmemory) +{ + union { + struct nv50_mem_vn vn; + struct nv50_mem_v0 v0; + } *args = argv; + int ret = -ENOSYS; + bool contig; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + type = args->v0.bankswz ? 0x02 : 0x01; + contig = args->v0.contig; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + type = 0x01; + contig = false; + } else + return -ENOSYS; + + return nvkm_ram_get(mmu->subdev.device, NVKM_RAM_MM_NORMAL, type, + page, size, contig, false, pmemory); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.c index 37927c3fdc3e..d201c887c2cd 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.c @@ -21,129 +21,21 @@ * * Authors: Ben Skeggs */ -#include "nv04.h" +#include "mem.h" +#include "vmm.h" -#include <core/gpuobj.h> - -#define NV04_PDMA_SIZE (128 * 1024 * 1024) -#define NV04_PDMA_PAGE ( 4 * 1024) - -/******************************************************************************* - * VM map/unmap callbacks - ******************************************************************************/ - -static void -nv04_vm_map_sg(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) -{ - pte = 0x00008 + (pte * 4); - nvkm_kmap(pgt); - while (cnt) { - u32 page = PAGE_SIZE / NV04_PDMA_PAGE; - u32 phys = (u32)*list++; - while (cnt && page--) { - nvkm_wo32(pgt, pte, phys | 3); - phys += NV04_PDMA_PAGE; - pte += 4; - cnt -= 1; - } - } - nvkm_done(pgt); -} - -static void -nv04_vm_unmap(struct nvkm_vma *vma, struct nvkm_memory *pgt, u32 pte, u32 cnt) -{ - pte = 0x00008 + (pte * 4); - nvkm_kmap(pgt); - while (cnt--) { - nvkm_wo32(pgt, pte, 0x00000000); - pte += 4; - } - nvkm_done(pgt); -} - -static void -nv04_vm_flush(struct nvkm_vm *vm) -{ -} - -/******************************************************************************* - * MMU subdev - 
******************************************************************************/ - -static int -nv04_mmu_oneinit(struct nvkm_mmu *base) -{ - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - struct nvkm_memory *dma; - int ret; - - ret = nvkm_vm_create(&mmu->base, 0, NV04_PDMA_SIZE, 0, 4096, NULL, - &mmu->vm); - if (ret) - return ret; - - ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, - (NV04_PDMA_SIZE / NV04_PDMA_PAGE) * 4 + 8, - 16, true, &dma); - mmu->vm->pgt[0].mem[0] = dma; - mmu->vm->pgt[0].refcount[0] = 1; - if (ret) - return ret; - - nvkm_kmap(dma); - nvkm_wo32(dma, 0x00000, 0x0002103d); /* PCI, RW, PT, !LN */ - nvkm_wo32(dma, 0x00004, NV04_PDMA_SIZE - 1); - nvkm_done(dma); - return 0; -} - -void * -nv04_mmu_dtor(struct nvkm_mmu *base) -{ - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - if (mmu->vm) { - nvkm_memory_del(&mmu->vm->pgt[0].mem[0]); - nvkm_vm_ref(NULL, &mmu->vm, NULL); - } - if (mmu->nullp) { - dma_free_coherent(device->dev, 16 * 1024, - mmu->nullp, mmu->null); - } - return mmu; -} - -int -nv04_mmu_new_(const struct nvkm_mmu_func *func, struct nvkm_device *device, - int index, struct nvkm_mmu **pmmu) -{ - struct nv04_mmu *mmu; - if (!(mmu = kzalloc(sizeof(*mmu), GFP_KERNEL))) - return -ENOMEM; - *pmmu = &mmu->base; - nvkm_mmu_ctor(func, device, index, &mmu->base); - return 0; -} +#include <nvif/class.h> const struct nvkm_mmu_func nv04_mmu = { - .oneinit = nv04_mmu_oneinit, - .dtor = nv04_mmu_dtor, - .limit = NV04_PDMA_SIZE, .dma_bits = 32, - .pgt_bits = 32 - 12, - .spg_shift = 12, - .lpg_shift = 12, - .map_sg = nv04_vm_map_sg, - .unmap = nv04_vm_unmap, - .flush = nv04_vm_flush, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV04}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_NV04}, nv04_mem_new, nv04_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV04}, nv04_vmm_new, true }, }; int nv04_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) { - return nv04_mmu_new_(&nv04_mmu, device, index, pmmu); + return nvkm_mmu_new_(&nv04_mmu, device, index, pmmu); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.h deleted file mode 100644 index 363e33b296d5..000000000000 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv04.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __NV04_MMU_PRIV__ -#define __NV04_MMU_PRIV__ -#define nv04_mmu(p) container_of((p), struct nv04_mmu, base) -#include "priv.h" - -struct nv04_mmu { - struct nvkm_mmu base; - struct nvkm_vm *vm; - dma_addr_t null; - void *nullp; -}; - -int nv04_mmu_new_(const struct nvkm_mmu_func *, struct nvkm_device *, - int index, struct nvkm_mmu **); -void *nv04_mmu_dtor(struct nvkm_mmu *); - -extern const struct nvkm_mmu_func nv04_mmu; -#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv41.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv41.c index c6a26f907009..adca81895c09 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv41.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv41.c @@ -21,113 +21,29 @@ * * Authors: Ben Skeggs */ -#include "nv04.h" +#include "mem.h" +#include "vmm.h" -#include <core/gpuobj.h> #include <core/option.h> -#include <subdev/timer.h> -#define NV41_GART_SIZE (512 * 1024 * 1024) -#define NV41_GART_PAGE ( 4 * 1024) - -/******************************************************************************* - * VM map/unmap callbacks - ******************************************************************************/ - -static void -nv41_vm_map_sg(struct nvkm_vma 
*vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) -{ - pte = pte * 4; - nvkm_kmap(pgt); - while (cnt) { - u32 page = PAGE_SIZE / NV41_GART_PAGE; - u64 phys = (u64)*list++; - while (cnt && page--) { - nvkm_wo32(pgt, pte, (phys >> 7) | 1); - phys += NV41_GART_PAGE; - pte += 4; - cnt -= 1; - } - } - nvkm_done(pgt); -} - -static void -nv41_vm_unmap(struct nvkm_vma *vma, struct nvkm_memory *pgt, u32 pte, u32 cnt) -{ - pte = pte * 4; - nvkm_kmap(pgt); - while (cnt--) { - nvkm_wo32(pgt, pte, 0x00000000); - pte += 4; - } - nvkm_done(pgt); -} - -static void -nv41_vm_flush(struct nvkm_vm *vm) -{ - struct nv04_mmu *mmu = nv04_mmu(vm->mmu); - struct nvkm_device *device = mmu->base.subdev.device; - - mutex_lock(&mmu->base.subdev.mutex); - nvkm_wr32(device, 0x100810, 0x00000022); - nvkm_msec(device, 2000, - if (nvkm_rd32(device, 0x100810) & 0x00000020) - break; - ); - nvkm_wr32(device, 0x100810, 0x00000000); - mutex_unlock(&mmu->base.subdev.mutex); -} - -/******************************************************************************* - * MMU subdev - ******************************************************************************/ - -static int -nv41_mmu_oneinit(struct nvkm_mmu *base) -{ - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - int ret; - - ret = nvkm_vm_create(&mmu->base, 0, NV41_GART_SIZE, 0, 4096, NULL, - &mmu->vm); - if (ret) - return ret; - - ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, - (NV41_GART_SIZE / NV41_GART_PAGE) * 4, 16, true, - &mmu->vm->pgt[0].mem[0]); - mmu->vm->pgt[0].refcount[0] = 1; - return ret; -} +#include <nvif/class.h> static void -nv41_mmu_init(struct nvkm_mmu *base) +nv41_mmu_init(struct nvkm_mmu *mmu) { - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - struct nvkm_memory *dma = mmu->vm->pgt[0].mem[0]; - nvkm_wr32(device, 0x100800, 0x00000002 | nvkm_memory_addr(dma)); + struct nvkm_device *device = mmu->subdev.device; + nvkm_wr32(device, 0x100800, 0x00000002 | mmu->vmm->pd->pt[0]->addr); nvkm_mask(device, 0x10008c, 0x00000100, 0x00000100); nvkm_wr32(device, 0x100820, 0x00000000); } static const struct nvkm_mmu_func nv41_mmu = { - .dtor = nv04_mmu_dtor, - .oneinit = nv41_mmu_oneinit, .init = nv41_mmu_init, - .limit = NV41_GART_SIZE, .dma_bits = 39, - .pgt_bits = 32 - 12, - .spg_shift = 12, - .lpg_shift = 12, - .map_sg = nv41_vm_map_sg, - .unmap = nv41_vm_unmap, - .flush = nv41_vm_flush, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV04}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_NV04}, nv04_mem_new, nv04_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV04}, nv41_vmm_new, true }, }; int @@ -137,5 +53,5 @@ nv41_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) !nvkm_boolopt(device->cfgopt, "NvPCIE", true)) return nv04_mmu_new(device, index, pmmu); - return nv04_mmu_new_(&nv41_mmu, device, index, pmmu); + return nvkm_mmu_new_(&nv41_mmu, device, index, pmmu); } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv44.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv44.c index a648c2395545..598c53a27bde 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv44.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv44.c @@ -21,176 +21,18 @@ * * Authors: Ben Skeggs */ -#include "nv04.h" +#include "mem.h" +#include "vmm.h" -#include <core/gpuobj.h> #include <core/option.h> -#include <subdev/timer.h> -#define NV44_GART_SIZE (512 * 1024 * 1024) -#define NV44_GART_PAGE ( 4 * 1024) - 
-/******************************************************************************* - * VM map/unmap callbacks - ******************************************************************************/ - -static void -nv44_vm_fill(struct nvkm_memory *pgt, dma_addr_t null, - dma_addr_t *list, u32 pte, u32 cnt) -{ - u32 base = (pte << 2) & ~0x0000000f; - u32 tmp[4]; - - tmp[0] = nvkm_ro32(pgt, base + 0x0); - tmp[1] = nvkm_ro32(pgt, base + 0x4); - tmp[2] = nvkm_ro32(pgt, base + 0x8); - tmp[3] = nvkm_ro32(pgt, base + 0xc); - - while (cnt--) { - u32 addr = list ? (*list++ >> 12) : (null >> 12); - switch (pte++ & 0x3) { - case 0: - tmp[0] &= ~0x07ffffff; - tmp[0] |= addr; - break; - case 1: - tmp[0] &= ~0xf8000000; - tmp[0] |= addr << 27; - tmp[1] &= ~0x003fffff; - tmp[1] |= addr >> 5; - break; - case 2: - tmp[1] &= ~0xffc00000; - tmp[1] |= addr << 22; - tmp[2] &= ~0x0001ffff; - tmp[2] |= addr >> 10; - break; - case 3: - tmp[2] &= ~0xfffe0000; - tmp[2] |= addr << 17; - tmp[3] &= ~0x00000fff; - tmp[3] |= addr >> 15; - break; - } - } - - nvkm_wo32(pgt, base + 0x0, tmp[0]); - nvkm_wo32(pgt, base + 0x4, tmp[1]); - nvkm_wo32(pgt, base + 0x8, tmp[2]); - nvkm_wo32(pgt, base + 0xc, tmp[3] | 0x40000000); -} - -static void -nv44_vm_map_sg(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) -{ - struct nv04_mmu *mmu = nv04_mmu(vma->vm->mmu); - u32 tmp[4]; - int i; - - nvkm_kmap(pgt); - if (pte & 3) { - u32 max = 4 - (pte & 3); - u32 part = (cnt > max) ? max : cnt; - nv44_vm_fill(pgt, mmu->null, list, pte, part); - pte += part; - list += part; - cnt -= part; - } - - while (cnt >= 4) { - for (i = 0; i < 4; i++) - tmp[i] = *list++ >> 12; - nvkm_wo32(pgt, pte++ * 4, tmp[0] >> 0 | tmp[1] << 27); - nvkm_wo32(pgt, pte++ * 4, tmp[1] >> 5 | tmp[2] << 22); - nvkm_wo32(pgt, pte++ * 4, tmp[2] >> 10 | tmp[3] << 17); - nvkm_wo32(pgt, pte++ * 4, tmp[3] >> 15 | 0x40000000); - cnt -= 4; - } - - if (cnt) - nv44_vm_fill(pgt, mmu->null, list, pte, cnt); - nvkm_done(pgt); -} - -static void -nv44_vm_unmap(struct nvkm_vma *vma, struct nvkm_memory *pgt, u32 pte, u32 cnt) -{ - struct nv04_mmu *mmu = nv04_mmu(vma->vm->mmu); - - nvkm_kmap(pgt); - if (pte & 3) { - u32 max = 4 - (pte & 3); - u32 part = (cnt > max) ? 
max : cnt; - nv44_vm_fill(pgt, mmu->null, NULL, pte, part); - pte += part; - cnt -= part; - } - - while (cnt >= 4) { - nvkm_wo32(pgt, pte++ * 4, 0x00000000); - nvkm_wo32(pgt, pte++ * 4, 0x00000000); - nvkm_wo32(pgt, pte++ * 4, 0x00000000); - nvkm_wo32(pgt, pte++ * 4, 0x00000000); - cnt -= 4; - } - - if (cnt) - nv44_vm_fill(pgt, mmu->null, NULL, pte, cnt); - nvkm_done(pgt); -} - -static void -nv44_vm_flush(struct nvkm_vm *vm) -{ - struct nv04_mmu *mmu = nv04_mmu(vm->mmu); - struct nvkm_device *device = mmu->base.subdev.device; - nvkm_wr32(device, 0x100814, mmu->base.limit - NV44_GART_PAGE); - nvkm_wr32(device, 0x100808, 0x00000020); - nvkm_msec(device, 2000, - if (nvkm_rd32(device, 0x100808) & 0x00000001) - break; - ); - nvkm_wr32(device, 0x100808, 0x00000000); -} - -/******************************************************************************* - * MMU subdev - ******************************************************************************/ - -static int -nv44_mmu_oneinit(struct nvkm_mmu *base) -{ - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - int ret; - - mmu->nullp = dma_alloc_coherent(device->dev, 16 * 1024, - &mmu->null, GFP_KERNEL); - if (!mmu->nullp) { - nvkm_warn(&mmu->base.subdev, "unable to allocate dummy pages\n"); - mmu->null = 0; - } - - ret = nvkm_vm_create(&mmu->base, 0, NV44_GART_SIZE, 0, 4096, NULL, - &mmu->vm); - if (ret) - return ret; - - ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, - (NV44_GART_SIZE / NV44_GART_PAGE) * 4, - 512 * 1024, true, - &mmu->vm->pgt[0].mem[0]); - mmu->vm->pgt[0].refcount[0] = 1; - return ret; -} +#include <nvif/class.h> static void -nv44_mmu_init(struct nvkm_mmu *base) +nv44_mmu_init(struct nvkm_mmu *mmu) { - struct nv04_mmu *mmu = nv04_mmu(base); - struct nvkm_device *device = mmu->base.subdev.device; - struct nvkm_memory *gart = mmu->vm->pgt[0].mem[0]; + struct nvkm_device *device = mmu->subdev.device; + struct nvkm_memory *pt = mmu->vmm->pd->pt[0]->memory; u32 addr; /* calculate vram address of this PRAMIN block, object must be @@ -198,11 +40,11 @@ nv44_mmu_init(struct nvkm_mmu *base) * of 512KiB for this to work correctly */ addr = nvkm_rd32(device, 0x10020c); - addr -= ((nvkm_memory_addr(gart) >> 19) + 1) << 19; + addr -= ((nvkm_memory_addr(pt) >> 19) + 1) << 19; nvkm_wr32(device, 0x100850, 0x80000000); - nvkm_wr32(device, 0x100818, mmu->null); - nvkm_wr32(device, 0x100804, NV44_GART_SIZE); + nvkm_wr32(device, 0x100818, mmu->vmm->null); + nvkm_wr32(device, 0x100804, (nvkm_memory_size(pt) / 4) * 4096); nvkm_wr32(device, 0x100850, 0x00008000); nvkm_mask(device, 0x10008c, 0x00000200, 0x00000200); nvkm_wr32(device, 0x100820, 0x00000000); @@ -212,17 +54,11 @@ nv44_mmu_init(struct nvkm_mmu *base) static const struct nvkm_mmu_func nv44_mmu = { - .dtor = nv04_mmu_dtor, - .oneinit = nv44_mmu_oneinit, .init = nv44_mmu_init, - .limit = NV44_GART_SIZE, .dma_bits = 39, - .pgt_bits = 32 - 12, - .spg_shift = 12, - .lpg_shift = 12, - .map_sg = nv44_vm_map_sg, - .unmap = nv44_vm_unmap, - .flush = nv44_vm_flush, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV04}}, + .mem = {{ -1, -1, NVIF_CLASS_MEM_NV04}, nv04_mem_new, nv04_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV04}, nv44_vmm_new, true }, }; int @@ -232,5 +68,5 @@ nv44_mmu_new(struct nvkm_device *device, int index, struct nvkm_mmu **pmmu) !nvkm_boolopt(device->cfgopt, "NvPCIE", true)) return nv04_mmu_new(device, index, pmmu); - return nv04_mmu_new_(&nv44_mmu, device, index, pmmu); + return nvkm_mmu_new_(&nv44_mmu, device, index, pmmu); } diff --git 
a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv50.c index a1f8d65f0276..db3dfbbb2aa0 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/nv50.c @@ -21,207 +21,52 @@ * * Authors: Ben Skeggs */ -#include "priv.h" +#include "mem.h" +#include "vmm.h" -#include <core/gpuobj.h> -#include <subdev/fb.h> -#include <subdev/timer.h> -#include <engine/gr.h> +#include <nvif/class.h> -static void -nv50_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 pde, struct nvkm_memory *pgt[2]) +const u8 * +nv50_mmu_kind(struct nvkm_mmu *base, int *count) { - u64 phys = 0xdeadcafe00000000ULL; - u32 coverage = 0; - - if (pgt[0]) { - /* present, 4KiB pages */ - phys = 0x00000003 | nvkm_memory_addr(pgt[0]); - coverage = (nvkm_memory_size(pgt[0]) >> 3) << 12; - } else - if (pgt[1]) { - /* present, 64KiB pages */ - phys = 0x00000001 | nvkm_memory_addr(pgt[1]); - coverage = (nvkm_memory_size(pgt[1]) >> 3) << 16; - } - - if (phys & 1) { - if (coverage <= 32 * 1024 * 1024) - phys |= 0x60; - else if (coverage <= 64 * 1024 * 1024) - phys |= 0x40; - else if (coverage <= 128 * 1024 * 1024) - phys |= 0x20; - } - - nvkm_kmap(pgd); - nvkm_wo32(pgd, (pde * 8) + 0, lower_32_bits(phys)); - nvkm_wo32(pgd, (pde * 8) + 4, upper_32_bits(phys)); - nvkm_done(pgd); -} - -static inline u64 -vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) -{ - phys |= 1; /* present */ - phys |= (u64)memtype << 40; - phys |= target << 4; - if (vma->access & NV_MEM_ACCESS_SYS) - phys |= (1 << 6); - if (!(vma->access & NV_MEM_ACCESS_WO)) - phys |= (1 << 3); - return phys; -} - -static void -nv50_vm_map(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta) -{ - struct nvkm_ram *ram = vma->vm->mmu->subdev.device->fb->ram; - u32 comp = (mem->memtype & 0x180) >> 7; - u32 block, target; - int i; - - /* IGPs don't have real VRAM, re-target to stolen system memory */ - target = 0; - if (ram->stolen) { - phys += ram->stolen; - target = 3; - } - - phys = vm_addr(vma, phys, mem->memtype, target); - pte <<= 3; - cnt <<= 3; - - nvkm_kmap(pgt); - while (cnt) { - u32 offset_h = upper_32_bits(phys); - u32 offset_l = lower_32_bits(phys); - - for (i = 7; i >= 0; i--) { - block = 1 << (i + 3); - if (cnt >= block && !(pte & (block - 1))) - break; - } - offset_l |= (i << 7); - - phys += block << (vma->node->type - 3); - cnt -= block; - if (comp) { - u32 tag = mem->tag->offset + ((delta >> 16) * comp); - offset_h |= (tag << 17); - delta += block << (vma->node->type - 3); - } - - while (block) { - nvkm_wo32(pgt, pte + 0, offset_l); - nvkm_wo32(pgt, pte + 4, offset_h); - pte += 8; - block -= 8; - } - } - nvkm_done(pgt); -} - -static void -nv50_vm_map_sg(struct nvkm_vma *vma, struct nvkm_memory *pgt, - struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) -{ - u32 target = (vma->access & NV_MEM_ACCESS_NOSNOOP) ? 
3 : 2; - pte <<= 3; - nvkm_kmap(pgt); - while (cnt--) { - u64 phys = vm_addr(vma, (u64)*list++, mem->memtype, target); - nvkm_wo32(pgt, pte + 0, lower_32_bits(phys)); - nvkm_wo32(pgt, pte + 4, upper_32_bits(phys)); - pte += 8; - } - nvkm_done(pgt); -} - -static void -nv50_vm_unmap(struct nvkm_vma *vma, struct nvkm_memory *pgt, u32 pte, u32 cnt) -{ - pte <<= 3; - nvkm_kmap(pgt); - while (cnt--) { - nvkm_wo32(pgt, pte + 0, 0x00000000); - nvkm_wo32(pgt, pte + 4, 0x00000000); - pte += 8; - } - nvkm_done(pgt); -} - -static void -nv50_vm_flush(struct nvkm_vm *vm) -{ - struct nvkm_mmu *mmu = vm->mmu; - struct nvkm_subdev *subdev = &mmu->subdev; - struct nvkm_device *device = subdev->device; - int i, vme; - - mutex_lock(&subdev->mutex); - for (i = 0; i < NVKM_SUBDEV_NR; i++) { - if (!atomic_read(&vm->engref[i])) - continue; - - /* unfortunate hw bug workaround... */ - if (i == NVKM_ENGINE_GR && device->gr) { - int ret = nvkm_gr_tlb_flush(device->gr); - if (ret != -ENODEV) - continue; - } - - switch (i) { - case NVKM_ENGINE_GR : vme = 0x00; break; - case NVKM_ENGINE_VP : - case NVKM_ENGINE_MSPDEC: vme = 0x01; break; - case NVKM_SUBDEV_BAR : vme = 0x06; break; - case NVKM_ENGINE_MSPPP : - case NVKM_ENGINE_MPEG : vme = 0x08; break; - case NVKM_ENGINE_BSP : - case NVKM_ENGINE_MSVLD : vme = 0x09; break; - case NVKM_ENGINE_CIPHER: - case NVKM_ENGINE_SEC : vme = 0x0a; break; - case NVKM_ENGINE_CE0 : vme = 0x0d; break; - default: - continue; - } - - nvkm_wr32(device, 0x100c80, (vme << 16) | 1); - if (nvkm_msec(device, 2000, - if (!(nvkm_rd32(device, 0x100c80) & 0x00000001)) - break; - ) < 0) - nvkm_error(subdev, "vm flush timeout: engine %d\n", vme); - } - mutex_unlock(&subdev->mutex); -} - -static int -nv50_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, - struct lock_class_key *key, struct nvkm_vm **pvm) -{ - u32 block = (1 << (mmu->func->pgt_bits + 12)); - if (block > length) - block = length; - - return nvkm_vm_create(mmu, offset, length, mm_offset, block, key, pvm); + /* 0x01: no bank swizzle + * 0x02: bank swizzled + * 0x7f: invalid + * + * 0x01/0x02 are values understood by the VRAM allocator, + * and are required to avoid mixing the two types within + * a certain range. 
+ */ + static const u8 + kind[128] = { + 0x01, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, /* 0x00 */ + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x01, 0x01, 0x01, 0x01, 0x7f, 0x7f, 0x7f, 0x7f, /* 0x10 */ + 0x02, 0x02, 0x02, 0x02, 0x7f, 0x7f, 0x7f, 0x7f, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7f, /* 0x20 */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, /* 0x30 */ + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, /* 0x40 */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x01, 0x01, 0x01, 0x7f, /* 0x50 */ + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7f, /* 0x60 */ + 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, + 0x01, 0x7f, 0x02, 0x7f, 0x01, 0x7f, 0x02, 0x7f, /* 0x70 */ + 0x01, 0x01, 0x02, 0x02, 0x01, 0x01, 0x7f, 0x7f + }; + *count = ARRAY_SIZE(kind); + return kind; } static const struct nvkm_mmu_func nv50_mmu = { - .limit = (1ULL << 40), .dma_bits = 40, - .pgt_bits = 29 - 12, - .spg_shift = 12, - .lpg_shift = 16, - .create = nv50_vm_create, - .map_pgt = nv50_vm_map_pgt, - .map = nv50_vm_map, - .map_sg = nv50_vm_map_sg, - .unmap = nv50_vm_unmap, - .flush = nv50_vm_flush, + .mmu = {{ -1, -1, NVIF_CLASS_MMU_NV50}}, + .mem = {{ -1, 0, NVIF_CLASS_MEM_NV50}, nv50_mem_new, nv50_mem_map }, + .vmm = {{ -1, -1, NVIF_CLASS_VMM_NV50}, nv50_vmm_new, false, 0x1400 }, + .kind = nv50_mmu_kind, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/priv.h index 27cedc60b507..d024d8055fcb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/priv.h @@ -9,31 +9,57 @@ int nvkm_mmu_new_(const struct nvkm_mmu_func *, struct nvkm_device *, int index, struct nvkm_mmu **); struct nvkm_mmu_func { - void *(*dtor)(struct nvkm_mmu *); - int (*oneinit)(struct nvkm_mmu *); void (*init)(struct nvkm_mmu *); - u64 limit; u8 dma_bits; - u32 pgt_bits; - u8 spg_shift; - u8 lpg_shift; - - int (*create)(struct nvkm_mmu *, u64 offset, u64 length, u64 mm_offset, - struct lock_class_key *, struct nvkm_vm **); - - void (*map_pgt)(struct nvkm_gpuobj *pgd, u32 pde, - struct nvkm_memory *pgt[2]); - void (*map)(struct nvkm_vma *, struct nvkm_memory *, - struct nvkm_mem *, u32 pte, u32 cnt, - u64 phys, u64 delta); - void (*map_sg)(struct nvkm_vma *, struct nvkm_memory *, - struct nvkm_mem *, u32 pte, u32 cnt, dma_addr_t *); - void (*unmap)(struct nvkm_vma *, struct nvkm_memory *pgt, - u32 pte, u32 cnt); - void (*flush)(struct nvkm_vm *); + + struct { + struct nvkm_sclass user; + } mmu; + + struct { + struct nvkm_sclass user; + int (*vram)(struct nvkm_mmu *, int type, u8 page, u64 size, + void *argv, u32 argc, struct nvkm_memory **); + int (*umap)(struct nvkm_mmu *, struct nvkm_memory *, void *argv, + u32 argc, u64 *addr, u64 *size, struct nvkm_vma **); + } mem; + + struct { + struct nvkm_sclass user; + int (*ctor)(struct nvkm_mmu *, u64 addr, u64 size, + void *argv, u32 argc, struct lock_class_key *, + const char *name, struct nvkm_vmm **); + bool global; + u32 pd_offset; + } vmm; + + const u8 *(*kind)(struct nvkm_mmu *, int *count); + bool kind_sys; +}; + +extern const struct nvkm_mmu_func nv04_mmu; + +const u8 *nv50_mmu_kind(struct nvkm_mmu *, int *count); + +const u8 *gf100_mmu_kind(struct nvkm_mmu *, int *count); + +const u8 *gm200_mmu_kind(struct nvkm_mmu *, int *); + +struct nvkm_mmu_pt { + union { + struct nvkm_mmu_ptc *ptc; + 
struct nvkm_mmu_ptp *ptp; + }; + struct nvkm_memory *memory; + bool sub; + u16 base; + u64 addr; + struct list_head head; }; -int nvkm_vm_create(struct nvkm_mmu *, u64, u64, u64, u32, - struct lock_class_key *, struct nvkm_vm **); +void nvkm_mmu_ptc_dump(struct nvkm_mmu *); +struct nvkm_mmu_pt * +nvkm_mmu_ptc_get(struct nvkm_mmu *, u32 size, u32 align, bool zero); +void nvkm_mmu_ptc_put(struct nvkm_mmu *, bool force, struct nvkm_mmu_pt **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.c new file mode 100644 index 000000000000..fac2f9a45ea6 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.c @@ -0,0 +1,192 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "umem.h" +#include "ummu.h" + +#include <core/client.h> +#include <core/memory.h> +#include <subdev/bar.h> + +#include <nvif/class.h> +#include <nvif/if000a.h> +#include <nvif/unpack.h> + +static const struct nvkm_object_func nvkm_umem; +struct nvkm_memory * +nvkm_umem_search(struct nvkm_client *client, u64 handle) +{ + struct nvkm_client *master = client->object.client; + struct nvkm_memory *memory = NULL; + struct nvkm_object *object; + struct nvkm_umem *umem; + + object = nvkm_object_search(client, handle, &nvkm_umem); + if (IS_ERR(object)) { + if (client->super && client != master) { + spin_lock(&master->lock); + list_for_each_entry(umem, &master->umem, head) { + if (umem->object.object == handle) { + memory = nvkm_memory_ref(umem->memory); + break; + } + } + spin_unlock(&master->lock); + } + } else { + umem = nvkm_umem(object); + if (!umem->priv || client->super) + memory = nvkm_memory_ref(umem->memory); + } + + return memory ? 
memory : ERR_PTR(-ENOENT); +} + +static int +nvkm_umem_unmap(struct nvkm_object *object) +{ + struct nvkm_umem *umem = nvkm_umem(object); + + if (!umem->map) + return -EEXIST; + + if (umem->io) { + if (!IS_ERR(umem->bar)) { + struct nvkm_device *device = umem->mmu->subdev.device; + nvkm_vmm_put(nvkm_bar_bar1_vmm(device), &umem->bar); + } else { + umem->bar = NULL; + } + } else { + vunmap(umem->map); + umem->map = NULL; + } + + return 0; +} + +static int +nvkm_umem_map(struct nvkm_object *object, void *argv, u32 argc, + enum nvkm_object_map *type, u64 *handle, u64 *length) +{ + struct nvkm_umem *umem = nvkm_umem(object); + struct nvkm_mmu *mmu = umem->mmu; + + if (!umem->mappable) + return -EINVAL; + if (umem->map) + return -EEXIST; + + if ((umem->type & NVKM_MEM_HOST) && !argc) { + int ret = nvkm_mem_map_host(umem->memory, &umem->map); + if (ret) + return ret; + + *handle = (unsigned long)(void *)umem->map; + *length = nvkm_memory_size(umem->memory); + *type = NVKM_OBJECT_MAP_VA; + return 0; + } else + if ((umem->type & NVKM_MEM_VRAM) || + (umem->type & NVKM_MEM_KIND)) { + int ret = mmu->func->mem.umap(mmu, umem->memory, argv, argc, + handle, length, &umem->bar); + if (ret) + return ret; + + *type = NVKM_OBJECT_MAP_IO; + } else { + return -EINVAL; + } + + umem->io = (*type == NVKM_OBJECT_MAP_IO); + return 0; +} + +static void * +nvkm_umem_dtor(struct nvkm_object *object) +{ + struct nvkm_umem *umem = nvkm_umem(object); + spin_lock(&umem->object.client->lock); + list_del_init(&umem->head); + spin_unlock(&umem->object.client->lock); + nvkm_memory_unref(&umem->memory); + return umem; +} + +static const struct nvkm_object_func +nvkm_umem = { + .dtor = nvkm_umem_dtor, + .map = nvkm_umem_map, + .unmap = nvkm_umem_unmap, +}; + +int +nvkm_umem_new(const struct nvkm_oclass *oclass, void *argv, u32 argc, + struct nvkm_object **pobject) +{ + struct nvkm_mmu *mmu = nvkm_ummu(oclass->parent)->mmu; + union { + struct nvif_mem_v0 v0; + } *args = argv; + struct nvkm_umem *umem; + int type, ret = -ENOSYS; + u8 page; + u64 size; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, true))) { + type = args->v0.type; + page = args->v0.page; + size = args->v0.size; + } else + return ret; + + if (type >= mmu->type_nr) + return -EINVAL; + + if (!(umem = kzalloc(sizeof(*umem), GFP_KERNEL))) + return -ENOMEM; + nvkm_object_ctor(&nvkm_umem, oclass, &umem->object); + umem->mmu = mmu; + umem->type = mmu->type[type].type; + umem->priv = oclass->client->super; + INIT_LIST_HEAD(&umem->head); + *pobject = &umem->object; + + if (mmu->type[type].type & NVKM_MEM_MAPPABLE) { + page = max_t(u8, page, PAGE_SHIFT); + umem->mappable = true; + } + + ret = nvkm_mem_new_type(mmu, type, page, size, argv, argc, + &umem->memory); + if (ret) + return ret; + + spin_lock(&umem->object.client->lock); + list_add(&umem->head, &umem->object.client->umem); + spin_unlock(&umem->object.client->lock); + + args->v0.page = nvkm_memory_page(umem->memory); + args->v0.addr = nvkm_memory_addr(umem->memory); + args->v0.size = nvkm_memory_size(umem->memory); + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.h new file mode 100644 index 000000000000..85cf692d620a --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/umem.h @@ -0,0 +1,26 @@ +#ifndef __NVKM_UMEM_H__ +#define __NVKM_UMEM_H__ +#define nvkm_umem(p) container_of((p), struct nvkm_umem, object) +#include <core/object.h> +#include "mem.h" + +struct nvkm_umem { + struct nvkm_object object; + struct nvkm_mmu 
*mmu; + u8 type:8; + bool priv:1; + bool mappable:1; + bool io:1; + + struct nvkm_memory *memory; + struct list_head head; + + union { + struct nvkm_vma *bar; + void *map; + }; +}; + +int nvkm_umem_new(const struct nvkm_oclass *, void *argv, u32 argc, + struct nvkm_object **); +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.c new file mode 100644 index 000000000000..353f10f92b77 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.c @@ -0,0 +1,178 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "ummu.h" +#include "umem.h" +#include "uvmm.h" + +#include <core/client.h> + +#include <nvif/if0008.h> +#include <nvif/unpack.h> + +static int +nvkm_ummu_sclass(struct nvkm_object *object, int index, + struct nvkm_oclass *oclass) +{ + struct nvkm_mmu *mmu = nvkm_ummu(object)->mmu; + + if (mmu->func->mem.user.oclass && oclass->client->super) { + if (index-- == 0) { + oclass->base = mmu->func->mem.user; + oclass->ctor = nvkm_umem_new; + return 0; + } + } + + if (mmu->func->vmm.user.oclass) { + if (index-- == 0) { + oclass->base = mmu->func->vmm.user; + oclass->ctor = nvkm_uvmm_new; + return 0; + } + } + + return -EINVAL; +} + +static int +nvkm_ummu_heap(struct nvkm_ummu *ummu, void *argv, u32 argc) +{ + struct nvkm_mmu *mmu = ummu->mmu; + union { + struct nvif_mmu_heap_v0 v0; + } *args = argv; + int ret = -ENOSYS; + u8 index; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + if ((index = args->v0.index) >= mmu->heap_nr) + return -EINVAL; + args->v0.size = mmu->heap[index].size; + } else + return ret; + + return 0; +} + +static int +nvkm_ummu_type(struct nvkm_ummu *ummu, void *argv, u32 argc) +{ + struct nvkm_mmu *mmu = ummu->mmu; + union { + struct nvif_mmu_type_v0 v0; + } *args = argv; + int ret = -ENOSYS; + u8 type, index; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + if ((index = args->v0.index) >= mmu->type_nr) + return -EINVAL; + type = mmu->type[index].type; + args->v0.heap = mmu->type[index].heap; + args->v0.vram = !!(type & NVKM_MEM_VRAM); + args->v0.host = !!(type & NVKM_MEM_HOST); + args->v0.comp = !!(type & NVKM_MEM_COMP); + args->v0.disp = !!(type & NVKM_MEM_DISP); + args->v0.kind = !!(type & NVKM_MEM_KIND); + args->v0.mappable = !!(type & NVKM_MEM_MAPPABLE); + args->v0.coherent = !!(type & NVKM_MEM_COHERENT); + args->v0.uncached = !!(type & NVKM_MEM_UNCACHED); + } else + 
return ret; + + return 0; +} + +static int +nvkm_ummu_kind(struct nvkm_ummu *ummu, void *argv, u32 argc) +{ + struct nvkm_mmu *mmu = ummu->mmu; + union { + struct nvif_mmu_kind_v0 v0; + } *args = argv; + const u8 *kind = NULL; + int ret = -ENOSYS, count = 0; + + if (mmu->func->kind) + kind = mmu->func->kind(mmu, &count); + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, true))) { + if (argc != args->v0.count * sizeof(*args->v0.data)) + return -EINVAL; + if (args->v0.count > count) + return -EINVAL; + memcpy(args->v0.data, kind, args->v0.count); + } else + return ret; + + return 0; +} + +static int +nvkm_ummu_mthd(struct nvkm_object *object, u32 mthd, void *argv, u32 argc) +{ + struct nvkm_ummu *ummu = nvkm_ummu(object); + switch (mthd) { + case NVIF_MMU_V0_HEAP: return nvkm_ummu_heap(ummu, argv, argc); + case NVIF_MMU_V0_TYPE: return nvkm_ummu_type(ummu, argv, argc); + case NVIF_MMU_V0_KIND: return nvkm_ummu_kind(ummu, argv, argc); + default: + break; + } + return -EINVAL; +} + +static const struct nvkm_object_func +nvkm_ummu = { + .mthd = nvkm_ummu_mthd, + .sclass = nvkm_ummu_sclass, +}; + +int +nvkm_ummu_new(struct nvkm_device *device, const struct nvkm_oclass *oclass, + void *argv, u32 argc, struct nvkm_object **pobject) +{ + union { + struct nvif_mmu_v0 v0; + } *args = argv; + struct nvkm_mmu *mmu = device->mmu; + struct nvkm_ummu *ummu; + int ret = -ENOSYS, kinds = 0; + + if (mmu->func->kind) + mmu->func->kind(mmu, &kinds); + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + args->v0.dmabits = mmu->dma_bits; + args->v0.heap_nr = mmu->heap_nr; + args->v0.type_nr = mmu->type_nr; + args->v0.kind_nr = kinds; + } else + return ret; + + if (!(ummu = kzalloc(sizeof(*ummu), GFP_KERNEL))) + return -ENOMEM; + nvkm_object_ctor(&nvkm_ummu, oclass, &ummu->object); + ummu->mmu = mmu; + *pobject = &ummu->object; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.h new file mode 100644 index 000000000000..0cd510dcfc68 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/ummu.h @@ -0,0 +1,14 @@ +#ifndef __NVKM_UMMU_H__ +#define __NVKM_UMMU_H__ +#define nvkm_ummu(p) container_of((p), struct nvkm_ummu, object) +#include <core/object.h> +#include "priv.h" + +struct nvkm_ummu { + struct nvkm_object object; + struct nvkm_mmu *mmu; +}; + +int nvkm_ummu_new(struct nvkm_device *, const struct nvkm_oclass *, + void *argv, u32 argc, struct nvkm_object **); +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c new file mode 100644 index 000000000000..fa81d0c1ba41 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c @@ -0,0 +1,352 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "uvmm.h" +#include "umem.h" +#include "ummu.h" + +#include <core/client.h> +#include <core/memory.h> + +#include <nvif/if000c.h> +#include <nvif/unpack.h> + +static const struct nvkm_object_func nvkm_uvmm; +struct nvkm_vmm * +nvkm_uvmm_search(struct nvkm_client *client, u64 handle) +{ + struct nvkm_object *object; + + object = nvkm_object_search(client, handle, &nvkm_uvmm); + if (IS_ERR(object)) + return (void *)object; + + return nvkm_uvmm(object)->vmm; +} + +static int +nvkm_uvmm_mthd_unmap(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + struct nvkm_client *client = uvmm->object.client; + union { + struct nvif_vmm_unmap_v0 v0; + } *args = argv; + struct nvkm_vmm *vmm = uvmm->vmm; + struct nvkm_vma *vma; + int ret = -ENOSYS; + u64 addr; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + addr = args->v0.addr; + } else + return ret; + + mutex_lock(&vmm->mutex); + vma = nvkm_vmm_node_search(vmm, addr); + if (ret = -ENOENT, !vma || vma->addr != addr) { + VMM_DEBUG(vmm, "lookup %016llx: %016llx", + addr, vma ? vma->addr : ~0ULL); + goto done; + } + + if (ret = -ENOENT, (!vma->user && !client->super) || vma->busy) { + VMM_DEBUG(vmm, "denied %016llx: %d %d %d", addr, + vma->user, !client->super, vma->busy); + goto done; + } + + if (ret = -EINVAL, !vma->memory) { + VMM_DEBUG(vmm, "unmapped"); + goto done; + } + + nvkm_vmm_unmap_locked(vmm, vma); + ret = 0; +done: + mutex_unlock(&vmm->mutex); + return ret; +} + +static int +nvkm_uvmm_mthd_map(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + struct nvkm_client *client = uvmm->object.client; + union { + struct nvif_vmm_map_v0 v0; + } *args = argv; + u64 addr, size, handle, offset; + struct nvkm_vmm *vmm = uvmm->vmm; + struct nvkm_vma *vma; + struct nvkm_memory *memory; + int ret = -ENOSYS; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, true))) { + addr = args->v0.addr; + size = args->v0.size; + handle = args->v0.memory; + offset = args->v0.offset; + } else + return ret; + + if (IS_ERR((memory = nvkm_umem_search(client, handle)))) { + VMM_DEBUG(vmm, "memory %016llx %ld\n", handle, PTR_ERR(memory)); + return PTR_ERR(memory); + } + + mutex_lock(&vmm->mutex); + if (ret = -ENOENT, !(vma = nvkm_vmm_node_search(vmm, addr))) { + VMM_DEBUG(vmm, "lookup %016llx", addr); + goto fail; + } + + if (ret = -ENOENT, (!vma->user && !client->super) || vma->busy) { + VMM_DEBUG(vmm, "denied %016llx: %d %d %d", addr, + vma->user, !client->super, vma->busy); + goto fail; + } + + if (ret = -EINVAL, vma->addr != addr || vma->size != size) { + if (addr + size > vma->addr + vma->size || vma->memory || + (vma->refd == NVKM_VMA_PAGE_NONE && !vma->mapref)) { + VMM_DEBUG(vmm, "split %d %d %d " + "%016llx %016llx %016llx %016llx", + !!vma->memory, vma->refd, vma->mapref, + addr, size, vma->addr, (u64)vma->size); + goto fail; + } + + if (vma->addr != addr) { + const u64 tail = vma->size + vma->addr - addr; + if (ret = -ENOMEM, !(vma = nvkm_vma_tail(vma, tail))) + goto fail; + vma->part = true; + nvkm_vmm_node_insert(vmm, vma); + } + + 
if (vma->size != size) { + const u64 tail = vma->size - size; + struct nvkm_vma *tmp; + if (ret = -ENOMEM, !(tmp = nvkm_vma_tail(vma, tail))) { + nvkm_vmm_unmap_region(vmm, vma); + goto fail; + } + tmp->part = true; + nvkm_vmm_node_insert(vmm, tmp); + } + } + vma->busy = true; + mutex_unlock(&vmm->mutex); + + ret = nvkm_memory_map(memory, offset, vmm, vma, argv, argc); + if (ret == 0) { + /* Successful map will clear vma->busy. */ + nvkm_memory_unref(&memory); + return 0; + } + + mutex_lock(&vmm->mutex); + vma->busy = false; + nvkm_vmm_unmap_region(vmm, vma); +fail: + mutex_unlock(&vmm->mutex); + nvkm_memory_unref(&memory); + return ret; +} + +static int +nvkm_uvmm_mthd_put(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + struct nvkm_client *client = uvmm->object.client; + union { + struct nvif_vmm_put_v0 v0; + } *args = argv; + struct nvkm_vmm *vmm = uvmm->vmm; + struct nvkm_vma *vma; + int ret = -ENOSYS; + u64 addr; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + addr = args->v0.addr; + } else + return ret; + + mutex_lock(&vmm->mutex); + vma = nvkm_vmm_node_search(vmm, args->v0.addr); + if (ret = -ENOENT, !vma || vma->addr != addr || vma->part) { + VMM_DEBUG(vmm, "lookup %016llx: %016llx %d", addr, + vma ? vma->addr : ~0ULL, vma ? vma->part : 0); + goto done; + } + + if (ret = -ENOENT, (!vma->user && !client->super) || vma->busy) { + VMM_DEBUG(vmm, "denied %016llx: %d %d %d", addr, + vma->user, !client->super, vma->busy); + goto done; + } + + nvkm_vmm_put_locked(vmm, vma); + ret = 0; +done: + mutex_unlock(&vmm->mutex); + return ret; +} + +static int +nvkm_uvmm_mthd_get(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + struct nvkm_client *client = uvmm->object.client; + union { + struct nvif_vmm_get_v0 v0; + } *args = argv; + struct nvkm_vmm *vmm = uvmm->vmm; + struct nvkm_vma *vma; + int ret = -ENOSYS; + bool getref, mapref, sparse; + u8 page, align; + u64 size; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + getref = args->v0.type == NVIF_VMM_GET_V0_PTES; + mapref = args->v0.type == NVIF_VMM_GET_V0_ADDR; + sparse = args->v0.sparse; + page = args->v0.page; + align = args->v0.align; + size = args->v0.size; + } else + return ret; + + mutex_lock(&vmm->mutex); + ret = nvkm_vmm_get_locked(vmm, getref, mapref, sparse, + page, align, size, &vma); + mutex_unlock(&vmm->mutex); + if (ret) + return ret; + + args->v0.addr = vma->addr; + vma->user = !client->super; + return ret; +} + +static int +nvkm_uvmm_mthd_page(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + union { + struct nvif_vmm_page_v0 v0; + } *args = argv; + const struct nvkm_vmm_page *page; + int ret = -ENOSYS; + u8 type, index, nr; + + page = uvmm->vmm->func->page; + for (nr = 0; page[nr].shift; nr++); + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + if ((index = args->v0.index) >= nr) + return -EINVAL; + type = page[index].type; + args->v0.shift = page[index].shift; + args->v0.sparse = !!(type & NVKM_VMM_PAGE_SPARSE); + args->v0.vram = !!(type & NVKM_VMM_PAGE_VRAM); + args->v0.host = !!(type & NVKM_VMM_PAGE_HOST); + args->v0.comp = !!(type & NVKM_VMM_PAGE_COMP); + } else + return -ENOSYS; + + return 0; +} + +static int +nvkm_uvmm_mthd(struct nvkm_object *object, u32 mthd, void *argv, u32 argc) +{ + struct nvkm_uvmm *uvmm = nvkm_uvmm(object); + switch (mthd) { + case NVIF_VMM_V0_PAGE : return nvkm_uvmm_mthd_page (uvmm, argv, argc); + case NVIF_VMM_V0_GET : return nvkm_uvmm_mthd_get (uvmm, argv, argc); + case NVIF_VMM_V0_PUT : return 
nvkm_uvmm_mthd_put (uvmm, argv, argc); + case NVIF_VMM_V0_MAP : return nvkm_uvmm_mthd_map (uvmm, argv, argc); + case NVIF_VMM_V0_UNMAP : return nvkm_uvmm_mthd_unmap (uvmm, argv, argc); + default: + break; + } + return -EINVAL; +} + +static void * +nvkm_uvmm_dtor(struct nvkm_object *object) +{ + struct nvkm_uvmm *uvmm = nvkm_uvmm(object); + nvkm_vmm_unref(&uvmm->vmm); + return uvmm; +} + +static const struct nvkm_object_func +nvkm_uvmm = { + .dtor = nvkm_uvmm_dtor, + .mthd = nvkm_uvmm_mthd, +}; + +int +nvkm_uvmm_new(const struct nvkm_oclass *oclass, void *argv, u32 argc, + struct nvkm_object **pobject) +{ + struct nvkm_mmu *mmu = nvkm_ummu(oclass->parent)->mmu; + const bool more = oclass->base.maxver >= 0; + union { + struct nvif_vmm_v0 v0; + } *args = argv; + const struct nvkm_vmm_page *page; + struct nvkm_uvmm *uvmm; + int ret = -ENOSYS; + u64 addr, size; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, more))) { + addr = args->v0.addr; + size = args->v0.size; + } else + return ret; + + if (!(uvmm = kzalloc(sizeof(*uvmm), GFP_KERNEL))) + return -ENOMEM; + nvkm_object_ctor(&nvkm_uvmm, oclass, &uvmm->object); + *pobject = &uvmm->object; + + if (!mmu->vmm) { + ret = mmu->func->vmm.ctor(mmu, addr, size, argv, argc, + NULL, "user", &uvmm->vmm); + if (ret) + return ret; + + uvmm->vmm->debug = max(uvmm->vmm->debug, oclass->client->debug); + } else { + if (size) + return -EINVAL; + + uvmm->vmm = nvkm_vmm_ref(mmu->vmm); + } + + page = uvmm->vmm->func->page; + args->v0.page_nr = 0; + while (page && (page++)->shift) + args->v0.page_nr++; + args->v0.addr = uvmm->vmm->start; + args->v0.size = uvmm->vmm->limit; + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.h new file mode 100644 index 000000000000..71dab55e18a9 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.h @@ -0,0 +1,14 @@ +#ifndef __NVKM_UVMM_H__ +#define __NVKM_UVMM_H__ +#define nvkm_uvmm(p) container_of((p), struct nvkm_uvmm, object) +#include <core/object.h> +#include "vmm.h" + +struct nvkm_uvmm { + struct nvkm_object object; + struct nvkm_vmm *vmm; +}; + +int nvkm_uvmm_new(const struct nvkm_oclass *, void *argv, u32 argc, + struct nvkm_object **); +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c new file mode 100644 index 000000000000..e35d3e17cd7c --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c @@ -0,0 +1,1513 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#define NVKM_VMM_LEVELS_MAX 5 +#include "vmm.h" + +#include <subdev/fb.h> + +static void +nvkm_vmm_pt_del(struct nvkm_vmm_pt **ppgt) +{ + struct nvkm_vmm_pt *pgt = *ppgt; + if (pgt) { + kvfree(pgt->pde); + kfree(pgt); + *ppgt = NULL; + } +} + + +static struct nvkm_vmm_pt * +nvkm_vmm_pt_new(const struct nvkm_vmm_desc *desc, bool sparse, + const struct nvkm_vmm_page *page) +{ + const u32 pten = 1 << desc->bits; + struct nvkm_vmm_pt *pgt; + u32 lpte = 0; + + if (desc->type > PGT) { + if (desc->type == SPT) { + const struct nvkm_vmm_desc *pair = page[-1].desc; + lpte = pten >> (desc->bits - pair->bits); + } else { + lpte = pten; + } + } + + if (!(pgt = kzalloc(sizeof(*pgt) + lpte, GFP_KERNEL))) + return NULL; + pgt->page = page ? page->shift : 0; + pgt->sparse = sparse; + + if (desc->type == PGD) { + pgt->pde = kvzalloc(sizeof(*pgt->pde) * pten, GFP_KERNEL); + if (!pgt->pde) { + kfree(pgt); + return NULL; + } + } + + return pgt; +} + +struct nvkm_vmm_iter { + const struct nvkm_vmm_page *page; + const struct nvkm_vmm_desc *desc; + struct nvkm_vmm *vmm; + u64 cnt; + u16 max, lvl; + u32 pte[NVKM_VMM_LEVELS_MAX]; + struct nvkm_vmm_pt *pt[NVKM_VMM_LEVELS_MAX]; + int flush; +}; + +#ifdef CONFIG_NOUVEAU_DEBUG_MMU +static const char * +nvkm_vmm_desc_type(const struct nvkm_vmm_desc *desc) +{ + switch (desc->type) { + case PGD: return "PGD"; + case PGT: return "PGT"; + case SPT: return "SPT"; + case LPT: return "LPT"; + default: + return "UNKNOWN"; + } +} + +static void +nvkm_vmm_trace(struct nvkm_vmm_iter *it, char *buf) +{ + int lvl; + for (lvl = it->max; lvl >= 0; lvl--) { + if (lvl >= it->lvl) + buf += sprintf(buf, "%05x:", it->pte[lvl]); + else + buf += sprintf(buf, "xxxxx:"); + } +} + +#define TRA(i,f,a...) do { \ + char _buf[NVKM_VMM_LEVELS_MAX * 7]; \ + struct nvkm_vmm_iter *_it = (i); \ + nvkm_vmm_trace(_it, _buf); \ + VMM_TRACE(_it->vmm, "%s "f, _buf, ##a); \ +} while(0) +#else +#define TRA(i,f,a...) +#endif + +static inline void +nvkm_vmm_flush_mark(struct nvkm_vmm_iter *it) +{ + it->flush = min(it->flush, it->max - it->lvl); +} + +static inline void +nvkm_vmm_flush(struct nvkm_vmm_iter *it) +{ + if (it->flush != NVKM_VMM_LEVELS_MAX) { + if (it->vmm->func->flush) { + TRA(it, "flush: %d", it->flush); + it->vmm->func->flush(it->vmm, it->flush); + } + it->flush = NVKM_VMM_LEVELS_MAX; + } +} + +static void +nvkm_vmm_unref_pdes(struct nvkm_vmm_iter *it) +{ + const struct nvkm_vmm_desc *desc = it->desc; + const int type = desc[it->lvl].type == SPT; + struct nvkm_vmm_pt *pgd = it->pt[it->lvl + 1]; + struct nvkm_vmm_pt *pgt = it->pt[it->lvl]; + struct nvkm_mmu_pt *pt = pgt->pt[type]; + struct nvkm_vmm *vmm = it->vmm; + u32 pdei = it->pte[it->lvl + 1]; + + /* Recurse up the tree, unreferencing/destroying unneeded PDs. */ + it->lvl++; + if (--pgd->refs[0]) { + const struct nvkm_vmm_desc_func *func = desc[it->lvl].func; + /* PD has other valid PDEs, so we need a proper update. */ + TRA(it, "PDE unmap %s", nvkm_vmm_desc_type(&desc[it->lvl - 1])); + pgt->pt[type] = NULL; + if (!pgt->refs[!type]) { + /* PDE no longer required. 
*/
+ if (pgd->pt[0]) {
+ if (pgt->sparse) {
+ func->sparse(vmm, pgd->pt[0], pdei, 1);
+ pgd->pde[pdei] = NVKM_VMM_PDE_SPARSE;
+ } else {
+ func->unmap(vmm, pgd->pt[0], pdei, 1);
+ pgd->pde[pdei] = NULL;
+ }
+ } else {
+ /* Special handling for Tesla-class GPUs,
+ * where there's no central PD, but each
+ * instance has its own embedded PD.
+ */
+ func->pde(vmm, pgd, pdei);
+ pgd->pde[pdei] = NULL;
+ }
+ } else {
+ /* PDE was pointing at dual-PTs and we're removing
+ * one of them, leaving the other in place.
+ */
+ func->pde(vmm, pgd, pdei);
+ }
+
+ /* GPU may have cached the PTs, flush before freeing. */
+ nvkm_vmm_flush_mark(it);
+ nvkm_vmm_flush(it);
+ } else {
+ /* PD has no valid PDEs left, so we can just destroy it. */
+ nvkm_vmm_unref_pdes(it);
+ }
+
+ /* Destroy PD/PT. */
+ TRA(it, "PDE free %s", nvkm_vmm_desc_type(&desc[it->lvl - 1]));
+ nvkm_mmu_ptc_put(vmm->mmu, vmm->bootstrapped, &pt);
+ if (!pgt->refs[!type])
+ nvkm_vmm_pt_del(&pgt);
+ it->lvl--;
+}
+
+static void
+nvkm_vmm_unref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt,
+ const struct nvkm_vmm_desc *desc, u32 ptei, u32 ptes)
+{
+ const struct nvkm_vmm_desc *pair = it->page[-1].desc;
+ const u32 sptb = desc->bits - pair->bits;
+ const u32 sptn = 1 << sptb;
+ struct nvkm_vmm *vmm = it->vmm;
+ u32 spti = ptei & (sptn - 1), lpti, pteb;
+
+ /* Determine how many SPTEs are being touched under each LPTE,
+ * and drop reference counts.
+ */
+ for (lpti = ptei >> sptb; ptes; spti = 0, lpti++) {
+ const u32 pten = min(sptn - spti, ptes);
+ pgt->pte[lpti] -= pten;
+ ptes -= pten;
+ }
+
+ /* We're done here if there's no corresponding LPT. */
+ if (!pgt->refs[0])
+ return;
+
+ for (ptei = pteb = ptei >> sptb; ptei < lpti; pteb = ptei) {
+ /* Skip over any LPTEs that still have valid SPTEs. */
+ if (pgt->pte[pteb] & NVKM_VMM_PTE_SPTES) {
+ for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+ if (!(pgt->pte[ptei] & NVKM_VMM_PTE_SPTES))
+ break;
+ }
+ continue;
+ }
+
+ /* As there are no more non-UNMAPPED SPTEs left in the range
+ * covered by a number of LPTEs, the LPTEs once again take
+ * control over their address range.
+ *
+ * Determine how many LPTEs need to transition state.
+ */
+ pgt->pte[ptei] &= ~NVKM_VMM_PTE_VALID;
+ for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+ if (pgt->pte[ptei] & NVKM_VMM_PTE_SPTES)
+ break;
+ pgt->pte[ptei] &= ~NVKM_VMM_PTE_VALID;
+ }
+
+ if (pgt->pte[pteb] & NVKM_VMM_PTE_SPARSE) {
+ TRA(it, "LPTE %05x: U -> S %d PTEs", pteb, ptes);
+ pair->func->sparse(vmm, pgt->pt[0], pteb, ptes);
+ } else
+ if (pair->func->invalid) {
+ /* If the MMU supports it, restore the LPTE to the
+ * INVALID state to tell the MMU there is no point
+ * trying to fetch the corresponding SPTEs.
+ */
+ TRA(it, "LPTE %05x: U -> I %d PTEs", pteb, ptes);
+ pair->func->invalid(vmm, pgt->pt[0], pteb, ptes);
+ }
+ }
+}
+
+static bool
+nvkm_vmm_unref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes)
+{
+ const struct nvkm_vmm_desc *desc = it->desc;
+ const int type = desc->type == SPT;
+ struct nvkm_vmm_pt *pgt = it->pt[0];
+
+ /* Drop PTE references. */
+ pgt->refs[type] -= ptes;
+
+ /* Dual-PTs need special handling, unless the PDE is becoming invalid. */
+ if (desc->type == SPT && (pgt->refs[0] || pgt->refs[1]))
+ nvkm_vmm_unref_sptes(it, pgt, desc, ptei, ptes);
+
+ /* PT no longer needed? Destroy it. */
+ if (!pgt->refs[type]) {
+ it->lvl++;
+ TRA(it, "%s empty", nvkm_vmm_desc_type(desc));
+ it->lvl--;
+ nvkm_vmm_unref_pdes(it);
+ return false; /* PTE writes for unmap() not necessary.
*/ + } + + return true; +} + +static void +nvkm_vmm_ref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt, + const struct nvkm_vmm_desc *desc, u32 ptei, u32 ptes) +{ + const struct nvkm_vmm_desc *pair = it->page[-1].desc; + const u32 sptb = desc->bits - pair->bits; + const u32 sptn = 1 << sptb; + struct nvkm_vmm *vmm = it->vmm; + u32 spti = ptei & (sptn - 1), lpti, pteb; + + /* Determine how many SPTEs are being touched under each LPTE, + * and increase reference counts. + */ + for (lpti = ptei >> sptb; ptes; spti = 0, lpti++) { + const u32 pten = min(sptn - spti, ptes); + pgt->pte[lpti] += pten; + ptes -= pten; + } + + /* We're done here if there's no corresponding LPT. */ + if (!pgt->refs[0]) + return; + + for (ptei = pteb = ptei >> sptb; ptei < lpti; pteb = ptei) { + /* Skip over any LPTEs that already have valid SPTEs. */ + if (pgt->pte[pteb] & NVKM_VMM_PTE_VALID) { + for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) { + if (!(pgt->pte[ptei] & NVKM_VMM_PTE_VALID)) + break; + } + continue; + } + + /* As there are now non-UNMAPPED SPTEs in the range covered + * by a number of LPTEs, we need to transfer control of the + * address range to the SPTEs. + * + * Determine how many LPTEs need to transition state. + */ + pgt->pte[ptei] |= NVKM_VMM_PTE_VALID; + for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) { + if (pgt->pte[ptei] & NVKM_VMM_PTE_VALID) + break; + pgt->pte[ptei] |= NVKM_VMM_PTE_VALID; + } + + if (pgt->pte[pteb] & NVKM_VMM_PTE_SPARSE) { + const u32 spti = pteb * sptn; + const u32 sptc = ptes * sptn; + /* The entire LPTE is marked as sparse, we need + * to make sure that the SPTEs are too. + */ + TRA(it, "SPTE %05x: U -> S %d PTEs", spti, sptc); + desc->func->sparse(vmm, pgt->pt[1], spti, sptc); + /* Sparse LPTEs prevent SPTEs from being accessed. */ + TRA(it, "LPTE %05x: S -> U %d PTEs", pteb, ptes); + pair->func->unmap(vmm, pgt->pt[0], pteb, ptes); + } else + if (pair->func->invalid) { + /* MMU supports blocking SPTEs by marking an LPTE + * as INVALID. We need to reverse that here. + */ + TRA(it, "LPTE %05x: I -> U %d PTEs", pteb, ptes); + pair->func->unmap(vmm, pgt->pt[0], pteb, ptes); + } + } +} + +static bool +nvkm_vmm_ref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) +{ + const struct nvkm_vmm_desc *desc = it->desc; + const int type = desc->type == SPT; + struct nvkm_vmm_pt *pgt = it->pt[0]; + + /* Take PTE references. */ + pgt->refs[type] += ptes; + + /* Dual-PTs need special handling. 
*/ + if (desc->type == SPT) + nvkm_vmm_ref_sptes(it, pgt, desc, ptei, ptes); + + return true; +} + +static void +nvkm_vmm_sparse_ptes(const struct nvkm_vmm_desc *desc, + struct nvkm_vmm_pt *pgt, u32 ptei, u32 ptes) +{ + if (desc->type == PGD) { + while (ptes--) + pgt->pde[ptei++] = NVKM_VMM_PDE_SPARSE; + } else + if (desc->type == LPT) { + memset(&pgt->pte[ptei], NVKM_VMM_PTE_SPARSE, ptes); + } +} + +static bool +nvkm_vmm_sparse_unref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) +{ + struct nvkm_vmm_pt *pt = it->pt[0]; + if (it->desc->type == PGD) + memset(&pt->pde[ptei], 0x00, sizeof(pt->pde[0]) * ptes); + else + if (it->desc->type == LPT) + memset(&pt->pte[ptei], 0x00, sizeof(pt->pte[0]) * ptes); + return nvkm_vmm_unref_ptes(it, ptei, ptes); +} + +static bool +nvkm_vmm_sparse_ref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) +{ + nvkm_vmm_sparse_ptes(it->desc, it->pt[0], ptei, ptes); + return nvkm_vmm_ref_ptes(it, ptei, ptes); +} + +static bool +nvkm_vmm_ref_hwpt(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + const struct nvkm_vmm_desc *desc = &it->desc[it->lvl - 1]; + const int type = desc->type == SPT; + struct nvkm_vmm_pt *pgt = pgd->pde[pdei]; + const bool zero = !pgt->sparse && !desc->func->invalid; + struct nvkm_vmm *vmm = it->vmm; + struct nvkm_mmu *mmu = vmm->mmu; + struct nvkm_mmu_pt *pt; + u32 pten = 1 << desc->bits; + u32 pteb, ptei, ptes; + u32 size = desc->size * pten; + + pgd->refs[0]++; + + pgt->pt[type] = nvkm_mmu_ptc_get(mmu, size, desc->align, zero); + if (!pgt->pt[type]) { + it->lvl--; + nvkm_vmm_unref_pdes(it); + return false; + } + + if (zero) + goto done; + + pt = pgt->pt[type]; + + if (desc->type == LPT && pgt->refs[1]) { + /* SPT already exists covering the same range as this LPT, + * which means we need to be careful that any LPTEs which + * overlap valid SPTEs are unmapped as opposed to invalid + * or sparse, which would prevent the MMU from looking at + * the SPTEs on some GPUs. 
+ */ + for (ptei = pteb = 0; ptei < pten; pteb = ptei) { + bool spte = pgt->pte[ptei] & NVKM_VMM_PTE_SPTES; + for (ptes = 1, ptei++; ptei < pten; ptes++, ptei++) { + bool next = pgt->pte[ptei] & NVKM_VMM_PTE_SPTES; + if (spte != next) + break; + } + + if (!spte) { + if (pgt->sparse) + desc->func->sparse(vmm, pt, pteb, ptes); + else + desc->func->invalid(vmm, pt, pteb, ptes); + memset(&pgt->pte[pteb], 0x00, ptes); + } else { + desc->func->unmap(vmm, pt, pteb, ptes); + while (ptes--) + pgt->pte[pteb++] |= NVKM_VMM_PTE_VALID; + } + } + } else { + if (pgt->sparse) { + nvkm_vmm_sparse_ptes(desc, pgt, 0, pten); + desc->func->sparse(vmm, pt, 0, pten); + } else { + desc->func->invalid(vmm, pt, 0, pten); + } + } + +done: + TRA(it, "PDE write %s", nvkm_vmm_desc_type(desc)); + it->desc[it->lvl].func->pde(it->vmm, pgd, pdei); + nvkm_vmm_flush_mark(it); + return true; +} + +static bool +nvkm_vmm_ref_swpt(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + const struct nvkm_vmm_desc *desc = &it->desc[it->lvl - 1]; + struct nvkm_vmm_pt *pgt = pgd->pde[pdei]; + + pgt = nvkm_vmm_pt_new(desc, NVKM_VMM_PDE_SPARSED(pgt), it->page); + if (!pgt) { + if (!pgd->refs[0]) + nvkm_vmm_unref_pdes(it); + return false; + } + + pgd->pde[pdei] = pgt; + return true; +} + +static inline u64 +nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, + u64 addr, u64 size, const char *name, bool ref, + bool (*REF_PTES)(struct nvkm_vmm_iter *, u32, u32), + nvkm_vmm_pte_func MAP_PTES, struct nvkm_vmm_map *map, + nvkm_vmm_pxe_func CLR_PTES) +{ + const struct nvkm_vmm_desc *desc = page->desc; + struct nvkm_vmm_iter it; + u64 bits = addr >> page->shift; + + it.page = page; + it.desc = desc; + it.vmm = vmm; + it.cnt = size >> page->shift; + it.flush = NVKM_VMM_LEVELS_MAX; + + /* Deconstruct address into PTE indices for each mapping level. */ + for (it.lvl = 0; desc[it.lvl].bits; it.lvl++) { + it.pte[it.lvl] = bits & ((1 << desc[it.lvl].bits) - 1); + bits >>= desc[it.lvl].bits; + } + it.max = --it.lvl; + it.pt[it.max] = vmm->pd; + + it.lvl = 0; + TRA(&it, "%s: %016llx %016llx %d %lld PTEs", name, + addr, size, page->shift, it.cnt); + it.lvl = it.max; + + /* Depth-first traversal of page tables. */ + while (it.cnt) { + struct nvkm_vmm_pt *pgt = it.pt[it.lvl]; + const int type = desc->type == SPT; + const u32 pten = 1 << desc->bits; + const u32 ptei = it.pte[0]; + const u32 ptes = min_t(u64, it.cnt, pten - ptei); + + /* Walk down the tree, finding page tables for each level. */ + for (; it.lvl; it.lvl--) { + const u32 pdei = it.pte[it.lvl]; + struct nvkm_vmm_pt *pgd = pgt; + + /* Software PT. */ + if (ref && NVKM_VMM_PDE_INVALID(pgd->pde[pdei])) { + if (!nvkm_vmm_ref_swpt(&it, pgd, pdei)) + goto fail; + } + it.pt[it.lvl - 1] = pgt = pgd->pde[pdei]; + + /* Hardware PT. + * + * This is a separate step from above due to GF100 and + * newer having dual page tables at some levels, which + * are refcounted independently. + */ + if (ref && !pgt->refs[desc[it.lvl - 1].type == SPT]) { + if (!nvkm_vmm_ref_hwpt(&it, pgd, pdei)) + goto fail; + } + } + + /* Handle PTE updates. */ + if (!REF_PTES || REF_PTES(&it, ptei, ptes)) { + struct nvkm_mmu_pt *pt = pgt->pt[type]; + if (MAP_PTES || CLR_PTES) { + if (MAP_PTES) + MAP_PTES(vmm, pt, ptei, ptes, map); + else + CLR_PTES(vmm, pt, ptei, ptes); + nvkm_vmm_flush_mark(&it); + } + } + + /* Walk back up the tree to the next position. 
*/
+ it.pte[it.lvl] += ptes;
+ it.cnt -= ptes;
+ if (it.cnt) {
+ while (it.pte[it.lvl] == (1 << desc[it.lvl].bits)) {
+ it.pte[it.lvl++] = 0;
+ it.pte[it.lvl]++;
+ }
+ }
+ }
+
+ nvkm_vmm_flush(&it);
+ return ~0ULL;
+
+fail:
+ /* Reconstruct the failure address so the caller is able to
+ * reverse any partially completed operations.
+ */
+ addr = it.pte[it.max--];
+ do {
+ addr = addr << desc[it.max].bits;
+ addr |= it.pte[it.max];
+ } while (it.max--);
+
+ return addr << page->shift;
+}
+
+static void
+nvkm_vmm_ptes_sparse_put(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+ u64 addr, u64 size)
+{
+ nvkm_vmm_iter(vmm, page, addr, size, "sparse unref", false,
+ nvkm_vmm_sparse_unref_ptes, NULL, NULL,
+ page->desc->func->invalid ?
+ page->desc->func->invalid : page->desc->func->unmap);
+}
+
+static int
+nvkm_vmm_ptes_sparse_get(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+ u64 addr, u64 size)
+{
+ if ((page->type & NVKM_VMM_PAGE_SPARSE)) {
+ u64 fail = nvkm_vmm_iter(vmm, page, addr, size, "sparse ref",
+ true, nvkm_vmm_sparse_ref_ptes, NULL,
+ NULL, page->desc->func->sparse);
+ if (fail != ~0ULL) {
+ if ((size = fail - addr))
+ nvkm_vmm_ptes_sparse_put(vmm, page, addr, size);
+ return -ENOMEM;
+ }
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static int
+nvkm_vmm_ptes_sparse(struct nvkm_vmm *vmm, u64 addr, u64 size, bool ref)
+{
+ const struct nvkm_vmm_page *page = vmm->func->page;
+ int m = 0, i;
+ u64 start = addr;
+ u64 block;
+
+ while (size) {
+ /* Limit maximum page size based on remaining size. */
+ while (size < (1ULL << page[m].shift))
+ m++;
+ i = m;
+
+ /* Find largest page size suitable for alignment. */
+ while (!IS_ALIGNED(addr, 1ULL << page[i].shift))
+ i++;
+
+ /* Determine number of PTEs at this page size. */
+ if (i != m) {
+ /* Limited to alignment boundary of next page size. */
+ u64 next = 1ULL << page[i - 1].shift;
+ u64 part = ALIGN(addr, next) - addr;
+ if (size - part >= next)
+ block = (part >> page[i].shift) << page[i].shift;
+ else
+ block = (size >> page[i].shift) << page[i].shift;
+ } else {
+ block = (size >> page[i].shift) << page[i].shift;
+ }
+
+ /* Perform operation. */
+ if (ref) {
+ int ret = nvkm_vmm_ptes_sparse_get(vmm, &page[i], addr, block);
+ if (ret) {
+ if ((size = addr - start))
+ nvkm_vmm_ptes_sparse(vmm, start, size, false);
+ return ret;
+ }
+ } else {
+ nvkm_vmm_ptes_sparse_put(vmm, &page[i], addr, block);
+ }
+
+ size -= block;
+ addr += block;
+ }
+
+ return 0;
+}
+
+static void
+nvkm_vmm_ptes_unmap_put(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+ u64 addr, u64 size, bool sparse)
+{
+ const struct nvkm_vmm_desc_func *func = page->desc->func;
+ nvkm_vmm_iter(vmm, page, addr, size, "unmap + unref",
+ false, nvkm_vmm_unref_ptes, NULL, NULL,
+ sparse ? func->sparse : func->invalid ? func->invalid :
+ func->unmap);
+}
+
+static int
+nvkm_vmm_ptes_get_map(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+ u64 addr, u64 size, struct nvkm_vmm_map *map,
+ nvkm_vmm_pte_func func)
+{
+ u64 fail = nvkm_vmm_iter(vmm, page, addr, size, "ref + map", true,
+ nvkm_vmm_ref_ptes, func, map, NULL);
+ if (fail != ~0ULL) {
+ if ((size = fail - addr))
+ nvkm_vmm_ptes_unmap_put(vmm, page, addr, size, false);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void
+nvkm_vmm_ptes_unmap(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+ u64 addr, u64 size, bool sparse)
+{
+ const struct nvkm_vmm_desc_func *func = page->desc->func;
+ nvkm_vmm_iter(vmm, page, addr, size, "unmap", false, NULL, NULL, NULL,
+ sparse ?
func->sparse : func->invalid ? func->invalid : + func->unmap); +} + +static void +nvkm_vmm_ptes_map(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, + u64 addr, u64 size, struct nvkm_vmm_map *map, + nvkm_vmm_pte_func func) +{ + nvkm_vmm_iter(vmm, page, addr, size, "map", false, + NULL, func, map, NULL); +} + +static void +nvkm_vmm_ptes_put(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, + u64 addr, u64 size) +{ + nvkm_vmm_iter(vmm, page, addr, size, "unref", false, + nvkm_vmm_unref_ptes, NULL, NULL, NULL); +} + +static int +nvkm_vmm_ptes_get(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, + u64 addr, u64 size) +{ + u64 fail = nvkm_vmm_iter(vmm, page, addr, size, "ref", true, + nvkm_vmm_ref_ptes, NULL, NULL, NULL); + if (fail != ~0ULL) { + if (fail != addr) + nvkm_vmm_ptes_put(vmm, page, addr, fail - addr); + return -ENOMEM; + } + return 0; +} + +static inline struct nvkm_vma * +nvkm_vma_new(u64 addr, u64 size) +{ + struct nvkm_vma *vma = kzalloc(sizeof(*vma), GFP_KERNEL); + if (vma) { + vma->addr = addr; + vma->size = size; + vma->page = NVKM_VMA_PAGE_NONE; + vma->refd = NVKM_VMA_PAGE_NONE; + } + return vma; +} + +struct nvkm_vma * +nvkm_vma_tail(struct nvkm_vma *vma, u64 tail) +{ + struct nvkm_vma *new; + + BUG_ON(vma->size == tail); + + if (!(new = nvkm_vma_new(vma->addr + (vma->size - tail), tail))) + return NULL; + vma->size -= tail; + + new->mapref = vma->mapref; + new->sparse = vma->sparse; + new->page = vma->page; + new->refd = vma->refd; + new->used = vma->used; + new->part = vma->part; + new->user = vma->user; + new->busy = vma->busy; + list_add(&new->head, &vma->head); + return new; +} + +static void +nvkm_vmm_free_insert(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + struct rb_node **ptr = &vmm->free.rb_node; + struct rb_node *parent = NULL; + + while (*ptr) { + struct nvkm_vma *this = rb_entry(*ptr, typeof(*this), tree); + parent = *ptr; + if (vma->size < this->size) + ptr = &parent->rb_left; + else + if (vma->size > this->size) + ptr = &parent->rb_right; + else + if (vma->addr < this->addr) + ptr = &parent->rb_left; + else + if (vma->addr > this->addr) + ptr = &parent->rb_right; + else + BUG(); + } + + rb_link_node(&vma->tree, parent, ptr); + rb_insert_color(&vma->tree, &vmm->free); +} + +void +nvkm_vmm_node_insert(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + struct rb_node **ptr = &vmm->root.rb_node; + struct rb_node *parent = NULL; + + while (*ptr) { + struct nvkm_vma *this = rb_entry(*ptr, typeof(*this), tree); + parent = *ptr; + if (vma->addr < this->addr) + ptr = &parent->rb_left; + else + if (vma->addr > this->addr) + ptr = &parent->rb_right; + else + BUG(); + } + + rb_link_node(&vma->tree, parent, ptr); + rb_insert_color(&vma->tree, &vmm->root); +} + +struct nvkm_vma * +nvkm_vmm_node_search(struct nvkm_vmm *vmm, u64 addr) +{ + struct rb_node *node = vmm->root.rb_node; + while (node) { + struct nvkm_vma *vma = rb_entry(node, typeof(*vma), tree); + if (addr < vma->addr) + node = node->rb_left; + else + if (addr >= vma->addr + vma->size) + node = node->rb_right; + else + return vma; + } + return NULL; +} + +static void +nvkm_vmm_dtor(struct nvkm_vmm *vmm) +{ + struct nvkm_vma *vma; + struct rb_node *node; + + while ((node = rb_first(&vmm->root))) { + struct nvkm_vma *vma = rb_entry(node, typeof(*vma), tree); + nvkm_vmm_put(vmm, &vma); + } + + if (vmm->bootstrapped) { + const struct nvkm_vmm_page *page = vmm->func->page; + const u64 limit = vmm->limit - vmm->start; + + while (page[1].shift) + page++; + + nvkm_mmu_ptc_dump(vmm->mmu); + 
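+ /* Drop the PTE references taken across the whole address space
+ * by nvkm_vmm_boot() at the smallest supported page size.
+ */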
nvkm_vmm_ptes_put(vmm, page, vmm->start, limit);
+ }
+
+ vma = list_first_entry(&vmm->list, typeof(*vma), head);
+ list_del(&vma->head);
+ kfree(vma);
+ WARN_ON(!list_empty(&vmm->list));
+
+ if (vmm->nullp) {
+ dma_free_coherent(vmm->mmu->subdev.device->dev, 16 * 1024,
+ vmm->nullp, vmm->null);
+ }
+
+ if (vmm->pd) {
+ nvkm_mmu_ptc_put(vmm->mmu, true, &vmm->pd->pt[0]);
+ nvkm_vmm_pt_del(&vmm->pd);
+ }
+}
+
+int
+nvkm_vmm_ctor(const struct nvkm_vmm_func *func, struct nvkm_mmu *mmu,
+ u32 pd_header, u64 addr, u64 size, struct lock_class_key *key,
+ const char *name, struct nvkm_vmm *vmm)
+{
+ static struct lock_class_key _key;
+ const struct nvkm_vmm_page *page = func->page;
+ const struct nvkm_vmm_desc *desc;
+ struct nvkm_vma *vma;
+ int levels, bits = 0;
+
+ vmm->func = func;
+ vmm->mmu = mmu;
+ vmm->name = name;
+ vmm->debug = mmu->subdev.debug;
+ kref_init(&vmm->kref);
+
+ __mutex_init(&vmm->mutex, "&vmm->mutex", key ? key : &_key);
+
+ /* Locate the smallest page size supported by the backend; it will
+ * have the deepest nesting of page tables.
+ */
+ while (page[1].shift)
+ page++;
+
+ /* Locate the structure that describes the layout of the top-level
+ * page table, and determine the number of valid bits in a virtual
+ * address.
+ */
+ for (levels = 0, desc = page->desc; desc->bits; desc++, levels++)
+ bits += desc->bits;
+ bits += page->shift;
+ desc--;
+
+ if (WARN_ON(levels > NVKM_VMM_LEVELS_MAX))
+ return -EINVAL;
+
+ vmm->start = addr;
+ vmm->limit = size ? (addr + size) : (1ULL << bits);
+ if (vmm->start > vmm->limit || vmm->limit > (1ULL << bits))
+ return -EINVAL;
+
+ /* Allocate top-level page table. */
+ vmm->pd = nvkm_vmm_pt_new(desc, false, NULL);
+ if (!vmm->pd)
+ return -ENOMEM;
+ vmm->pd->refs[0] = 1;
+ INIT_LIST_HEAD(&vmm->join);
+
+ /* ... and the GPU storage for it, except on Tesla-class GPUs that
+ * have the PD embedded in the instance structure.
+ */
+ if (desc->size) {
+ const u32 size = pd_header + desc->size * (1 << desc->bits);
+ vmm->pd->pt[0] = nvkm_mmu_ptc_get(mmu, size, desc->align, true);
+ if (!vmm->pd->pt[0])
+ return -ENOMEM;
+ }
+
+ /* Initialise address-space MM. */
+ INIT_LIST_HEAD(&vmm->list);
+ vmm->free = RB_ROOT;
+ vmm->root = RB_ROOT;
+
+ if (!(vma = nvkm_vma_new(vmm->start, vmm->limit - vmm->start)))
+ return -ENOMEM;
+
+ nvkm_vmm_free_insert(vmm, vma);
+ list_add(&vma->head, &vmm->list);
+ return 0;
+}
+
+int
+nvkm_vmm_new_(const struct nvkm_vmm_func *func, struct nvkm_mmu *mmu,
+ u32 hdr, u64 addr, u64 size, struct lock_class_key *key,
+ const char *name, struct nvkm_vmm **pvmm)
+{
+ if (!(*pvmm = kzalloc(sizeof(**pvmm), GFP_KERNEL)))
+ return -ENOMEM;
+ return nvkm_vmm_ctor(func, mmu, hdr, addr, size, key, name, *pvmm);
+}
+
+#define node(root, dir) ((root)->head.dir == &vmm->list) ?
NULL : \ + list_entry((root)->head.dir, struct nvkm_vma, head) + +void +nvkm_vmm_unmap_region(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + struct nvkm_vma *next; + + nvkm_memory_tags_put(vma->memory, vmm->mmu->subdev.device, &vma->tags); + nvkm_memory_unref(&vma->memory); + + if (vma->part) { + struct nvkm_vma *prev = node(vma, prev); + if (!prev->memory) { + prev->size += vma->size; + rb_erase(&vma->tree, &vmm->root); + list_del(&vma->head); + kfree(vma); + vma = prev; + } + } + + next = node(vma, next); + if (next && next->part) { + if (!next->memory) { + vma->size += next->size; + rb_erase(&next->tree, &vmm->root); + list_del(&next->head); + kfree(next); + } + } +} + +void +nvkm_vmm_unmap_locked(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + const struct nvkm_vmm_page *page = &vmm->func->page[vma->refd]; + + if (vma->mapref) { + nvkm_vmm_ptes_unmap_put(vmm, page, vma->addr, vma->size, vma->sparse); + vma->refd = NVKM_VMA_PAGE_NONE; + } else { + nvkm_vmm_ptes_unmap(vmm, page, vma->addr, vma->size, vma->sparse); + } + + nvkm_vmm_unmap_region(vmm, vma); +} + +void +nvkm_vmm_unmap(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + if (vma->memory) { + mutex_lock(&vmm->mutex); + nvkm_vmm_unmap_locked(vmm, vma); + mutex_unlock(&vmm->mutex); + } +} + +static int +nvkm_vmm_map_valid(struct nvkm_vmm *vmm, struct nvkm_vma *vma, + void *argv, u32 argc, struct nvkm_vmm_map *map) +{ + switch (nvkm_memory_target(map->memory)) { + case NVKM_MEM_TARGET_VRAM: + if (!(map->page->type & NVKM_VMM_PAGE_VRAM)) { + VMM_DEBUG(vmm, "%d !VRAM", map->page->shift); + return -EINVAL; + } + break; + case NVKM_MEM_TARGET_HOST: + case NVKM_MEM_TARGET_NCOH: + if (!(map->page->type & NVKM_VMM_PAGE_HOST)) { + VMM_DEBUG(vmm, "%d !HOST", map->page->shift); + return -EINVAL; + } + break; + default: + WARN_ON(1); + return -ENOSYS; + } + + if (!IS_ALIGNED( vma->addr, 1ULL << map->page->shift) || + !IS_ALIGNED((u64)vma->size, 1ULL << map->page->shift) || + !IS_ALIGNED( map->offset, 1ULL << map->page->shift) || + nvkm_memory_page(map->memory) < map->page->shift) { + VMM_DEBUG(vmm, "alignment %016llx %016llx %016llx %d %d", + vma->addr, (u64)vma->size, map->offset, map->page->shift, + nvkm_memory_page(map->memory)); + return -EINVAL; + } + + return vmm->func->valid(vmm, argv, argc, map); +} + +static int +nvkm_vmm_map_choose(struct nvkm_vmm *vmm, struct nvkm_vma *vma, + void *argv, u32 argc, struct nvkm_vmm_map *map) +{ + for (map->page = vmm->func->page; map->page->shift; map->page++) { + VMM_DEBUG(vmm, "trying %d", map->page->shift); + if (!nvkm_vmm_map_valid(vmm, vma, argv, argc, map)) + return 0; + } + return -EINVAL; +} + +static int +nvkm_vmm_map_locked(struct nvkm_vmm *vmm, struct nvkm_vma *vma, + void *argv, u32 argc, struct nvkm_vmm_map *map) +{ + nvkm_vmm_pte_func func; + int ret; + + /* Make sure we won't overrun the end of the memory object. */ + if (unlikely(nvkm_memory_size(map->memory) < map->offset + vma->size)) { + VMM_DEBUG(vmm, "overrun %016llx %016llx %016llx", + nvkm_memory_size(map->memory), + map->offset, (u64)vma->size); + return -EINVAL; + } + + /* Check remaining arguments for validity. */ + if (vma->page == NVKM_VMA_PAGE_NONE && + vma->refd == NVKM_VMA_PAGE_NONE) { + /* Find the largest page size we can perform the mapping at. 
*/ + const u32 debug = vmm->debug; + vmm->debug = 0; + ret = nvkm_vmm_map_choose(vmm, vma, argv, argc, map); + vmm->debug = debug; + if (ret) { + VMM_DEBUG(vmm, "invalid at any page size"); + nvkm_vmm_map_choose(vmm, vma, argv, argc, map); + return -EINVAL; + } + } else { + /* Page size of the VMA is already pre-determined. */ + if (vma->refd != NVKM_VMA_PAGE_NONE) + map->page = &vmm->func->page[vma->refd]; + else + map->page = &vmm->func->page[vma->page]; + + ret = nvkm_vmm_map_valid(vmm, vma, argv, argc, map); + if (ret) { + VMM_DEBUG(vmm, "invalid %d\n", ret); + return ret; + } + } + + /* Deal with the 'offset' argument, and fetch the backend function. */ + map->off = map->offset; + if (map->mem) { + for (; map->off; map->mem = map->mem->next) { + u64 size = (u64)map->mem->length << NVKM_RAM_MM_SHIFT; + if (size > map->off) + break; + map->off -= size; + } + func = map->page->desc->func->mem; + } else + if (map->sgl) { + for (; map->off; map->sgl = sg_next(map->sgl)) { + u64 size = sg_dma_len(map->sgl); + if (size > map->off) + break; + map->off -= size; + } + func = map->page->desc->func->sgl; + } else { + map->dma += map->offset >> PAGE_SHIFT; + map->off = map->offset & PAGE_MASK; + func = map->page->desc->func->dma; + } + + /* Perform the map. */ + if (vma->refd == NVKM_VMA_PAGE_NONE) { + ret = nvkm_vmm_ptes_get_map(vmm, map->page, vma->addr, vma->size, map, func); + if (ret) + return ret; + + vma->refd = map->page - vmm->func->page; + } else { + nvkm_vmm_ptes_map(vmm, map->page, vma->addr, vma->size, map, func); + } + + nvkm_memory_tags_put(vma->memory, vmm->mmu->subdev.device, &vma->tags); + nvkm_memory_unref(&vma->memory); + vma->memory = nvkm_memory_ref(map->memory); + vma->tags = map->tags; + return 0; +} + +int +nvkm_vmm_map(struct nvkm_vmm *vmm, struct nvkm_vma *vma, void *argv, u32 argc, + struct nvkm_vmm_map *map) +{ + int ret; + mutex_lock(&vmm->mutex); + ret = nvkm_vmm_map_locked(vmm, vma, argv, argc, map); + vma->busy = false; + mutex_unlock(&vmm->mutex); + return ret; +} + +static void +nvkm_vmm_put_region(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + struct nvkm_vma *prev, *next; + + if ((prev = node(vma, prev)) && !prev->used) { + rb_erase(&prev->tree, &vmm->free); + list_del(&prev->head); + vma->addr = prev->addr; + vma->size += prev->size; + kfree(prev); + } + + if ((next = node(vma, next)) && !next->used) { + rb_erase(&next->tree, &vmm->free); + list_del(&next->head); + vma->size += next->size; + kfree(next); + } + + nvkm_vmm_free_insert(vmm, vma); +} + +void +nvkm_vmm_put_locked(struct nvkm_vmm *vmm, struct nvkm_vma *vma) +{ + const struct nvkm_vmm_page *page = vmm->func->page; + struct nvkm_vma *next = vma; + + BUG_ON(vma->part); + + if (vma->mapref || !vma->sparse) { + do { + const bool map = next->memory != NULL; + const u8 refd = next->refd; + const u64 addr = next->addr; + u64 size = next->size; + + /* Merge regions that are in the same state. */ + while ((next = node(next, next)) && next->part && + (next->memory != NULL) == map && + (next->refd == refd)) + size += next->size; + + if (map) { + /* Region(s) are mapped, merge the unmap + * and dereference into a single walk of + * the page tree. + */ + nvkm_vmm_ptes_unmap_put(vmm, &page[refd], addr, + size, vma->sparse); + } else + if (refd != NVKM_VMA_PAGE_NONE) { + /* Drop allocation-time PTE references. 
*/ + nvkm_vmm_ptes_put(vmm, &page[refd], addr, size); + } + } while (next && next->part); + } + + /* Merge any mapped regions that were split from the initial + * address-space allocation back into the allocated VMA, and + * release memory/compression resources. + */ + next = vma; + do { + if (next->memory) + nvkm_vmm_unmap_region(vmm, next); + } while ((next = node(vma, next)) && next->part); + + if (vma->sparse && !vma->mapref) { + /* Sparse region that was allocated with a fixed page size, + * meaning all relevant PTEs were referenced once when the + * region was allocated, and remained that way, regardless + * of whether memory was mapped into it afterwards. + * + * The process of unmapping, unsparsing, and dereferencing + * PTEs can be done in a single page tree walk. + */ + nvkm_vmm_ptes_sparse_put(vmm, &page[vma->refd], vma->addr, vma->size); + } else + if (vma->sparse) { + /* Sparse region that wasn't allocated with a fixed page size, + * PTE references were taken both at allocation time (to make + * the GPU see the region as sparse), and when mapping memory + * into the region. + * + * The latter was handled above, and the remaining references + * are dealt with here. + */ + nvkm_vmm_ptes_sparse(vmm, vma->addr, vma->size, false); + } + + /* Remove VMA from the list of allocated nodes. */ + rb_erase(&vma->tree, &vmm->root); + + /* Merge VMA back into the free list. */ + vma->page = NVKM_VMA_PAGE_NONE; + vma->refd = NVKM_VMA_PAGE_NONE; + vma->used = false; + vma->user = false; + nvkm_vmm_put_region(vmm, vma); +} + +void +nvkm_vmm_put(struct nvkm_vmm *vmm, struct nvkm_vma **pvma) +{ + struct nvkm_vma *vma = *pvma; + if (vma) { + mutex_lock(&vmm->mutex); + nvkm_vmm_put_locked(vmm, vma); + mutex_unlock(&vmm->mutex); + *pvma = NULL; + } +} + +int +nvkm_vmm_get_locked(struct nvkm_vmm *vmm, bool getref, bool mapref, bool sparse, + u8 shift, u8 align, u64 size, struct nvkm_vma **pvma) +{ + const struct nvkm_vmm_page *page = &vmm->func->page[NVKM_VMA_PAGE_NONE]; + struct rb_node *node = NULL, *temp; + struct nvkm_vma *vma = NULL, *tmp; + u64 addr, tail; + int ret; + + VMM_TRACE(vmm, "getref %d mapref %d sparse %d " + "shift: %d align: %d size: %016llx", + getref, mapref, sparse, shift, align, size); + + /* Zero-sized, or lazily-allocated sparse VMAs, make no sense. */ + if (unlikely(!size || (!getref && !mapref && sparse))) { + VMM_DEBUG(vmm, "args %016llx %d %d %d", + size, getref, mapref, sparse); + return -EINVAL; + } + + /* Tesla-class GPUs can only select page size per-PDE, which means + * we're required to know the mapping granularity up-front to find + * a suitable region of address-space. + * + * The same goes if we're requesting up-front allocation of PTES. + */ + if (unlikely((getref || vmm->func->page_block) && !shift)) { + VMM_DEBUG(vmm, "page size required: %d %016llx", + getref, vmm->func->page_block); + return -EINVAL; + } + + /* If a specific page size was requested, determine its index and + * make sure the requested size is a multiple of the page size. + */ + if (shift) { + for (page = vmm->func->page; page->shift; page++) { + if (shift == page->shift) + break; + } + + if (!page->shift || !IS_ALIGNED(size, 1ULL << page->shift)) { + VMM_DEBUG(vmm, "page %d %016llx", shift, size); + return -EINVAL; + } + align = max_t(u8, align, shift); + } else { + align = max_t(u8, align, 12); + } + + /* Locate smallest block that can possibly satisfy the allocation. 
*/
+ temp = vmm->free.rb_node;
+ while (temp) {
+ struct nvkm_vma *this = rb_entry(temp, typeof(*this), tree);
+ if (this->size < size) {
+ temp = temp->rb_right;
+ } else {
+ node = temp;
+ temp = temp->rb_left;
+ }
+ }
+
+ if (unlikely(!node))
+ return -ENOSPC;
+
+ /* Take into account alignment restrictions, trying larger blocks
+ * in turn until we find a suitable free block.
+ */
+ do {
+ struct nvkm_vma *this = rb_entry(node, typeof(*this), tree);
+ struct nvkm_vma *prev = node(this, prev);
+ struct nvkm_vma *next = node(this, next);
+ const int p = page - vmm->func->page;
+
+ addr = this->addr;
+ if (vmm->func->page_block && prev && prev->page != p)
+ addr = ALIGN(addr, vmm->func->page_block);
+ addr = ALIGN(addr, 1ULL << align);
+
+ tail = this->addr + this->size;
+ if (vmm->func->page_block && next && next->page != p)
+ tail = ALIGN_DOWN(tail, vmm->func->page_block);
+
+ if (addr <= tail && tail - addr >= size) {
+ rb_erase(&this->tree, &vmm->free);
+ vma = this;
+ break;
+ }
+ } while ((node = rb_next(node)));
+
+ if (unlikely(!vma))
+ return -ENOSPC;
+
+ /* If the VMA we found isn't already exactly the requested size,
+ * it needs to be split, and the remaining free blocks returned.
+ */
+ if (addr != vma->addr) {
+ if (!(tmp = nvkm_vma_tail(vma, vma->size + vma->addr - addr))) {
+ nvkm_vmm_put_region(vmm, vma);
+ return -ENOMEM;
+ }
+ nvkm_vmm_free_insert(vmm, vma);
+ vma = tmp;
+ }
+
+ if (size != vma->size) {
+ if (!(tmp = nvkm_vma_tail(vma, vma->size - size))) {
+ nvkm_vmm_put_region(vmm, vma);
+ return -ENOMEM;
+ }
+ nvkm_vmm_free_insert(vmm, tmp);
+ }
+
+ /* Pre-allocate page tables and/or setup sparse mappings. */
+ if (sparse && getref)
+ ret = nvkm_vmm_ptes_sparse_get(vmm, page, vma->addr, vma->size);
+ else if (sparse)
+ ret = nvkm_vmm_ptes_sparse(vmm, vma->addr, vma->size, true);
+ else if (getref)
+ ret = nvkm_vmm_ptes_get(vmm, page, vma->addr, vma->size);
+ else
+ ret = 0;
+ if (ret) {
+ nvkm_vmm_put_region(vmm, vma);
+ return ret;
+ }
+
+ vma->mapref = mapref && !getref;
+ vma->sparse = sparse;
+ vma->page = page - vmm->func->page;
+ vma->refd = getref ?
vma->page : NVKM_VMA_PAGE_NONE;
+ vma->used = true;
+ nvkm_vmm_node_insert(vmm, vma);
+ *pvma = vma;
+ return 0;
+}
+
+int
+nvkm_vmm_get(struct nvkm_vmm *vmm, u8 page, u64 size, struct nvkm_vma **pvma)
+{
+ int ret;
+ mutex_lock(&vmm->mutex);
+ ret = nvkm_vmm_get_locked(vmm, false, true, false, page, 0, size, pvma);
+ mutex_unlock(&vmm->mutex);
+ return ret;
+}
+
+void
+nvkm_vmm_part(struct nvkm_vmm *vmm, struct nvkm_memory *inst)
+{
+ if (vmm->func->part && inst) {
+ mutex_lock(&vmm->mutex);
+ vmm->func->part(vmm, inst);
+ mutex_unlock(&vmm->mutex);
+ }
+}
+
+int
+nvkm_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst)
+{
+ int ret = 0;
+ if (vmm->func->join) {
+ mutex_lock(&vmm->mutex);
+ ret = vmm->func->join(vmm, inst);
+ mutex_unlock(&vmm->mutex);
+ }
+ return ret;
+}
+
+static bool
+nvkm_vmm_boot_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes)
+{
+ const struct nvkm_vmm_desc *desc = it->desc;
+ const int type = desc->type == SPT;
+ nvkm_memory_boot(it->pt[0]->pt[type]->memory, it->vmm);
+ return false;
+}
+
+int
+nvkm_vmm_boot(struct nvkm_vmm *vmm)
+{
+ const struct nvkm_vmm_page *page = vmm->func->page;
+ const u64 limit = vmm->limit - vmm->start;
+ int ret;
+
+ while (page[1].shift)
+ page++;
+
+ ret = nvkm_vmm_ptes_get(vmm, page, vmm->start, limit);
+ if (ret)
+ return ret;
+
+ nvkm_vmm_iter(vmm, page, vmm->start, limit, "bootstrap", false,
+ nvkm_vmm_boot_ptes, NULL, NULL, NULL);
+ vmm->bootstrapped = true;
+ return 0;
+}
+
+static void
+nvkm_vmm_del(struct kref *kref)
+{
+ struct nvkm_vmm *vmm = container_of(kref, typeof(*vmm), kref);
+ nvkm_vmm_dtor(vmm);
+ kfree(vmm);
+}
+
+void
+nvkm_vmm_unref(struct nvkm_vmm **pvmm)
+{
+ struct nvkm_vmm *vmm = *pvmm;
+ if (vmm) {
+ kref_put(&vmm->kref, nvkm_vmm_del);
+ *pvmm = NULL;
+ }
+}
+
+struct nvkm_vmm *
+nvkm_vmm_ref(struct nvkm_vmm *vmm)
+{
+ if (vmm)
+ kref_get(&vmm->kref);
+ return vmm;
+}
+
+int
+nvkm_vmm_new(struct nvkm_device *device, u64 addr, u64 size, void *argv,
+ u32 argc, struct lock_class_key *key, const char *name,
+ struct nvkm_vmm **pvmm)
+{
+ struct nvkm_mmu *mmu = device->mmu;
+ struct nvkm_vmm *vmm = NULL;
+ int ret;
+ ret = mmu->func->vmm.ctor(mmu, addr, size, argv, argc, key, name, &vmm);
+ if (ret)
+ nvkm_vmm_unref(&vmm);
+ *pvmm = vmm;
+ return ret;
+}
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
new file mode 100644
index 000000000000..6d8f61ea467a
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
@@ -0,0 +1,310 @@
+#ifndef __NVKM_VMM_H__
+#define __NVKM_VMM_H__
+#include "priv.h"
+#include <core/memory.h>
+enum nvkm_memory_target;
+
+struct nvkm_vmm_pt {
+ /* Some GPUs have a mapping level with dual page tables to
+ * support large and small pages in the same address-range.
+ *
+ * We track the state of both page tables in one place, which
+ * is why there are multiple PT pointers/refcounts here.
+ */
+ struct nvkm_mmu_pt *pt[2];
+ u32 refs[2];
+
+ /* Page size handled by this PT.
+ *
+ * Tesla backend needs to know this when writing PDEs,
+ * otherwise unnecessary.
+ */
+ u8 page;
+
+ /* Entire page table sparse.
+ *
+ * Used to propagate sparseness to child page tables.
+ */
+ bool sparse:1;
+
+ /* Tracking for page directories.
+ *
+ * The array is indexed by PDE, and will either point to the
+ * child page table, or indicate the PDE is marked as sparse.
+ **/ +#define NVKM_VMM_PDE_INVALID(pde) IS_ERR_OR_NULL(pde) +#define NVKM_VMM_PDE_SPARSED(pde) IS_ERR(pde) +#define NVKM_VMM_PDE_SPARSE ERR_PTR(-EBUSY) + struct nvkm_vmm_pt **pde; + + /* Tracking for dual page tables. + * + * There's one entry for each LPTE, keeping track of whether + * there are valid SPTEs in the same address-range. + * + * This information is used to manage LPTE state transitions. + */ +#define NVKM_VMM_PTE_SPARSE 0x80 +#define NVKM_VMM_PTE_VALID 0x40 +#define NVKM_VMM_PTE_SPTES 0x3f + u8 pte[]; +}; + +typedef void (*nvkm_vmm_pxe_func)(struct nvkm_vmm *, + struct nvkm_mmu_pt *, u32 ptei, u32 ptes); +typedef void (*nvkm_vmm_pde_func)(struct nvkm_vmm *, + struct nvkm_vmm_pt *, u32 pdei); +typedef void (*nvkm_vmm_pte_func)(struct nvkm_vmm *, struct nvkm_mmu_pt *, + u32 ptei, u32 ptes, struct nvkm_vmm_map *); + +struct nvkm_vmm_desc_func { + nvkm_vmm_pxe_func invalid; + nvkm_vmm_pxe_func unmap; + nvkm_vmm_pxe_func sparse; + + nvkm_vmm_pde_func pde; + + nvkm_vmm_pte_func mem; + nvkm_vmm_pte_func dma; + nvkm_vmm_pte_func sgl; +}; + +extern const struct nvkm_vmm_desc_func gf100_vmm_pgd; +void gf100_vmm_pgd_pde(struct nvkm_vmm *, struct nvkm_vmm_pt *, u32); +extern const struct nvkm_vmm_desc_func gf100_vmm_pgt; +void gf100_vmm_pgt_unmap(struct nvkm_vmm *, struct nvkm_mmu_pt *, u32, u32); +void gf100_vmm_pgt_mem(struct nvkm_vmm *, struct nvkm_mmu_pt *, u32, u32, + struct nvkm_vmm_map *); +void gf100_vmm_pgt_dma(struct nvkm_vmm *, struct nvkm_mmu_pt *, u32, u32, + struct nvkm_vmm_map *); +void gf100_vmm_pgt_sgl(struct nvkm_vmm *, struct nvkm_mmu_pt *, u32, u32, + struct nvkm_vmm_map *); + +void gk104_vmm_lpt_invalid(struct nvkm_vmm *, struct nvkm_mmu_pt *, u32, u32); + +struct nvkm_vmm_desc { + enum { + PGD, + PGT, + SPT, + LPT, + } type; + u8 bits; /* VMA bits covered by PT. */ + u8 size; /* Bytes-per-PTE. */ + u32 align; /* PT address alignment. 
*/ + const struct nvkm_vmm_desc_func *func; +}; + +extern const struct nvkm_vmm_desc gk104_vmm_desc_16_12[]; +extern const struct nvkm_vmm_desc gk104_vmm_desc_16_16[]; +extern const struct nvkm_vmm_desc gk104_vmm_desc_17_12[]; +extern const struct nvkm_vmm_desc gk104_vmm_desc_17_17[]; + +extern const struct nvkm_vmm_desc gm200_vmm_desc_16_12[]; +extern const struct nvkm_vmm_desc gm200_vmm_desc_16_16[]; +extern const struct nvkm_vmm_desc gm200_vmm_desc_17_12[]; +extern const struct nvkm_vmm_desc gm200_vmm_desc_17_17[]; + +extern const struct nvkm_vmm_desc gp100_vmm_desc_12[]; +extern const struct nvkm_vmm_desc gp100_vmm_desc_16[]; + +struct nvkm_vmm_page { + u8 shift; + const struct nvkm_vmm_desc *desc; +#define NVKM_VMM_PAGE_SPARSE 0x01 +#define NVKM_VMM_PAGE_VRAM 0x02 +#define NVKM_VMM_PAGE_HOST 0x04 +#define NVKM_VMM_PAGE_COMP 0x08 +#define NVKM_VMM_PAGE_Sxxx (NVKM_VMM_PAGE_SPARSE) +#define NVKM_VMM_PAGE_xVxx (NVKM_VMM_PAGE_VRAM) +#define NVKM_VMM_PAGE_SVxx (NVKM_VMM_PAGE_Sxxx | NVKM_VMM_PAGE_VRAM) +#define NVKM_VMM_PAGE_xxHx (NVKM_VMM_PAGE_HOST) +#define NVKM_VMM_PAGE_SxHx (NVKM_VMM_PAGE_Sxxx | NVKM_VMM_PAGE_HOST) +#define NVKM_VMM_PAGE_xVHx (NVKM_VMM_PAGE_xVxx | NVKM_VMM_PAGE_HOST) +#define NVKM_VMM_PAGE_SVHx (NVKM_VMM_PAGE_SVxx | NVKM_VMM_PAGE_HOST) +#define NVKM_VMM_PAGE_xVxC (NVKM_VMM_PAGE_xVxx | NVKM_VMM_PAGE_COMP) +#define NVKM_VMM_PAGE_SVxC (NVKM_VMM_PAGE_SVxx | NVKM_VMM_PAGE_COMP) +#define NVKM_VMM_PAGE_xxHC (NVKM_VMM_PAGE_xxHx | NVKM_VMM_PAGE_COMP) +#define NVKM_VMM_PAGE_SxHC (NVKM_VMM_PAGE_SxHx | NVKM_VMM_PAGE_COMP) + u8 type; +}; + +struct nvkm_vmm_func { + int (*join)(struct nvkm_vmm *, struct nvkm_memory *inst); + void (*part)(struct nvkm_vmm *, struct nvkm_memory *inst); + + int (*aper)(enum nvkm_memory_target); + int (*valid)(struct nvkm_vmm *, void *argv, u32 argc, + struct nvkm_vmm_map *); + void (*flush)(struct nvkm_vmm *, int depth); + + u64 page_block; + const struct nvkm_vmm_page page[]; +}; + +struct nvkm_vmm_join { + struct nvkm_memory *inst; + struct list_head head; +}; + +int nvkm_vmm_new_(const struct nvkm_vmm_func *, struct nvkm_mmu *, + u32 pd_header, u64 addr, u64 size, struct lock_class_key *, + const char *name, struct nvkm_vmm **); +int nvkm_vmm_ctor(const struct nvkm_vmm_func *, struct nvkm_mmu *, + u32 pd_header, u64 addr, u64 size, struct lock_class_key *, + const char *name, struct nvkm_vmm *); +struct nvkm_vma *nvkm_vmm_node_search(struct nvkm_vmm *, u64 addr); +int nvkm_vmm_get_locked(struct nvkm_vmm *, bool getref, bool mapref, + bool sparse, u8 page, u8 align, u64 size, + struct nvkm_vma **pvma); +void nvkm_vmm_put_locked(struct nvkm_vmm *, struct nvkm_vma *); +void nvkm_vmm_unmap_locked(struct nvkm_vmm *, struct nvkm_vma *); +void nvkm_vmm_unmap_region(struct nvkm_vmm *vmm, struct nvkm_vma *vma); + +struct nvkm_vma *nvkm_vma_tail(struct nvkm_vma *, u64 tail); +void nvkm_vmm_node_insert(struct nvkm_vmm *, struct nvkm_vma *); + +int nv04_vmm_new_(const struct nvkm_vmm_func *, struct nvkm_mmu *, u32, + u64, u64, void *, u32, struct lock_class_key *, + const char *, struct nvkm_vmm **); +int nv04_vmm_valid(struct nvkm_vmm *, void *, u32, struct nvkm_vmm_map *); + +int gf100_vmm_new_(const struct nvkm_vmm_func *, const struct nvkm_vmm_func *, + struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gf100_vmm_join_(struct nvkm_vmm *, struct nvkm_memory *, u64 base); +int gf100_vmm_join(struct nvkm_vmm *, struct nvkm_memory *); +void gf100_vmm_part(struct nvkm_vmm *, struct nvkm_memory *); +int 
gf100_vmm_aper(enum nvkm_memory_target); +int gf100_vmm_valid(struct nvkm_vmm *, void *, u32, struct nvkm_vmm_map *); +void gf100_vmm_flush_(struct nvkm_vmm *, int); +void gf100_vmm_flush(struct nvkm_vmm *, int); + +int gk20a_vmm_aper(enum nvkm_memory_target); + +int gm200_vmm_new_(const struct nvkm_vmm_func *, const struct nvkm_vmm_func *, + struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gm200_vmm_join_(struct nvkm_vmm *, struct nvkm_memory *, u64 base); +int gm200_vmm_join(struct nvkm_vmm *, struct nvkm_memory *); + +int gp100_vmm_join(struct nvkm_vmm *, struct nvkm_memory *); +int gp100_vmm_valid(struct nvkm_vmm *, void *, u32, struct nvkm_vmm_map *); +void gp100_vmm_flush(struct nvkm_vmm *, int); + +int nv04_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int nv41_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int nv44_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int nv50_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int g84_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gf100_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gk104_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gk20a_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, struct nvkm_vmm **); +int gm200_vmm_new_fixed(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); +int gm200_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); +int gm20b_vmm_new_fixed(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); +int gm20b_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); +int gp100_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); +int gp10b_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32, + struct lock_class_key *, const char *, + struct nvkm_vmm **); + +#define VMM_PRINT(l,v,p,f,a...) do { \ + struct nvkm_vmm *_vmm = (v); \ + if (CONFIG_NOUVEAU_DEBUG >= (l) && _vmm->debug >= (l)) { \ + nvkm_printk_(&_vmm->mmu->subdev, 0, p, "%s: "f"\n", \ + _vmm->name, ##a); \ + } \ +} while(0) +#define VMM_DEBUG(v,f,a...) VMM_PRINT(NV_DBG_DEBUG, (v), info, f, ##a) +#define VMM_TRACE(v,f,a...) VMM_PRINT(NV_DBG_TRACE, (v), info, f, ##a) +#define VMM_SPAM(v,f,a...) 
VMM_PRINT(NV_DBG_SPAM , (v), dbg, f, ##a) + +#define VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL,BASE,SIZE,NEXT) do { \ + nvkm_kmap((PT)->memory); \ + while (PTEN) { \ + u64 _ptes = ((SIZE) - MAP->off) >> MAP->page->shift; \ + u64 _addr = ((BASE) + MAP->off); \ + \ + if (_ptes > PTEN) { \ + MAP->off += PTEN << MAP->page->shift; \ + _ptes = PTEN; \ + } else { \ + MAP->off = 0; \ + NEXT; \ + } \ + \ + VMM_SPAM(VMM, "ITER %08x %08x PTE(s)", PTEI, (u32)_ptes); \ + \ + FILL(VMM, PT, PTEI, _ptes, MAP, _addr); \ + PTEI += _ptes; \ + PTEN -= _ptes; \ + }; \ + nvkm_done((PT)->memory); \ +} while(0) + +#define VMM_MAP_ITER_MEM(VMM,PT,PTEI,PTEN,MAP,FILL) \ + VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL, \ + ((u64)MAP->mem->offset << NVKM_RAM_MM_SHIFT), \ + ((u64)MAP->mem->length << NVKM_RAM_MM_SHIFT), \ + (MAP->mem = MAP->mem->next)) +#define VMM_MAP_ITER_DMA(VMM,PT,PTEI,PTEN,MAP,FILL) \ + VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL, \ + *MAP->dma, PAGE_SIZE, MAP->dma++) +#define VMM_MAP_ITER_SGL(VMM,PT,PTEI,PTEN,MAP,FILL) \ + VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL, \ + sg_dma_address(MAP->sgl), sg_dma_len(MAP->sgl), \ + (MAP->sgl = sg_next(MAP->sgl))) + +#define VMM_FO(m,o,d,c,b) nvkm_fo##b((m)->memory, (o), (d), (c)) +#define VMM_WO(m,o,d,c,b) nvkm_wo##b((m)->memory, (o), (d)) +#define VMM_XO(m,v,o,d,c,b,fn,f,a...) do { \ + const u32 _pteo = (o); u##b _data = (d); \ + VMM_SPAM((v), " %010llx "f, (m)->addr + _pteo, _data, ##a); \ + VMM_##fn((m), (m)->base + _pteo, _data, (c), b); \ +} while(0) + +#define VMM_WO032(m,v,o,d) VMM_XO((m),(v),(o),(d), 1, 32, WO, "%08x") +#define VMM_FO032(m,v,o,d,c) \ + VMM_XO((m),(v),(o),(d),(c), 32, FO, "%08x %08x", (c)) + +#define VMM_WO064(m,v,o,d) VMM_XO((m),(v),(o),(d), 1, 64, WO, "%016llx") +#define VMM_FO064(m,v,o,d,c) \ + VMM_XO((m),(v),(o),(d),(c), 64, FO, "%016llx %08x", (c)) + +#define VMM_XO128(m,v,o,lo,hi,c,f,a...) do { \ + u32 _pteo = (o), _ptes = (c); \ + const u64 _addr = (m)->addr + _pteo; \ + VMM_SPAM((v), " %010llx %016llx%016llx"f, _addr, (hi), (lo), ##a); \ + while (_ptes--) { \ + nvkm_wo64((m)->memory, (m)->base + _pteo + 0, (lo)); \ + nvkm_wo64((m)->memory, (m)->base + _pteo + 8, (hi)); \ + _pteo += 0x10; \ + } \ +} while(0) + +#define VMM_WO128(m,v,o,lo,hi) VMM_XO128((m),(v),(o),(lo),(hi), 1, "") +#define VMM_FO128(m,v,o,lo,hi,c) do { \ + nvkm_kmap((m)->memory); \ + VMM_XO128((m),(v),(o),(lo),(hi),(c), " %08x", (c)); \ + nvkm_done((m)->memory); \ +} while(0) +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgf100.c new file mode 100644 index 000000000000..faf5a7e9265e --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgf100.c @@ -0,0 +1,403 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <subdev/fb.h> +#include <subdev/ltc.h> +#include <subdev/timer.h> + +#include <nvif/if900d.h> +#include <nvif/unpack.h> + +static inline void +gf100_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u64 base = (addr >> 8) | map->type; + u64 data = base; + + if (map->ctag && !(map->next & (1ULL << 44))) { + while (ptes--) { + data = base | ((map->ctag >> 1) << 44); + if (!(map->ctag++ & 1)) + data |= BIT_ULL(60); + + VMM_WO064(pt, vmm, ptei++ * 8, data); + base += map->next; + } + } else { + map->type += ptes * map->ctag; + + while (ptes--) { + VMM_WO064(pt, vmm, ptei++ * 8, data); + data += map->next; + } + } +} + +void +gf100_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, gf100_vmm_pgt_pte); +} + +void +gf100_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + if (map->page->shift == PAGE_SHIFT) { + VMM_SPAM(vmm, "DMAA %08x %08x PTE(s)", ptei, ptes); + nvkm_kmap(pt->memory); + while (ptes--) { + const u64 data = (*map->dma++ >> 8) | map->type; + VMM_WO064(pt, vmm, ptei++ * 8, data); + map->type += map->ctag; + } + nvkm_done(pt->memory); + return; + } + + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, gf100_vmm_pgt_pte); +} + +void +gf100_vmm_pgt_mem(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_MEM(vmm, pt, ptei, ptes, map, gf100_vmm_pgt_pte); +} + +void +gf100_vmm_pgt_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + VMM_FO064(pt, vmm, ptei * 8, 0ULL, ptes); +} + +const struct nvkm_vmm_desc_func +gf100_vmm_pgt = { + .unmap = gf100_vmm_pgt_unmap, + .mem = gf100_vmm_pgt_mem, + .dma = gf100_vmm_pgt_dma, + .sgl = gf100_vmm_pgt_sgl, +}; + +void +gf100_vmm_pgd_pde(struct nvkm_vmm *vmm, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + struct nvkm_vmm_pt *pgt = pgd->pde[pdei]; + struct nvkm_mmu_pt *pd = pgd->pt[0]; + struct nvkm_mmu_pt *pt; + u64 data = 0; + + if ((pt = pgt->pt[0])) { + switch (nvkm_memory_target(pt->memory)) { + case NVKM_MEM_TARGET_VRAM: data |= 1ULL << 0; break; + case NVKM_MEM_TARGET_HOST: data |= 2ULL << 0; + data |= BIT_ULL(35); /* VOL */ + break; + case NVKM_MEM_TARGET_NCOH: data |= 3ULL << 0; break; + default: + WARN_ON(1); + return; + } + data |= pt->addr >> 8; + } + + if ((pt = pgt->pt[1])) { + switch (nvkm_memory_target(pt->memory)) { + case NVKM_MEM_TARGET_VRAM: data |= 1ULL << 32; break; + case NVKM_MEM_TARGET_HOST: data |= 2ULL << 32; + data |= BIT_ULL(34); /* VOL */ + break; + case NVKM_MEM_TARGET_NCOH: data |= 3ULL << 32; break; + default: + WARN_ON(1); + return; + } + data |= pt->addr << 24; + } + + nvkm_kmap(pd->memory); + VMM_WO064(pd, vmm, pdei * 8, data); + nvkm_done(pd->memory); +} + +const struct nvkm_vmm_desc_func +gf100_vmm_pgd = { + .unmap = gf100_vmm_pgt_unmap, + .pde = gf100_vmm_pgd_pde, +}; + +static const struct nvkm_vmm_desc +gf100_vmm_desc_17_12[] = { + { SPT, 15, 8, 0x1000, &gf100_vmm_pgt }, + { PGD, 13, 8, 0x1000, &gf100_vmm_pgd }, + {} +}; + +static const struct nvkm_vmm_desc +gf100_vmm_desc_17_17[] = { + { LPT, 10, 8, 
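+	  /* nvkm_vmm_desc fields, in order: level type, index bits,
+	   * bytes per PTE, table alignment, per-level functions; so this
+	   * large-page table level has 1 << 10 entries of 8 bytes each,
+	   * aligned to 0x1000.
+	   */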
0x1000, &gf100_vmm_pgt },
+	{ PGD, 13, 8, 0x1000, &gf100_vmm_pgd },
+	{}
+};
+
+static const struct nvkm_vmm_desc
+gf100_vmm_desc_16_12[] = {
+	{ SPT, 14, 8, 0x1000, &gf100_vmm_pgt },
+	{ PGD, 14, 8, 0x1000, &gf100_vmm_pgd },
+	{}
+};
+
+static const struct nvkm_vmm_desc
+gf100_vmm_desc_16_16[] = {
+	{ LPT, 10, 8, 0x1000, &gf100_vmm_pgt },
+	{ PGD, 14, 8, 0x1000, &gf100_vmm_pgd },
+	{}
+};
+
+void
+gf100_vmm_flush_(struct nvkm_vmm *vmm, int depth)
+{
+	struct nvkm_subdev *subdev = &vmm->mmu->subdev;
+	struct nvkm_device *device = subdev->device;
+	u32 type = depth << 24;
+
+	type = 0x00000001; /* PAGE_ALL */
+	if (atomic_read(&vmm->engref[NVKM_SUBDEV_BAR]))
+		type |= 0x00000004; /* HUB_ONLY */
+
+	mutex_lock(&subdev->mutex);
+	/* Looks like maybe a "free flush slots" counter, the
+	 * faster you write to 0x100cbc the more it decreases.
+	 */
+	nvkm_msec(device, 2000,
+		if (nvkm_rd32(device, 0x100c80) & 0x00ff0000)
+			break;
+	);
+
+	nvkm_wr32(device, 0x100cb8, vmm->pd->pt[0]->addr >> 8);
+	nvkm_wr32(device, 0x100cbc, 0x80000000 | type);
+
+	/* Wait for flush to be queued? */
+	nvkm_msec(device, 2000,
+		if (nvkm_rd32(device, 0x100c80) & 0x00008000)
+			break;
+	);
+	mutex_unlock(&subdev->mutex);
+}
+
+void
+gf100_vmm_flush(struct nvkm_vmm *vmm, int depth)
+{
+	gf100_vmm_flush_(vmm, 0);
+}
+
+int
+gf100_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc,
+		struct nvkm_vmm_map *map)
+{
+	const enum nvkm_memory_target target = nvkm_memory_target(map->memory);
+	const struct nvkm_vmm_page *page = map->page;
+	const bool gm20x = page->desc->func->sparse != NULL;
+	union {
+		struct gf100_vmm_map_vn vn;
+		struct gf100_vmm_map_v0 v0;
+	} *args = argv;
+	struct nvkm_device *device = vmm->mmu->subdev.device;
+	struct nvkm_memory *memory = map->memory;
+	u8 kind, priv, ro, vol;
+	int kindn, aper, ret = -ENOSYS;
+	const u8 *kindm;
+
+	map->next = (1 << page->shift) >> 8;
+	map->type = map->ctag = 0;
+
+	if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) {
+		vol  = !!args->v0.vol;
+		ro   = !!args->v0.ro;
+		priv = !!args->v0.priv;
+		kind =   args->v0.kind;
+	} else
+	if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) {
+		vol  = target == NVKM_MEM_TARGET_HOST;
+		ro   = 0;
+		priv = 0;
+		kind = 0x00;
+	} else {
+		VMM_DEBUG(vmm, "args");
+		return ret;
+	}
+
+	aper = vmm->func->aper(target);
+	if (WARN_ON(aper < 0))
+		return aper;
+
+	kindm = vmm->mmu->func->kind(vmm->mmu, &kindn);
+	if (kind >= kindn || kindm[kind] == 0xff) {
+		VMM_DEBUG(vmm, "kind %02x", kind);
+		return -EINVAL;
+	}
+
+	if (kindm[kind] != kind) {
+		u32 comp = (page->shift == 16 && !gm20x) ?
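+			/* i.e. one comptag per 64KiB on pre-GM20x
+			 * 64KiB-big-page layouts, otherwise one per 128KiB;
+			 * see the tag count computed just below.
+			 */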
16 : 17; + u32 tags = ALIGN(nvkm_memory_size(memory), 1 << 17) >> comp; + if (aper != 0 || !(page->type & NVKM_VMM_PAGE_COMP)) { + VMM_DEBUG(vmm, "comp %d %02x", aper, page->type); + return -EINVAL; + } + + ret = nvkm_memory_tags_get(memory, device, tags, + nvkm_ltc_tags_clear, + &map->tags); + if (ret) { + VMM_DEBUG(vmm, "comp %d", ret); + return ret; + } + + if (map->tags->mn) { + u64 tags = map->tags->mn->offset + (map->offset >> 17); + if (page->shift == 17 || !gm20x) { + map->type |= tags << 44; + map->ctag |= 1ULL << 44; + map->next |= 1ULL << 44; + } else { + map->ctag |= tags << 1 | 1; + } + } else { + kind = kindm[kind]; + } + } + + map->type |= BIT(0); + map->type |= (u64)priv << 1; + map->type |= (u64) ro << 2; + map->type |= (u64) vol << 32; + map->type |= (u64)aper << 33; + map->type |= (u64)kind << 36; + return 0; +} + +int +gf100_vmm_aper(enum nvkm_memory_target target) +{ + switch (target) { + case NVKM_MEM_TARGET_VRAM: return 0; + case NVKM_MEM_TARGET_HOST: return 2; + case NVKM_MEM_TARGET_NCOH: return 3; + default: + return -EINVAL; + } +} + +void +gf100_vmm_part(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + nvkm_fo64(inst, 0x0200, 0x00000000, 2); +} + +int +gf100_vmm_join_(struct nvkm_vmm *vmm, struct nvkm_memory *inst, u64 base) +{ + struct nvkm_mmu_pt *pd = vmm->pd->pt[0]; + + switch (nvkm_memory_target(pd->memory)) { + case NVKM_MEM_TARGET_VRAM: base |= 0ULL << 0; break; + case NVKM_MEM_TARGET_HOST: base |= 2ULL << 0; + base |= BIT_ULL(2) /* VOL. */; + break; + case NVKM_MEM_TARGET_NCOH: base |= 3ULL << 0; break; + default: + WARN_ON(1); + return -EINVAL; + } + base |= pd->addr; + + nvkm_kmap(inst); + nvkm_wo64(inst, 0x0200, base); + nvkm_wo64(inst, 0x0208, vmm->limit - 1); + nvkm_done(inst); + return 0; +} + +int +gf100_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + return gf100_vmm_join_(vmm, inst, 0); +} + +static const struct nvkm_vmm_func +gf100_vmm_17 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 17, &gf100_vmm_desc_17_17[0], NVKM_VMM_PAGE_xVxC }, + { 12, &gf100_vmm_desc_17_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +static const struct nvkm_vmm_func +gf100_vmm_16 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 16, &gf100_vmm_desc_16_16[0], NVKM_VMM_PAGE_xVxC }, + { 12, &gf100_vmm_desc_16_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +int +gf100_vmm_new_(const struct nvkm_vmm_func *func_16, + const struct nvkm_vmm_func *func_17, + struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + switch (mmu->subdev.device->fb->page) { + case 16: return nv04_vmm_new_(func_16, mmu, 0, addr, size, + argv, argc, key, name, pvmm); + case 17: return nv04_vmm_new_(func_17, mmu, 0, addr, size, + argv, argc, key, name, pvmm); + default: + WARN_ON(1); + return -EINVAL; + } +} + +int +gf100_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return gf100_vmm_new_(&gf100_vmm_16, &gf100_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk104.c new file mode 100644 index 000000000000..0ebb7bccfcd2 --- /dev/null +++ 
b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk104.c @@ -0,0 +1,102 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +void +gk104_vmm_lpt_invalid(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + /* VALID_FALSE + PRIV tells the MMU to ignore corresponding SPTEs. */ + VMM_FO064(pt, vmm, ptei * 8, BIT_ULL(1) /* PRIV. */, ptes); +} + +static const struct nvkm_vmm_desc_func +gk104_vmm_lpt = { + .invalid = gk104_vmm_lpt_invalid, + .unmap = gf100_vmm_pgt_unmap, + .mem = gf100_vmm_pgt_mem, +}; + +const struct nvkm_vmm_desc +gk104_vmm_desc_17_12[] = { + { SPT, 15, 8, 0x1000, &gf100_vmm_pgt }, + { PGD, 13, 8, 0x1000, &gf100_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gk104_vmm_desc_17_17[] = { + { LPT, 10, 8, 0x1000, &gk104_vmm_lpt }, + { PGD, 13, 8, 0x1000, &gf100_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gk104_vmm_desc_16_12[] = { + { SPT, 14, 8, 0x1000, &gf100_vmm_pgt }, + { PGD, 14, 8, 0x1000, &gf100_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gk104_vmm_desc_16_16[] = { + { LPT, 10, 8, 0x1000, &gk104_vmm_lpt }, + { PGD, 14, 8, 0x1000, &gf100_vmm_pgd }, + {} +}; + +static const struct nvkm_vmm_func +gk104_vmm_17 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 17, &gk104_vmm_desc_17_17[0], NVKM_VMM_PAGE_xVxC }, + { 12, &gk104_vmm_desc_17_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +static const struct nvkm_vmm_func +gk104_vmm_16 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 16, &gk104_vmm_desc_16_16[0], NVKM_VMM_PAGE_xVxC }, + { 12, &gk104_vmm_desc_16_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +int +gk104_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return gf100_vmm_new_(&gk104_vmm_16, &gk104_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk20a.c new file mode 100644 index 000000000000..8086994a0446 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgk20a.c @@ -0,0 +1,71 @@ +/* + * Copyright 2017 Red Hat Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <core/memory.h> + +int +gk20a_vmm_aper(enum nvkm_memory_target target) +{ + switch (target) { + case NVKM_MEM_TARGET_NCOH: return 0; + default: + return -EINVAL; + } +} + +static const struct nvkm_vmm_func +gk20a_vmm_17 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 17, &gk104_vmm_desc_17_17[0], NVKM_VMM_PAGE_xxHC }, + { 12, &gk104_vmm_desc_17_12[0], NVKM_VMM_PAGE_xxHx }, + {} + } +}; + +static const struct nvkm_vmm_func +gk20a_vmm_16 = { + .join = gf100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 16, &gk104_vmm_desc_16_16[0], NVKM_VMM_PAGE_xxHC }, + { 12, &gk104_vmm_desc_16_12[0], NVKM_VMM_PAGE_xxHx }, + {} + } +}; + +int +gk20a_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return gf100_vmm_new_(&gk20a_vmm_16, &gk20a_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm200.c new file mode 100644 index 000000000000..a1676a4644fe --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm200.c @@ -0,0 +1,185 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <nvif/ifb00d.h> +#include <nvif/unpack.h> + +static void +gm200_vmm_pgt_sparse(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + /* VALID_FALSE + VOL tells the MMU to treat the PTE as sparse. */ + VMM_FO064(pt, vmm, ptei * 8, BIT_ULL(32) /* VOL. */, ptes); +} + +static const struct nvkm_vmm_desc_func +gm200_vmm_spt = { + .unmap = gf100_vmm_pgt_unmap, + .sparse = gm200_vmm_pgt_sparse, + .mem = gf100_vmm_pgt_mem, + .dma = gf100_vmm_pgt_dma, + .sgl = gf100_vmm_pgt_sgl, +}; + +static const struct nvkm_vmm_desc_func +gm200_vmm_lpt = { + .invalid = gk104_vmm_lpt_invalid, + .unmap = gf100_vmm_pgt_unmap, + .sparse = gm200_vmm_pgt_sparse, + .mem = gf100_vmm_pgt_mem, +}; + +static void +gm200_vmm_pgd_sparse(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 pdei, u32 pdes) +{ + /* VALID_FALSE + VOL_BIG tells the MMU to treat the PDE as sparse. */ + VMM_FO064(pt, vmm, pdei * 8, BIT_ULL(35) /* VOL_BIG. */, pdes); +} + +static const struct nvkm_vmm_desc_func +gm200_vmm_pgd = { + .unmap = gf100_vmm_pgt_unmap, + .sparse = gm200_vmm_pgd_sparse, + .pde = gf100_vmm_pgd_pde, +}; + +const struct nvkm_vmm_desc +gm200_vmm_desc_17_12[] = { + { SPT, 15, 8, 0x1000, &gm200_vmm_spt }, + { PGD, 13, 8, 0x1000, &gm200_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gm200_vmm_desc_17_17[] = { + { LPT, 10, 8, 0x1000, &gm200_vmm_lpt }, + { PGD, 13, 8, 0x1000, &gm200_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gm200_vmm_desc_16_12[] = { + { SPT, 14, 8, 0x1000, &gm200_vmm_spt }, + { PGD, 14, 8, 0x1000, &gm200_vmm_pgd }, + {} +}; + +const struct nvkm_vmm_desc +gm200_vmm_desc_16_16[] = { + { LPT, 10, 8, 0x1000, &gm200_vmm_lpt }, + { PGD, 14, 8, 0x1000, &gm200_vmm_pgd }, + {} +}; + +int +gm200_vmm_join_(struct nvkm_vmm *vmm, struct nvkm_memory *inst, u64 base) +{ + if (vmm->func->page[1].shift == 16) + base |= BIT_ULL(11); + return gf100_vmm_join_(vmm, inst, base); +} + +int +gm200_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + return gm200_vmm_join_(vmm, inst, 0); +} + +static const struct nvkm_vmm_func +gm200_vmm_17 = { + .join = gm200_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 27, &gm200_vmm_desc_17_17[1], NVKM_VMM_PAGE_Sxxx }, + { 17, &gm200_vmm_desc_17_17[0], NVKM_VMM_PAGE_SVxC }, + { 12, &gm200_vmm_desc_17_12[0], NVKM_VMM_PAGE_SVHx }, + {} + } +}; + +static const struct nvkm_vmm_func +gm200_vmm_16 = { + .join = gm200_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 27, &gm200_vmm_desc_16_16[1], NVKM_VMM_PAGE_Sxxx }, + { 16, &gm200_vmm_desc_16_16[0], NVKM_VMM_PAGE_SVxC }, + { 12, &gm200_vmm_desc_16_12[0], NVKM_VMM_PAGE_SVHx }, + {} + } +}; + +int +gm200_vmm_new_(const struct nvkm_vmm_func *func_16, + const struct nvkm_vmm_func *func_17, + struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + const struct nvkm_vmm_func *func; + union { + struct gm200_vmm_vn vn; + struct gm200_vmm_v0 v0; + } *args = argv; + int ret = -ENOSYS; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + switch 
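+		/* Userspace selects the big-page size at VMM creation:
+		 * 16 for 64KiB pages, 17 for 128KiB.  Unversioned args
+		 * fall back to the 128KiB layout below.
+		 */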
(args->v0.bigpage) { + case 16: func = func_16; break; + case 17: func = func_17; break; + default: + return -EINVAL; + } + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + func = func_17; + } else + return ret; + + return nvkm_vmm_new_(func, mmu, 0, addr, size, key, name, pvmm); +} + +int +gm200_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return gm200_vmm_new_(&gm200_vmm_16, &gm200_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} + +int +gm200_vmm_new_fixed(struct nvkm_mmu *mmu, u64 addr, u64 size, + void *argv, u32 argc, struct lock_class_key *key, + const char *name, struct nvkm_vmm **pvmm) +{ + return gf100_vmm_new_(&gm200_vmm_16, &gm200_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm20b.c new file mode 100644 index 000000000000..64d4b6cff8dd --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgm20b.c @@ -0,0 +1,70 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "vmm.h" + +static const struct nvkm_vmm_func +gm20b_vmm_17 = { + .join = gm200_vmm_join, + .part = gf100_vmm_part, + .aper = gk20a_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 27, &gm200_vmm_desc_17_17[1], NVKM_VMM_PAGE_Sxxx }, + { 17, &gm200_vmm_desc_17_17[0], NVKM_VMM_PAGE_SxHC }, + { 12, &gm200_vmm_desc_17_12[0], NVKM_VMM_PAGE_SxHx }, + {} + } +}; + +static const struct nvkm_vmm_func +gm20b_vmm_16 = { + .join = gm200_vmm_join, + .part = gf100_vmm_part, + .aper = gk20a_vmm_aper, + .valid = gf100_vmm_valid, + .flush = gf100_vmm_flush, + .page = { + { 27, &gm200_vmm_desc_16_16[1], NVKM_VMM_PAGE_Sxxx }, + { 16, &gm200_vmm_desc_16_16[0], NVKM_VMM_PAGE_SxHC }, + { 12, &gm200_vmm_desc_16_12[0], NVKM_VMM_PAGE_SxHx }, + {} + } +}; + +int +gm20b_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return gm200_vmm_new_(&gm20b_vmm_16, &gm20b_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} + +int +gm20b_vmm_new_fixed(struct nvkm_mmu *mmu, u64 addr, u64 size, + void *argv, u32 argc, struct lock_class_key *key, + const char *name, struct nvkm_vmm **pvmm) +{ + return gf100_vmm_new_(&gm20b_vmm_16, &gm20b_vmm_17, mmu, addr, + size, argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c new file mode 100644 index 000000000000..059fafe0e771 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c @@ -0,0 +1,347 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "vmm.h" + +#include <subdev/fb.h> +#include <subdev/ltc.h> + +#include <nvif/ifc00d.h> +#include <nvif/unpack.h> + +static inline void +gp100_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u64 data = (addr >> 4) | map->type; + + map->type += ptes * map->ctag; + + while (ptes--) { + VMM_WO064(pt, vmm, ptei++ * 8, data); + data += map->next; + } +} + +static void +gp100_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, gp100_vmm_pgt_pte); +} + +static void +gp100_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + if (map->page->shift == PAGE_SHIFT) { + VMM_SPAM(vmm, "DMAA %08x %08x PTE(s)", ptei, ptes); + nvkm_kmap(pt->memory); + while (ptes--) { + const u64 data = (*map->dma++ >> 4) | map->type; + VMM_WO064(pt, vmm, ptei++ * 8, data); + map->type += map->ctag; + } + nvkm_done(pt->memory); + return; + } + + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, gp100_vmm_pgt_pte); +} + +static void +gp100_vmm_pgt_mem(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_MEM(vmm, pt, ptei, ptes, map, gp100_vmm_pgt_pte); +} + +static void +gp100_vmm_pgt_sparse(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + /* VALID_FALSE + VOL tells the MMU to treat the PTE as sparse. */ + VMM_FO064(pt, vmm, ptei * 8, BIT_ULL(3) /* VOL. */, ptes); +} + +static const struct nvkm_vmm_desc_func +gp100_vmm_desc_spt = { + .unmap = gf100_vmm_pgt_unmap, + .sparse = gp100_vmm_pgt_sparse, + .mem = gp100_vmm_pgt_mem, + .dma = gp100_vmm_pgt_dma, + .sgl = gp100_vmm_pgt_sgl, +}; + +static void +gp100_vmm_lpt_invalid(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + /* VALID_FALSE + PRIV tells the MMU to ignore corresponding SPTEs. */ + VMM_FO064(pt, vmm, ptei * 8, BIT_ULL(5) /* PRIV. */, ptes); +} + +static const struct nvkm_vmm_desc_func +gp100_vmm_desc_lpt = { + .invalid = gp100_vmm_lpt_invalid, + .unmap = gf100_vmm_pgt_unmap, + .sparse = gp100_vmm_pgt_sparse, + .mem = gp100_vmm_pgt_mem, +}; + +static inline void +gp100_vmm_pd0_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u64 data = (addr >> 4) | map->type; + + map->type += ptes * map->ctag; + + while (ptes--) { + VMM_WO128(pt, vmm, ptei++ * 0x10, data, 0ULL); + data += map->next; + } +} + +static void +gp100_vmm_pd0_mem(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_MEM(vmm, pt, ptei, ptes, map, gp100_vmm_pd0_pte); +} + +static inline bool +gp100_vmm_pde(struct nvkm_mmu_pt *pt, u64 *data) +{ + switch (nvkm_memory_target(pt->memory)) { + case NVKM_MEM_TARGET_VRAM: *data |= 1ULL << 1; break; + case NVKM_MEM_TARGET_HOST: *data |= 2ULL << 1; + *data |= BIT_ULL(3); /* VOL. 
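+					    * Presumably flags page tables
+					    * placed in coherent system
+					    * memory; the NCOH case below
+					    * does not set it.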
*/ + break; + case NVKM_MEM_TARGET_NCOH: *data |= 3ULL << 1; break; + default: + WARN_ON(1); + return false; + } + *data |= pt->addr >> 4; + return true; +} + +static void +gp100_vmm_pd0_pde(struct nvkm_vmm *vmm, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + struct nvkm_vmm_pt *pgt = pgd->pde[pdei]; + struct nvkm_mmu_pt *pd = pgd->pt[0]; + u64 data[2] = {}; + + if (pgt->pt[0] && !gp100_vmm_pde(pgt->pt[0], &data[0])) + return; + if (pgt->pt[1] && !gp100_vmm_pde(pgt->pt[1], &data[1])) + return; + + nvkm_kmap(pd->memory); + VMM_WO128(pd, vmm, pdei * 0x10, data[0], data[1]); + nvkm_done(pd->memory); +} + +static void +gp100_vmm_pd0_sparse(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 pdei, u32 pdes) +{ + /* VALID_FALSE + VOL_BIG tells the MMU to treat the PDE as sparse. */ + VMM_FO128(pt, vmm, pdei * 0x10, BIT_ULL(3) /* VOL_BIG. */, 0ULL, pdes); +} + +static void +gp100_vmm_pd0_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 pdei, u32 pdes) +{ + VMM_FO128(pt, vmm, pdei * 0x10, 0ULL, 0ULL, pdes); +} + +static const struct nvkm_vmm_desc_func +gp100_vmm_desc_pd0 = { + .unmap = gp100_vmm_pd0_unmap, + .sparse = gp100_vmm_pd0_sparse, + .pde = gp100_vmm_pd0_pde, + .mem = gp100_vmm_pd0_mem, +}; + +static void +gp100_vmm_pd1_pde(struct nvkm_vmm *vmm, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + struct nvkm_vmm_pt *pgt = pgd->pde[pdei]; + struct nvkm_mmu_pt *pd = pgd->pt[0]; + u64 data = 0; + + if (!gp100_vmm_pde(pgt->pt[0], &data)) + return; + + nvkm_kmap(pd->memory); + VMM_WO064(pd, vmm, pdei * 8, data); + nvkm_done(pd->memory); +} + +static const struct nvkm_vmm_desc_func +gp100_vmm_desc_pd1 = { + .unmap = gf100_vmm_pgt_unmap, + .sparse = gp100_vmm_pgt_sparse, + .pde = gp100_vmm_pd1_pde, +}; + +const struct nvkm_vmm_desc +gp100_vmm_desc_16[] = { + { LPT, 5, 8, 0x0100, &gp100_vmm_desc_lpt }, + { PGD, 8, 16, 0x1000, &gp100_vmm_desc_pd0 }, + { PGD, 9, 8, 0x1000, &gp100_vmm_desc_pd1 }, + { PGD, 9, 8, 0x1000, &gp100_vmm_desc_pd1 }, + { PGD, 2, 8, 0x1000, &gp100_vmm_desc_pd1 }, + {} +}; + +const struct nvkm_vmm_desc +gp100_vmm_desc_12[] = { + { SPT, 9, 8, 0x1000, &gp100_vmm_desc_spt }, + { PGD, 8, 16, 0x1000, &gp100_vmm_desc_pd0 }, + { PGD, 9, 8, 0x1000, &gp100_vmm_desc_pd1 }, + { PGD, 9, 8, 0x1000, &gp100_vmm_desc_pd1 }, + { PGD, 2, 8, 0x1000, &gp100_vmm_desc_pd1 }, + {} +}; + +int +gp100_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc, + struct nvkm_vmm_map *map) +{ + const enum nvkm_memory_target target = nvkm_memory_target(map->memory); + const struct nvkm_vmm_page *page = map->page; + union { + struct gp100_vmm_map_vn vn; + struct gp100_vmm_map_v0 v0; + } *args = argv; + struct nvkm_device *device = vmm->mmu->subdev.device; + struct nvkm_memory *memory = map->memory; + u8 kind, priv, ro, vol; + int kindn, aper, ret = -ENOSYS; + const u8 *kindm; + + map->next = (1ULL << page->shift) >> 4; + map->type = 0; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + vol = !!args->v0.vol; + ro = !!args->v0.ro; + priv = !!args->v0.priv; + kind = args->v0.kind; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + vol = target == NVKM_MEM_TARGET_HOST; + ro = 0; + priv = 0; + kind = 0x00; + } else { + VMM_DEBUG(vmm, "args"); + return ret; + } + + aper = vmm->func->aper(target); + if (WARN_ON(aper < 0)) + return aper; + + kindm = vmm->mmu->func->kind(vmm->mmu, &kindn); + if (kind >= kindn || kindm[kind] == 0xff) { + VMM_DEBUG(vmm, "kind %02x", kind); + return -EINVAL; + } + + if (kindm[kind] != kind) { + u64 tags = nvkm_memory_size(memory) >> 16; + if (aper != 0 || 
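+		    /* Compression is only honoured for VRAM (aperture 0)
+		     * mappings whose page size is flagged with
+		     * NVKM_VMM_PAGE_COMP.
+		     */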
!(page->type & NVKM_VMM_PAGE_COMP)) { + VMM_DEBUG(vmm, "comp %d %02x", aper, page->type); + return -EINVAL; + } + + ret = nvkm_memory_tags_get(memory, device, tags, + nvkm_ltc_tags_clear, + &map->tags); + if (ret) { + VMM_DEBUG(vmm, "comp %d", ret); + return ret; + } + + if (map->tags->mn) { + tags = map->tags->mn->offset + (map->offset >> 16); + map->ctag |= ((1ULL << page->shift) >> 16) << 36; + map->type |= tags << 36; + map->next |= map->ctag; + } else { + kind = kindm[kind]; + } + } + + map->type |= BIT(0); + map->type |= (u64)aper << 1; + map->type |= (u64) vol << 3; + map->type |= (u64)priv << 5; + map->type |= (u64) ro << 6; + map->type |= (u64)kind << 56; + return 0; +} + +void +gp100_vmm_flush(struct nvkm_vmm *vmm, int depth) +{ + gf100_vmm_flush_(vmm, 5 /* CACHE_LEVEL_UP_TO_PDE3 */ - depth); +} + +int +gp100_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + const u64 base = BIT_ULL(10) /* VER2 */ | BIT_ULL(11); /* 64KiB */ + return gf100_vmm_join_(vmm, inst, base); +} + +static const struct nvkm_vmm_func +gp100_vmm = { + .join = gp100_vmm_join, + .part = gf100_vmm_part, + .aper = gf100_vmm_aper, + .valid = gp100_vmm_valid, + .flush = gp100_vmm_flush, + .page = { + { 47, &gp100_vmm_desc_16[4], NVKM_VMM_PAGE_Sxxx }, + { 38, &gp100_vmm_desc_16[3], NVKM_VMM_PAGE_Sxxx }, + { 29, &gp100_vmm_desc_16[2], NVKM_VMM_PAGE_Sxxx }, + { 21, &gp100_vmm_desc_16[1], NVKM_VMM_PAGE_SVxC }, + { 16, &gp100_vmm_desc_16[0], NVKM_VMM_PAGE_SVxC }, + { 12, &gp100_vmm_desc_12[0], NVKM_VMM_PAGE_SVHx }, + {} + } +}; + +int +gp100_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return nv04_vmm_new_(&gp100_vmm, mmu, 0, addr, size, + argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp10b.c new file mode 100644 index 000000000000..3dcc6bddb32f --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp10b.c @@ -0,0 +1,49 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "vmm.h" + +static const struct nvkm_vmm_func +gp10b_vmm = { + .join = gp100_vmm_join, + .part = gf100_vmm_part, + .aper = gk20a_vmm_aper, + .valid = gp100_vmm_valid, + .flush = gp100_vmm_flush, + .page = { + { 47, &gp100_vmm_desc_16[4], NVKM_VMM_PAGE_Sxxx }, + { 38, &gp100_vmm_desc_16[3], NVKM_VMM_PAGE_Sxxx }, + { 29, &gp100_vmm_desc_16[2], NVKM_VMM_PAGE_Sxxx }, + { 21, &gp100_vmm_desc_16[1], NVKM_VMM_PAGE_SxHC }, + { 16, &gp100_vmm_desc_16[0], NVKM_VMM_PAGE_SxHC }, + { 12, &gp100_vmm_desc_12[0], NVKM_VMM_PAGE_SxHx }, + {} + } +}; + +int +gp10b_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return nv04_vmm_new_(&gp10b_vmm, mmu, 0, addr, size, + argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv04.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv04.c new file mode 100644 index 000000000000..0cab1ffc9f64 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv04.c @@ -0,0 +1,140 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <nvif/if000d.h> +#include <nvif/unpack.h> + +static inline void +nv04_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u32 data = addr | 0x00000003; /* PRESENT, RW. 
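+				       * in the low bits of each nv04 PTE;
+				       * PTEs start 8 bytes into the table,
+				       * after the two header words
+				       * nv04_vmm_new() writes below.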
*/ + while (ptes--) { + VMM_WO032(pt, vmm, 8 + ptei++ * 4, data); + data += 0x00001000; + } +} + +static void +nv04_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, nv04_vmm_pgt_pte); +} + +static void +nv04_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ +#if PAGE_SHIFT == 12 + nvkm_kmap(pt->memory); + while (ptes--) + VMM_WO032(pt, vmm, 8 + (ptei++ * 4), *map->dma++ | 0x00000003); + nvkm_done(pt->memory); +#else + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, nv04_vmm_pgt_pte); +#endif +} + +static void +nv04_vmm_pgt_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + VMM_FO032(pt, vmm, 8 + (ptei * 4), 0, ptes); +} + +static const struct nvkm_vmm_desc_func +nv04_vmm_desc_pgt = { + .unmap = nv04_vmm_pgt_unmap, + .dma = nv04_vmm_pgt_dma, + .sgl = nv04_vmm_pgt_sgl, +}; + +static const struct nvkm_vmm_desc +nv04_vmm_desc_12[] = { + { PGT, 15, 4, 0x1000, &nv04_vmm_desc_pgt }, + {} +}; + +int +nv04_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc, + struct nvkm_vmm_map *map) +{ + union { + struct nv04_vmm_map_vn vn; + } *args = argv; + int ret = -ENOSYS; + if ((ret = nvif_unvers(ret, &argv, &argc, args->vn))) + VMM_DEBUG(vmm, "args"); + return ret; +} + +static const struct nvkm_vmm_func +nv04_vmm = { + .valid = nv04_vmm_valid, + .page = { + { 12, &nv04_vmm_desc_12[0], NVKM_VMM_PAGE_HOST }, + {} + } +}; + +int +nv04_vmm_new_(const struct nvkm_vmm_func *func, struct nvkm_mmu *mmu, + u32 pd_header, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + union { + struct nv04_vmm_vn vn; + } *args = argv; + int ret; + + ret = nvkm_vmm_new_(func, mmu, pd_header, addr, size, key, name, pvmm); + if (ret) + return ret; + + return nvif_unvers(-ENOSYS, &argv, &argc, args->vn); +} + +int +nv04_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + struct nvkm_memory *mem; + struct nvkm_vmm *vmm; + int ret; + + ret = nv04_vmm_new_(&nv04_vmm, mmu, 8, addr, size, + argv, argc, key, name, &vmm); + *pvmm = vmm; + if (ret) + return ret; + + mem = vmm->pd->pt[0]->memory; + nvkm_kmap(mem); + nvkm_wo32(mem, 0x00000, 0x0002103d); /* PCI, RW, PT, !LN */ + nvkm_wo32(mem, 0x00004, vmm->limit - 1); + nvkm_done(mem); + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv41.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv41.c new file mode 100644 index 000000000000..b595f130e573 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv41.c @@ -0,0 +1,113 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <subdev/timer.h> + +static void +nv41_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u32 data = (addr >> 7) | 0x00000001; /* VALID. */ + while (ptes--) { + VMM_WO032(pt, vmm, ptei++ * 4, data); + data += 0x00000020; + } +} + +static void +nv41_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, nv41_vmm_pgt_pte); +} + +static void +nv41_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ +#if PAGE_SHIFT == 12 + nvkm_kmap(pt->memory); + while (ptes--) { + const u32 data = (*map->dma++ >> 7) | 0x00000001; + VMM_WO032(pt, vmm, ptei++ * 4, data); + } + nvkm_done(pt->memory); +#else + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, nv41_vmm_pgt_pte); +#endif +} + +static void +nv41_vmm_pgt_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + VMM_FO032(pt, vmm, ptei * 4, 0, ptes); +} + +static const struct nvkm_vmm_desc_func +nv41_vmm_desc_pgt = { + .unmap = nv41_vmm_pgt_unmap, + .dma = nv41_vmm_pgt_dma, + .sgl = nv41_vmm_pgt_sgl, +}; + +static const struct nvkm_vmm_desc +nv41_vmm_desc_12[] = { + { PGT, 17, 4, 0x1000, &nv41_vmm_desc_pgt }, + {} +}; + +static void +nv41_vmm_flush(struct nvkm_vmm *vmm, int level) +{ + struct nvkm_subdev *subdev = &vmm->mmu->subdev; + struct nvkm_device *device = subdev->device; + + mutex_lock(&subdev->mutex); + nvkm_wr32(device, 0x100810, 0x00000022); + nvkm_msec(device, 2000, + if (nvkm_rd32(device, 0x100810) & 0x00000020) + break; + ); + nvkm_wr32(device, 0x100810, 0x00000000); + mutex_unlock(&subdev->mutex); +} + +static const struct nvkm_vmm_func +nv41_vmm = { + .valid = nv04_vmm_valid, + .flush = nv41_vmm_flush, + .page = { + { 12, &nv41_vmm_desc_12[0], NVKM_VMM_PAGE_HOST }, + {} + } +}; + +int +nv41_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return nv04_vmm_new_(&nv41_vmm, mmu, 0, addr, size, + argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv44.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv44.c new file mode 100644 index 000000000000..b834e4352334 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv44.c @@ -0,0 +1,230 @@ +/* + * Copyright 2017 Red Hat Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <subdev/timer.h> + +static void +nv44_vmm_pgt_fill(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + dma_addr_t *list, u32 ptei, u32 ptes) +{ + u32 pteo = (ptei << 2) & ~0x0000000f; + u32 tmp[4]; + + tmp[0] = nvkm_ro32(pt->memory, pteo + 0x0); + tmp[1] = nvkm_ro32(pt->memory, pteo + 0x4); + tmp[2] = nvkm_ro32(pt->memory, pteo + 0x8); + tmp[3] = nvkm_ro32(pt->memory, pteo + 0xc); + + while (ptes--) { + u32 addr = (list ? *list++ : vmm->null) >> 12; + switch (ptei++ & 0x3) { + case 0: + tmp[0] &= ~0x07ffffff; + tmp[0] |= addr; + break; + case 1: + tmp[0] &= ~0xf8000000; + tmp[0] |= addr << 27; + tmp[1] &= ~0x003fffff; + tmp[1] |= addr >> 5; + break; + case 2: + tmp[1] &= ~0xffc00000; + tmp[1] |= addr << 22; + tmp[2] &= ~0x0001ffff; + tmp[2] |= addr >> 10; + break; + case 3: + tmp[2] &= ~0xfffe0000; + tmp[2] |= addr << 17; + tmp[3] &= ~0x00000fff; + tmp[3] |= addr >> 15; + break; + } + } + + VMM_WO032(pt, vmm, pteo + 0x0, tmp[0]); + VMM_WO032(pt, vmm, pteo + 0x4, tmp[1]); + VMM_WO032(pt, vmm, pteo + 0x8, tmp[2]); + VMM_WO032(pt, vmm, pteo + 0xc, tmp[3] | 0x40000000); +} + +static void +nv44_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + dma_addr_t tmp[4], i; + + if (ptei & 3) { + const u32 pten = min(ptes, 4 - (ptei & 3)); + for (i = 0; i < pten; i++, addr += 0x1000) + tmp[i] = addr; + nv44_vmm_pgt_fill(vmm, pt, tmp, ptei, pten); + ptei += pten; + ptes -= pten; + } + + while (ptes >= 4) { + for (i = 0; i < 4; i++, addr += 0x1000) + tmp[i] = addr >> 12; + VMM_WO032(pt, vmm, ptei++ * 4, tmp[0] >> 0 | tmp[1] << 27); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[1] >> 5 | tmp[2] << 22); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[2] >> 10 | tmp[3] << 17); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[3] >> 15 | 0x40000000); + ptes -= 4; + } + + if (ptes) { + for (i = 0; i < ptes; i++, addr += 0x1000) + tmp[i] = addr; + nv44_vmm_pgt_fill(vmm, pt, tmp, ptei, ptes); + } +} + +static void +nv44_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, nv44_vmm_pgt_pte); +} + +static void +nv44_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ +#if PAGE_SHIFT == 12 + nvkm_kmap(pt->memory); + if (ptei & 3) { + const u32 pten = min(ptes, 4 - (ptei & 3)); + nv44_vmm_pgt_fill(vmm, pt, map->dma, 
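+				   /* nv44 packs four PTEs into each
+				    * 16-byte group, with fields straddling
+				    * the 32-bit words, so unaligned head
+				    * and tail entries go through the
+				    * read-modify-write helper above. */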
ptei, pten); + ptei += pten; + ptes -= pten; + map->dma += pten; + } + + while (ptes >= 4) { + u32 tmp[4], i; + for (i = 0; i < 4; i++) + tmp[i] = *map->dma++ >> 12; + VMM_WO032(pt, vmm, ptei++ * 4, tmp[0] >> 0 | tmp[1] << 27); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[1] >> 5 | tmp[2] << 22); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[2] >> 10 | tmp[3] << 17); + VMM_WO032(pt, vmm, ptei++ * 4, tmp[3] >> 15 | 0x40000000); + ptes -= 4; + } + + if (ptes) { + nv44_vmm_pgt_fill(vmm, pt, map->dma, ptei, ptes); + map->dma += ptes; + } + nvkm_done(pt->memory); +#else + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, nv44_vmm_pgt_pte); +#endif +} + +static void +nv44_vmm_pgt_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + nvkm_kmap(pt->memory); + if (ptei & 3) { + const u32 pten = min(ptes, 4 - (ptei & 3)); + nv44_vmm_pgt_fill(vmm, pt, NULL, ptei, pten); + ptei += pten; + ptes -= pten; + } + + while (ptes > 4) { + VMM_WO032(pt, vmm, ptei++ * 4, 0x00000000); + VMM_WO032(pt, vmm, ptei++ * 4, 0x00000000); + VMM_WO032(pt, vmm, ptei++ * 4, 0x00000000); + VMM_WO032(pt, vmm, ptei++ * 4, 0x00000000); + ptes -= 4; + } + + if (ptes) + nv44_vmm_pgt_fill(vmm, pt, NULL, ptei, ptes); + nvkm_done(pt->memory); +} + +static const struct nvkm_vmm_desc_func +nv44_vmm_desc_pgt = { + .unmap = nv44_vmm_pgt_unmap, + .dma = nv44_vmm_pgt_dma, + .sgl = nv44_vmm_pgt_sgl, +}; + +static const struct nvkm_vmm_desc +nv44_vmm_desc_12[] = { + { PGT, 17, 4, 0x80000, &nv44_vmm_desc_pgt }, + {} +}; + +static void +nv44_vmm_flush(struct nvkm_vmm *vmm, int level) +{ + struct nvkm_device *device = vmm->mmu->subdev.device; + nvkm_wr32(device, 0x100814, vmm->limit - 4096); + nvkm_wr32(device, 0x100808, 0x000000020); + nvkm_msec(device, 2000, + if (nvkm_rd32(device, 0x100808) & 0x00000001) + break; + ); + nvkm_wr32(device, 0x100808, 0x00000000); +} + +static const struct nvkm_vmm_func +nv44_vmm = { + .valid = nv04_vmm_valid, + .flush = nv44_vmm_flush, + .page = { + { 12, &nv44_vmm_desc_12[0], NVKM_VMM_PAGE_HOST }, + {} + } +}; + +int +nv44_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + struct nvkm_subdev *subdev = &mmu->subdev; + struct nvkm_vmm *vmm; + int ret; + + ret = nv04_vmm_new_(&nv44_vmm, mmu, 0, addr, size, + argv, argc, key, name, &vmm); + *pvmm = vmm; + if (ret) + return ret; + + vmm->nullp = dma_alloc_coherent(subdev->device->dev, 16 * 1024, + &vmm->null, GFP_KERNEL); + if (!vmm->nullp) { + nvkm_warn(subdev, "unable to allocate dummy pages\n"); + vmm->null = 0; + } + + return 0; +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c new file mode 100644 index 000000000000..863a2edd9861 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmnv50.c @@ -0,0 +1,385 @@ +/* + * Copyright 2017 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "vmm.h" + +#include <subdev/fb.h> +#include <subdev/timer.h> +#include <engine/gr.h> + +#include <nvif/if500d.h> +#include <nvif/unpack.h> + +static inline void +nv50_vmm_pgt_pte(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map, u64 addr) +{ + u64 next = addr | map->type, data; + u32 pten; + int log2blk; + + map->type += ptes * map->ctag; + + while (ptes) { + for (log2blk = 7; log2blk >= 0; log2blk--) { + pten = 1 << log2blk; + if (ptes >= pten && IS_ALIGNED(ptei, pten)) + break; + } + + data = next | (log2blk << 7); + next += pten * map->next; + ptes -= pten; + + while (pten--) + VMM_WO064(pt, vmm, ptei++ * 8, data); + } +} + +static void +nv50_vmm_pgt_sgl(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_SGL(vmm, pt, ptei, ptes, map, nv50_vmm_pgt_pte); +} + +static void +nv50_vmm_pgt_dma(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + if (map->page->shift == PAGE_SHIFT) { + VMM_SPAM(vmm, "DMAA %08x %08x PTE(s)", ptei, ptes); + nvkm_kmap(pt->memory); + while (ptes--) { + const u64 data = *map->dma++ | map->type; + VMM_WO064(pt, vmm, ptei++ * 8, data); + map->type += map->ctag; + } + nvkm_done(pt->memory); + return; + } + + VMM_MAP_ITER_DMA(vmm, pt, ptei, ptes, map, nv50_vmm_pgt_pte); +} + +static void +nv50_vmm_pgt_mem(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) +{ + VMM_MAP_ITER_MEM(vmm, pt, ptei, ptes, map, nv50_vmm_pgt_pte); +} + +static void +nv50_vmm_pgt_unmap(struct nvkm_vmm *vmm, + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) +{ + VMM_FO064(pt, vmm, ptei * 8, 0ULL, ptes); +} + +static const struct nvkm_vmm_desc_func +nv50_vmm_pgt = { + .unmap = nv50_vmm_pgt_unmap, + .mem = nv50_vmm_pgt_mem, + .dma = nv50_vmm_pgt_dma, + .sgl = nv50_vmm_pgt_sgl, +}; + +static bool +nv50_vmm_pde(struct nvkm_vmm *vmm, struct nvkm_vmm_pt *pgt, u64 *pdata) +{ + struct nvkm_mmu_pt *pt; + u64 data = 0xdeadcafe00000000ULL; + if (pgt && (pt = pgt->pt[0])) { + switch (pgt->page) { + case 16: data = 0x00000001; break; + case 12: data = 0x00000003; + switch (nvkm_memory_size(pt->memory)) { + case 0x100000: data |= 0x00000000; break; + case 0x040000: data |= 0x00000020; break; + case 0x020000: data |= 0x00000040; break; + case 0x010000: data |= 0x00000060; break; + default: + WARN_ON(1); + return false; + } + break; + default: + WARN_ON(1); + return false; + } + + switch (nvkm_memory_target(pt->memory)) { + case NVKM_MEM_TARGET_VRAM: data |= 0x00000000; break; + case NVKM_MEM_TARGET_HOST: data |= 0x00000008; break; + case NVKM_MEM_TARGET_NCOH: data |= 0x0000000c; break; + default: + WARN_ON(1); + return false; + } + + data |= pt->addr; + } + *pdata = data; + return true; +} + +static void +nv50_vmm_pgd_pde(struct nvkm_vmm *vmm, struct nvkm_vmm_pt *pgd, u32 pdei) +{ + struct nvkm_vmm_join *join; + u32 pdeo = vmm->mmu->func->vmm.pd_offset + (pdei * 8); + u64 data; + + if (!nv50_vmm_pde(vmm, pgd->pde[pdei], 
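+			 /* nv50 has no single page directory per VMM; the
+			  * refreshed PDE is broadcast to every joined
+			  * channel's instance block just below. */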
&data)) + return; + + list_for_each_entry(join, &vmm->join, head) { + nvkm_kmap(join->inst); + nvkm_wo64(join->inst, pdeo, data); + nvkm_done(join->inst); + } +} + +static const struct nvkm_vmm_desc_func +nv50_vmm_pgd = { + .pde = nv50_vmm_pgd_pde, +}; + +static const struct nvkm_vmm_desc +nv50_vmm_desc_12[] = { + { PGT, 17, 8, 0x1000, &nv50_vmm_pgt }, + { PGD, 11, 0, 0x0000, &nv50_vmm_pgd }, + {} +}; + +static const struct nvkm_vmm_desc +nv50_vmm_desc_16[] = { + { PGT, 13, 8, 0x1000, &nv50_vmm_pgt }, + { PGD, 11, 0, 0x0000, &nv50_vmm_pgd }, + {} +}; + +static void +nv50_vmm_flush(struct nvkm_vmm *vmm, int level) +{ + struct nvkm_subdev *subdev = &vmm->mmu->subdev; + struct nvkm_device *device = subdev->device; + int i, id; + + mutex_lock(&subdev->mutex); + for (i = 0; i < NVKM_SUBDEV_NR; i++) { + if (!atomic_read(&vmm->engref[i])) + continue; + + /* unfortunate hw bug workaround... */ + if (i == NVKM_ENGINE_GR && device->gr) { + int ret = nvkm_gr_tlb_flush(device->gr); + if (ret != -ENODEV) + continue; + } + + switch (i) { + case NVKM_ENGINE_GR : id = 0x00; break; + case NVKM_ENGINE_VP : + case NVKM_ENGINE_MSPDEC: id = 0x01; break; + case NVKM_SUBDEV_BAR : id = 0x06; break; + case NVKM_ENGINE_MSPPP : + case NVKM_ENGINE_MPEG : id = 0x08; break; + case NVKM_ENGINE_BSP : + case NVKM_ENGINE_MSVLD : id = 0x09; break; + case NVKM_ENGINE_CIPHER: + case NVKM_ENGINE_SEC : id = 0x0a; break; + case NVKM_ENGINE_CE0 : id = 0x0d; break; + default: + continue; + } + + nvkm_wr32(device, 0x100c80, (id << 16) | 1); + if (nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, 0x100c80) & 0x00000001)) + break; + ) < 0) + nvkm_error(subdev, "%s mmu invalidate timeout\n", + nvkm_subdev_name[i]); + } + mutex_unlock(&subdev->mutex); +} + +static int +nv50_vmm_valid(struct nvkm_vmm *vmm, void *argv, u32 argc, + struct nvkm_vmm_map *map) +{ + const struct nvkm_vmm_page *page = map->page; + union { + struct nv50_vmm_map_vn vn; + struct nv50_vmm_map_v0 v0; + } *args = argv; + struct nvkm_device *device = vmm->mmu->subdev.device; + struct nvkm_ram *ram = device->fb->ram; + struct nvkm_memory *memory = map->memory; + u8 aper, kind, comp, priv, ro; + int kindn, ret = -ENOSYS; + const u8 *kindm; + + map->type = map->ctag = 0; + map->next = 1 << page->shift; + + if (!(ret = nvif_unpack(ret, &argv, &argc, args->v0, 0, 0, false))) { + ro = !!args->v0.ro; + priv = !!args->v0.priv; + kind = args->v0.kind & 0x7f; + comp = args->v0.comp & 0x03; + } else + if (!(ret = nvif_unvers(ret, &argv, &argc, args->vn))) { + ro = 0; + priv = 0; + kind = 0x00; + comp = 0; + } else { + VMM_DEBUG(vmm, "args"); + return ret; + } + + switch (nvkm_memory_target(memory)) { + case NVKM_MEM_TARGET_VRAM: + if (ram->stolen) { + map->type |= ram->stolen; + aper = 3; + } else { + aper = 0; + } + break; + case NVKM_MEM_TARGET_HOST: + aper = 2; + break; + case NVKM_MEM_TARGET_NCOH: + aper = 3; + break; + default: + WARN_ON(1); + return -EINVAL; + } + + kindm = vmm->mmu->func->kind(vmm->mmu, &kindn); + if (kind >= kindn || kindm[kind] == 0x7f) { + VMM_DEBUG(vmm, "kind %02x", kind); + return -EINVAL; + } + + if (map->mem && map->mem->type != kindm[kind]) { + VMM_DEBUG(vmm, "kind %02x bankswz: %d %d", kind, + kindm[kind], map->mem->type); + return -EINVAL; + } + + if (comp) { + u32 tags = (nvkm_memory_size(memory) >> 16) * comp; + if (aper != 0 || !(page->type & NVKM_VMM_PAGE_COMP)) { + VMM_DEBUG(vmm, "comp %d %02x", aper, page->type); + return -EINVAL; + } + + ret = nvkm_memory_tags_get(memory, device, tags, NULL, + &map->tags); + if (ret) { + VMM_DEBUG(vmm, 
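The per-engine invalidate in nv50_vmm_flush above is a doorbell-and-poll handshake on register 0x100c80: write (engine id << 16) | 1, then wait for the hardware to clear bit 0, giving up after roughly two seconds as the driver does. A freestanding sketch of the same handshake, assuming plain MMIO accessors and a millisecond-granular delay:

#include <linux/io.h>
#include <linux/delay.h>
#include <linux/errno.h>

static int nv50_flush_one(void __iomem *mmio, int id)
{
        int ms = 2000;

        writel((id << 16) | 0x00000001, mmio + 0x100c80);
        while (readl(mmio + 0x100c80) & 0x00000001) {
                if (--ms < 0)
                        return -ETIMEDOUT;
                mdelay(1);
        }
        return 0;
}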
"comp %d", ret); + return ret; + } + + if (map->tags->mn) { + u32 tags = map->tags->mn->offset + (map->offset >> 16); + map->ctag |= (u64)comp << 49; + map->type |= (u64)comp << 47; + map->type |= (u64)tags << 49; + map->next |= map->ctag; + } + } + + map->type |= BIT(0); /* Valid. */ + map->type |= (u64)ro << 3; + map->type |= (u64)aper << 4; + map->type |= (u64)priv << 6; + map->type |= (u64)kind << 40; + return 0; +} + +static void +nv50_vmm_part(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + struct nvkm_vmm_join *join; + + list_for_each_entry(join, &vmm->join, head) { + if (join->inst == inst) { + list_del(&join->head); + kfree(join); + break; + } + } +} + +static int +nv50_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) +{ + const u32 pd_offset = vmm->mmu->func->vmm.pd_offset; + struct nvkm_vmm_join *join; + int ret = 0; + u64 data; + u32 pdei; + + if (!(join = kmalloc(sizeof(*join), GFP_KERNEL))) + return -ENOMEM; + join->inst = inst; + list_add_tail(&join->head, &vmm->join); + + nvkm_kmap(join->inst); + for (pdei = vmm->start >> 29; pdei <= (vmm->limit - 1) >> 29; pdei++) { + if (!nv50_vmm_pde(vmm, vmm->pd->pde[pdei], &data)) { + ret = -EINVAL; + break; + } + nvkm_wo64(join->inst, pd_offset + (pdei * 8), data); + } + nvkm_done(join->inst); + return ret; +} + +static const struct nvkm_vmm_func +nv50_vmm = { + .join = nv50_vmm_join, + .part = nv50_vmm_part, + .valid = nv50_vmm_valid, + .flush = nv50_vmm_flush, + .page_block = 1 << 29, + .page = { + { 16, &nv50_vmm_desc_16[0], NVKM_VMM_PAGE_xVxC }, + { 12, &nv50_vmm_desc_12[0], NVKM_VMM_PAGE_xVHx }, + {} + } +}; + +int +nv50_vmm_new(struct nvkm_mmu *mmu, u64 addr, u64 size, void *argv, u32 argc, + struct lock_class_key *key, const char *name, + struct nvkm_vmm **pvmm) +{ + return nv04_vmm_new_(&nv50_vmm, mmu, 0, addr, size, + argv, argc, key, name, pvmm); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c index a4cb82495cee..b1b1f3626b96 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c @@ -87,7 +87,7 @@ nvkm_pci_fini(struct nvkm_subdev *subdev, bool suspend) if (pci->irq >= 0) { free_irq(pci->irq, pci); pci->irq = -1; - }; + } if (pci->agp.bridge) nvkm_agp_fini(pci); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.c index 73ca1203281d..5e91b3f90065 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.c @@ -39,7 +39,7 @@ gm200_secboot_run_blob(struct nvkm_secboot *sb, struct nvkm_gpuobj *blob, { struct gm200_secboot *gsb = gm200_secboot(sb); struct nvkm_subdev *subdev = &gsb->base.subdev; - struct nvkm_vma vma; + struct nvkm_vma *vma = NULL; u32 start_address; int ret; @@ -48,12 +48,16 @@ gm200_secboot_run_blob(struct nvkm_secboot *sb, struct nvkm_gpuobj *blob, return ret; /* Map the HS firmware so the HS bootloader can see it */ - ret = nvkm_gpuobj_map(blob, gsb->vm, NV_MEM_ACCESS_RW, &vma); + ret = nvkm_vmm_get(gsb->vmm, 12, blob->size, &vma); if (ret) { nvkm_falcon_put(falcon, subdev); return ret; } + ret = nvkm_memory_map(blob, 0, gsb->vmm, vma, NULL, 0); + if (ret) + goto end; + /* Reset and set the falcon up */ ret = nvkm_falcon_reset(falcon); if (ret) @@ -61,7 +65,7 @@ gm200_secboot_run_blob(struct nvkm_secboot *sb, struct nvkm_gpuobj *blob, nvkm_falcon_bind_context(falcon, gsb->inst); /* Load the HS bootloader into the falcon's IMEM/DMEM */ - ret = 
sb->acr->func->load(sb->acr, falcon, blob, vma.offset); + ret = sb->acr->func->load(sb->acr, falcon, blob, vma->addr); if (ret < 0) goto end; @@ -91,7 +95,7 @@ end: nvkm_mc_intr_mask(sb->subdev.device, falcon->owner->index, true); /* We don't need the ACR firmware anymore */ - nvkm_gpuobj_unmap(&vma); + nvkm_vmm_put(gsb->vmm, &vma); nvkm_falcon_put(falcon, subdev); return ret; @@ -102,37 +106,26 @@ gm200_secboot_oneinit(struct nvkm_secboot *sb) { struct gm200_secboot *gsb = gm200_secboot(sb); struct nvkm_device *device = sb->subdev.device; - struct nvkm_vm *vm; - const u64 vm_area_len = 600 * 1024; int ret; /* Allocate instance block and VM */ - ret = nvkm_gpuobj_new(device, 0x1000, 0, true, NULL, &gsb->inst); + ret = nvkm_memory_new(device, NVKM_MEM_TARGET_INST, 0x1000, 0, true, + &gsb->inst); if (ret) return ret; - ret = nvkm_gpuobj_new(device, 0x8000, 0, true, NULL, &gsb->pgd); + ret = nvkm_vmm_new(device, 0, 600 * 1024, NULL, 0, NULL, "acr", + &gsb->vmm); if (ret) return ret; - ret = nvkm_vm_new(device, 0, vm_area_len, 0, NULL, &vm); - if (ret) - return ret; - - atomic_inc(&vm->engref[NVKM_SUBDEV_PMU]); + atomic_inc(&gsb->vmm->engref[NVKM_SUBDEV_PMU]); + gsb->vmm->debug = gsb->base.subdev.debug; - ret = nvkm_vm_ref(vm, &gsb->vm, gsb->pgd); - nvkm_vm_ref(NULL, &vm, NULL); + ret = nvkm_vmm_join(gsb->vmm, gsb->inst); if (ret) return ret; - nvkm_kmap(gsb->inst); - nvkm_wo32(gsb->inst, 0x200, lower_32_bits(gsb->pgd->addr)); - nvkm_wo32(gsb->inst, 0x204, upper_32_bits(gsb->pgd->addr)); - nvkm_wo32(gsb->inst, 0x208, lower_32_bits(vm_area_len - 1)); - nvkm_wo32(gsb->inst, 0x20c, upper_32_bits(vm_area_len - 1)); - nvkm_done(gsb->inst); - if (sb->acr->func->oneinit) { ret = sb->acr->func->oneinit(sb->acr, sb); if (ret) @@ -160,9 +153,9 @@ gm200_secboot_dtor(struct nvkm_secboot *sb) sb->acr->func->dtor(sb->acr); - nvkm_vm_ref(NULL, &gsb->vm, gsb->pgd); - nvkm_gpuobj_del(&gsb->pgd); - nvkm_gpuobj_del(&gsb->inst); + nvkm_vmm_part(gsb->vmm, gsb->inst); + nvkm_vmm_unref(&gsb->vmm); + nvkm_memory_unref(&gsb->inst); return gsb; } diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.h b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.h index c8ab3d76bdef..62c5e162099a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm200.h @@ -29,9 +29,8 @@ struct gm200_secboot { struct nvkm_secboot base; /* Instance block & address space used for HS FW execution */ - struct nvkm_gpuobj *inst; - struct nvkm_gpuobj *pgd; - struct nvkm_vm *vm; + struct nvkm_memory *inst; + struct nvkm_vmm *vmm; }; #define gm200_secboot(sb) container_of(sb, struct gm200_secboot, base) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/ls_ucode_msgqueue.c b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/ls_ucode_msgqueue.c index ee989210725e..6f10b098676c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/ls_ucode_msgqueue.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/ls_ucode_msgqueue.c @@ -183,7 +183,7 @@ acr_ls_sec2_post_run(const struct nvkm_acr *acr, const struct nvkm_secboot *sb) break; ); if (reg & BIT(4)) { - nvkm_debug(subdev, "applying workaround for start bug..."); + nvkm_debug(subdev, "applying workaround for start bug...\n"); nvkm_falcon_start(sb->boot_falcon); nvkm_msec(subdev->device, 1, if ((reg = nvkm_rd32(subdev->device, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/priv.h index 885e919a8720..d9091f029506 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/priv.h 
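The gm200 secboot changes above are a straight conversion from the old nvkm_gpuobj_map() interface to the new two-step VMM one: reserve a virtual address range first, then map the backing memory into it, and release the range with nvkm_vmm_put() when done. Condensed into a hypothetical helper (nouveau nvkm headers assumed, error handling trimmed to the essentials):

/* Reserve VA with 4 KiB (shift 12) pages and map "mem" there. */
static int map_into_vmm(struct nvkm_vmm *vmm, struct nvkm_memory *mem,
                        u64 size, struct nvkm_vma **pvma)
{
        int ret = nvkm_vmm_get(vmm, 12, size, pvma);
        if (ret)
                return ret;

        ret = nvkm_memory_map(mem, 0, vmm, *pvma, NULL, 0);
        if (ret)
                nvkm_vmm_put(vmm, pvma);
        return ret;
}

The oneinit path benefits the same way: the hand-written instance-block words at 0x200..0x20c are gone because nvkm_vmm_join() now writes the page directory pointer and limit on the driver's behalf.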
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/priv.h @@ -25,6 +25,7 @@ #include <subdev/secboot.h> #include <subdev/mmu.h> +struct nvkm_gpuobj; struct nvkm_secboot_func { int (*oneinit)(struct nvkm_secboot *); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/Kbuild index 2bafcc1d1818..7ba56b12badd 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/Kbuild @@ -12,3 +12,4 @@ nvkm-y += nvkm/subdev/therm/gt215.o nvkm-y += nvkm/subdev/therm/gf119.o nvkm-y += nvkm/subdev/therm/gm107.o nvkm-y += nvkm/subdev/therm/gm200.o +nvkm-y += nvkm/subdev/therm/gp100.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c index 952a7cb0a59a..f27fc6d0d4c6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c @@ -341,7 +341,8 @@ nvkm_therm_init(struct nvkm_subdev *subdev) { struct nvkm_therm *therm = nvkm_therm(subdev); - therm->func->init(therm); + if (therm->func->init) + therm->func->init(therm); if (therm->suspend >= 0) { /* restore the pwm value only when on manual or auto mode */ diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c new file mode 100644 index 000000000000..9f0dea3f61dc --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/gp100.c @@ -0,0 +1,56 @@ +/* + * Copyright 2017 Rhys Kidd + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
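The nvkm_therm_init change above makes the .init hook optional, which is exactly what the new gp100 implementation below relies on: it provides only .temp_get and .program_alarms. The guard is the usual optional-ops pattern; a generic sketch:

struct therm_ops {
        void (*init)(void *therm);      /* may be NULL, e.g. gp100 */
        int  (*temp_get)(void *therm);
};

static void therm_init(const struct therm_ops *ops, void *therm)
{
        if (ops->init)
                ops->init(therm);
}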
+ * + * Authors: Rhys Kidd + */ +#include "priv.h" + +static int +gp100_temp_get(struct nvkm_therm *therm) +{ + struct nvkm_device *device = therm->subdev.device; + struct nvkm_subdev *subdev = &therm->subdev; + u32 tsensor = nvkm_rd32(device, 0x020460); + u32 inttemp = (tsensor & 0x0001fff8); + + /* device SHADOWed */ + if (tsensor & 0x40000000) + nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n"); + + /* device valid */ + if (tsensor & 0x20000000) + return (inttemp >> 8); + else + return -ENODEV; +} + +static const struct nvkm_therm_func +gp100_therm = { + .temp_get = gp100_temp_get, + .program_alarms = nvkm_therm_program_alarms_polling, +}; + +int +gp100_therm_new(struct nvkm_device *device, int index, + struct nvkm_therm **ptherm) +{ + return nvkm_therm_new_(&gp100_therm, device, index, ptherm); +} diff --git a/drivers/gpu/drm/qxl/qxl_cmd.c b/drivers/gpu/drm/qxl/qxl_cmd.c index 74fc9362ecf9..c0fb52c6d4ca 100644 --- a/drivers/gpu/drm/qxl/qxl_cmd.c +++ b/drivers/gpu/drm/qxl/qxl_cmd.c @@ -219,7 +219,7 @@ int qxl_garbage_collect(struct qxl_device *qdev) union qxl_release_info *info; while (qxl_ring_pop(qdev->release_ring, &id)) { - QXL_INFO(qdev, "popped %lld\n", id); + DRM_DEBUG_DRIVER("popped %lld\n", id); while (id) { release = qxl_release_from_id_locked(qdev, id); if (release == NULL) @@ -229,8 +229,8 @@ int qxl_garbage_collect(struct qxl_device *qdev) next_id = info->next; qxl_release_unmap(qdev, release, info); - QXL_INFO(qdev, "popped %lld, next %lld\n", id, - next_id); + DRM_DEBUG_DRIVER("popped %lld, next %lld\n", id, + next_id); switch (release->type) { case QXL_RELEASE_DRAWABLE: @@ -248,7 +248,7 @@ int qxl_garbage_collect(struct qxl_device *qdev) } } - QXL_INFO(qdev, "%s: %d\n", __func__, i); + DRM_DEBUG_DRIVER("%d\n", i); return i; } @@ -381,17 +381,19 @@ void qxl_io_create_primary(struct qxl_device *qdev, { struct qxl_surface_create *create; - QXL_INFO(qdev, "%s: qdev %p, ram_header %p\n", __func__, qdev, - qdev->ram_header); + DRM_DEBUG_DRIVER("qdev %p, ram_header %p\n", qdev, qdev->ram_header); create = &qdev->ram_header->create_surface; create->format = bo->surf.format; create->width = bo->surf.width; create->height = bo->surf.height; create->stride = bo->surf.stride; - create->mem = qxl_bo_physical_address(qdev, bo, offset); + if (bo->shadow) { + create->mem = qxl_bo_physical_address(qdev, bo->shadow, offset); + } else { + create->mem = qxl_bo_physical_address(qdev, bo, offset); + } - QXL_INFO(qdev, "%s: mem = %llx, from %p\n", __func__, create->mem, - bo->kptr); + DRM_DEBUG_DRIVER("mem = %llx, from %p\n", create->mem, bo->kptr); create->flags = QXL_SURF_FLAG_KEEP_DATA; create->type = QXL_SURF_TYPE_PRIMARY; @@ -401,7 +403,7 @@ void qxl_io_create_primary(struct qxl_device *qdev, void qxl_io_memslot_add(struct qxl_device *qdev, uint8_t id) { - QXL_INFO(qdev, "qxl_memslot_add %d\n", id); + DRM_DEBUG_DRIVER("qxl_memslot_add %d\n", id); wait_for_io_cmd(qdev, id, QXL_IO_MEMSLOT_ADD_ASYNC); } diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index afbf50d0c08f..4756b3c9bf2c 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -305,7 +305,9 @@ static const struct drm_crtc_funcs qxl_crtc_funcs = { void qxl_user_framebuffer_destroy(struct drm_framebuffer *fb) { struct qxl_framebuffer *qxl_fb = to_qxl_framebuffer(fb); + struct qxl_bo *bo = gem_to_qxl_bo(qxl_fb->obj); + WARN_ON(bo->shadow); drm_gem_object_unreference_unlocked(qxl_fb->obj); drm_framebuffer_cleanup(fb); kfree(qxl_fb); @@ -508,6 +510,7 @@ 
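gp100_temp_get above decodes a single TSENSOR word: bit 29 says the reading is valid, bit 30 flags a SHADOWed sensor, and the field masked by 0x0001fff8 shifted down by 8 yields whole degrees Celsius (the low masked bits are discarded by the shift). As a standalone decoder:

#include <stdint.h>
#include <errno.h>

static int gp100_decode_temp(uint32_t tsensor)
{
        uint32_t inttemp = tsensor & 0x0001fff8;

        if (!(tsensor & 0x20000000))
                return -ENODEV;         /* no valid reading */
        return inttemp >> 8;            /* degrees Celsius */
}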
static void qxl_primary_atomic_update(struct drm_plane *plane, .x2 = qfb->base.width, .y2 = qfb->base.height }; + bool same_shadow = false; if (old_state->fb) { qfb_old = to_qxl_framebuffer(old_state->fb); @@ -519,15 +522,23 @@ static void qxl_primary_atomic_update(struct drm_plane *plane, if (bo == bo_old) return; + if (bo_old && bo_old->shadow && bo->shadow && + bo_old->shadow == bo->shadow) { + same_shadow = true; + } + if (bo_old && bo_old->is_primary) { - qxl_io_destroy_primary(qdev); + if (!same_shadow) + qxl_io_destroy_primary(qdev); bo_old->is_primary = false; } if (!bo->is_primary) { - qxl_io_create_primary(qdev, 0, bo); + if (!same_shadow) + qxl_io_create_primary(qdev, 0, bo); bo->is_primary = true; } + qxl_draw_dirty_fb(qdev, qfb, bo, 0, 0, &norect, 1, 1); } @@ -679,8 +690,9 @@ static void qxl_cursor_atomic_disable(struct drm_plane *plane, static int qxl_plane_prepare_fb(struct drm_plane *plane, struct drm_plane_state *new_state) { + struct qxl_device *qdev = plane->dev->dev_private; struct drm_gem_object *obj; - struct qxl_bo *user_bo; + struct qxl_bo *user_bo, *old_bo = NULL; int ret; if (!new_state->fb) @@ -689,6 +701,32 @@ static int qxl_plane_prepare_fb(struct drm_plane *plane, obj = to_qxl_framebuffer(new_state->fb)->obj; user_bo = gem_to_qxl_bo(obj); + if (plane->type == DRM_PLANE_TYPE_PRIMARY && + user_bo->is_dumb && !user_bo->shadow) { + if (plane->state->fb) { + obj = to_qxl_framebuffer(plane->state->fb)->obj; + old_bo = gem_to_qxl_bo(obj); + } + if (old_bo && old_bo->shadow && + user_bo->gem_base.size == old_bo->gem_base.size && + plane->state->crtc == new_state->crtc && + plane->state->crtc_w == new_state->crtc_w && + plane->state->crtc_h == new_state->crtc_h && + plane->state->src_x == new_state->src_x && + plane->state->src_y == new_state->src_y && + plane->state->src_w == new_state->src_w && + plane->state->src_h == new_state->src_h && + plane->state->rotation == new_state->rotation && + plane->state->zpos == new_state->zpos) { + drm_gem_object_get(&old_bo->shadow->gem_base); + user_bo->shadow = old_bo->shadow; + } else { + qxl_bo_create(qdev, user_bo->gem_base.size, + true, true, QXL_GEM_DOMAIN_VRAM, NULL, + &user_bo->shadow); + } + } + ret = qxl_bo_pin(user_bo, QXL_GEM_DOMAIN_CPU, NULL); if (ret) return ret; @@ -713,6 +751,11 @@ static void qxl_plane_cleanup_fb(struct drm_plane *plane, obj = to_qxl_framebuffer(old_state->fb)->obj; user_bo = gem_to_qxl_bo(obj); qxl_bo_unpin(user_bo); + + if (user_bo->shadow && !user_bo->is_primary) { + drm_gem_object_put_unlocked(&user_bo->shadow->gem_base); + user_bo->shadow = NULL; + } } static const uint32_t qxl_cursor_plane_formats[] = { diff --git a/drivers/gpu/drm/qxl/qxl_drv.h b/drivers/gpu/drm/qxl/qxl_drv.h index 3397a1907336..08752c0ffb35 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.h +++ b/drivers/gpu/drm/qxl/qxl_drv.h @@ -62,33 +62,9 @@ #define QXL_DEBUGFS_MAX_COMPONENTS 32 -extern int qxl_log_level; extern int qxl_num_crtc; extern int qxl_max_ioctls; -enum { - QXL_INFO_LEVEL = 1, - QXL_DEBUG_LEVEL = 2, -}; - -#define QXL_INFO(qdev, fmt, ...) do { \ - if (qxl_log_level >= QXL_INFO_LEVEL) { \ - qxl_io_log(qdev, fmt, __VA_ARGS__); \ - } \ - } while (0) -#define QXL_DEBUG(qdev, fmt, ...) do { \ - if (qxl_log_level >= QXL_DEBUG_LEVEL) { \ - qxl_io_log(qdev, fmt, __VA_ARGS__); \ - } \ - } while (0) -#define QXL_INFO_ONCE(qdev, fmt, ...) 
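The prepare_fb logic above reuses the previous framebuffer's shadow BO only when the new state is pixel-for-pixel compatible with the old one; any geometry change forces a fresh VRAM shadow. The condition, condensed into a hypothetical flattened-parameter predicate:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct plane_geom {
        void *crtc;
        uint32_t crtc_w, crtc_h;
        uint32_t src_x, src_y, src_w, src_h;
        unsigned int rotation, zpos;
};

static bool can_reuse_shadow(const struct plane_geom *o,
                             const struct plane_geom *n,
                             size_t old_size, size_t new_size)
{
        return old_size == new_size &&
               o->crtc == n->crtc &&
               o->crtc_w == n->crtc_w && o->crtc_h == n->crtc_h &&
               o->src_x == n->src_x && o->src_y == n->src_y &&
               o->src_w == n->src_w && o->src_h == n->src_h &&
               o->rotation == n->rotation && o->zpos == n->zpos;
}

Together with the same_shadow test in qxl_primary_atomic_update, this is what lets a dumb-buffer page flip avoid a destroy/create cycle of the primary surface.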
do { \ - static int done; \ - if (!done) { \ - done = 1; \ - QXL_INFO(qdev, fmt, __VA_ARGS__); \ - } \ - } while (0) - #define DRM_FILE_OFFSET 0x100000000ULL #define DRM_FILE_PAGE_OFFSET (DRM_FILE_OFFSET >> PAGE_SHIFT) @@ -113,6 +89,8 @@ struct qxl_bo { /* Constant after initialization */ struct drm_gem_object gem_base; bool is_primary; /* is this now a primary surface */ + bool is_dumb; + struct qxl_bo *shadow; bool hw_surf_alloc; struct qxl_surface surf; uint32_t surface_id; @@ -351,7 +329,7 @@ int qxl_check_idle(struct qxl_ring *ring); static inline void * qxl_fb_virtual_address(struct qxl_device *qdev, unsigned long physical) { - QXL_INFO(qdev, "not implemented (%lu)\n", physical); + DRM_DEBUG_DRIVER("not implemented (%lu)\n", physical); return 0; } diff --git a/drivers/gpu/drm/qxl/qxl_dumb.c b/drivers/gpu/drm/qxl/qxl_dumb.c index 5e65d5d2d937..11085ab01374 100644 --- a/drivers/gpu/drm/qxl/qxl_dumb.c +++ b/drivers/gpu/drm/qxl/qxl_dumb.c @@ -63,6 +63,7 @@ int qxl_mode_dumb_create(struct drm_file *file_priv, &handle); if (r) return r; + qobj->is_dumb = true; args->pitch = pitch; args->handle = handle; return 0; diff --git a/drivers/gpu/drm/qxl/qxl_fb.c b/drivers/gpu/drm/qxl/qxl_fb.c index 844c4a31ca13..23af3e352673 100644 --- a/drivers/gpu/drm/qxl/qxl_fb.c +++ b/drivers/gpu/drm/qxl/qxl_fb.c @@ -240,18 +240,15 @@ static int qxlfb_create(struct qxl_fbdev *qfbdev, return ret; qbo = gem_to_qxl_bo(gobj); - QXL_INFO(qdev, "%s: %dx%d %d\n", __func__, mode_cmd.width, - mode_cmd.height, mode_cmd.pitches[0]); + DRM_DEBUG_DRIVER("%dx%d %d\n", mode_cmd.width, + mode_cmd.height, mode_cmd.pitches[0]); shadow = vmalloc(mode_cmd.pitches[0] * mode_cmd.height); /* TODO: what's the usual response to memory allocation errors? */ BUG_ON(!shadow); - QXL_INFO(qdev, - "surface0 at gpu offset %lld, mmap_offset %lld (virt %p, shadow %p)\n", - qxl_bo_gpu_offset(qbo), - qxl_bo_mmap_offset(qbo), - qbo->kptr, - shadow); + DRM_DEBUG_DRIVER("surface0 at gpu offset %lld, mmap_offset %lld (virt %p, shadow %p)\n", + qxl_bo_gpu_offset(qbo), qxl_bo_mmap_offset(qbo), + qbo->kptr, shadow); size = mode_cmd.pitches[0] * mode_cmd.height; info = drm_fb_helper_alloc_fbi(&qfbdev->helper); diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index e6ec845b5be0..a6da6fa6ad58 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -154,7 +154,7 @@ qxl_release_alloc(struct qxl_device *qdev, int type, return handle; } *ret = release; - QXL_INFO(qdev, "allocated release %d\n", handle); + DRM_DEBUG_DRIVER("allocated release %d\n", handle); release->id = handle; return handle; } @@ -179,8 +179,7 @@ void qxl_release_free(struct qxl_device *qdev, struct qxl_release *release) { - QXL_INFO(qdev, "release %d, type %d\n", release->id, - release->type); + DRM_DEBUG_DRIVER("release %d, type %d\n", release->id, release->type); if (release->surface_release_id) qxl_surface_id_dealloc(qdev, release->surface_release_id); diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 7ecf8a4b9fe6..ab4823875311 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -136,8 +136,8 @@ int qxl_mmap(struct file *filp, struct vm_area_struct *vma) "filp->private_data->minor->dev->dev_private == NULL\n"); return -EINVAL; } - QXL_INFO(qdev, "%s: filp->private_data = 0x%p, vma->vm_pgoff = %lx\n", - __func__, filp->private_data, vma->vm_pgoff); + DRM_DEBUG_DRIVER("filp->private_data = 0x%p, vma->vm_pgoff = %lx\n", + filp->private_data, vma->vm_pgoff); r = 
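Every QXL_INFO/QXL_DEBUG site in this patch converts the same way: the driver-private qxl_log_level gate (and its detour through qxl_io_log) is dropped in favour of DRM's standard dynamic debug, where DRM_DEBUG_DRIVER output is controlled by the drm.debug module parameter (driver messages are bit 0x02). The shape of the conversion:

/* before: printed via qxl_io_log() when qxl_log_level was raised */
QXL_INFO(qdev, "allocated release %d\n", handle);

/* after: gated by drm.debug, no qdev argument needed */
DRM_DEBUG_DRIVER("allocated release %d\n", handle);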
ttm_bo_mmap(filp, vma, &qdev->mman.bdev); if (unlikely(r != 0)) diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index be16c6390216..cf3e5985e3e7 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -102,8 +102,7 @@ radeon-y += \ radeon-y += \ radeon_vce.o \ vce_v1_0.o \ - vce_v2_0.o \ - radeon_kfd.o + vce_v2_0.o radeon-$(CONFIG_VGA_SWITCHEROO) += radeon_atpx_handler.o radeon-$(CONFIG_ACPI) += radeon_acpi.o diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index a904c80c30e6..3e798593e042 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -45,34 +45,32 @@ static char *pre_emph_names[] = { /***** radeon AUX functions *****/ -/* Atom needs data in little endian format - * so swap as appropriate when copying data to - * or from atom. Note that atom operates on - * dw units. +/* Atom needs data in little endian format so swap as appropriate when copying + * data to or from atom. Note that atom operates on dw units. + * + * Use to_le=true when sending data to atom and provide at least + * ALIGN(num_bytes,4) bytes in the dst buffer. + * + * Use to_le=false when receiving data from atom and provide ALIGN(num_bytes,4) + * byes in the src buffer. */ void radeon_atom_copy_swap(u8 *dst, u8 *src, u8 num_bytes, bool to_le) { #ifdef __BIG_ENDIAN - u8 src_tmp[20], dst_tmp[20]; /* used for byteswapping */ - u32 *dst32, *src32; + u32 src_tmp[5], dst_tmp[5]; int i; + u8 align_num_bytes = ALIGN(num_bytes, 4); - memcpy(src_tmp, src, num_bytes); - src32 = (u32 *)src_tmp; - dst32 = (u32 *)dst_tmp; if (to_le) { - for (i = 0; i < ((num_bytes + 3) / 4); i++) - dst32[i] = cpu_to_le32(src32[i]); - memcpy(dst, dst_tmp, num_bytes); + memcpy(src_tmp, src, num_bytes); + for (i = 0; i < align_num_bytes / 4; i++) + dst_tmp[i] = cpu_to_le32(src_tmp[i]); + memcpy(dst, dst_tmp, align_num_bytes); } else { - u8 dws = num_bytes & ~3; - for (i = 0; i < ((num_bytes + 3) / 4); i++) - dst32[i] = le32_to_cpu(src32[i]); - memcpy(dst, dst_tmp, dws); - if (num_bytes % 4) { - for (i = 0; i < (num_bytes % 4); i++) - dst[dws+i] = dst_tmp[dws+i]; - } + memcpy(src_tmp, src, align_num_bytes); + for (i = 0; i < align_num_bytes / 4; i++) + dst_tmp[i] = le32_to_cpu(src_tmp[i]); + memcpy(dst, dst_tmp, num_bytes); } #else memcpy(dst, src, num_bytes); diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 3cb6c55b268d..898f9a078830 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -33,7 +33,6 @@ #include "cik_blit_shaders.h" #include "radeon_ucode.h" #include "clearstate_ci.h" -#include "radeon_kfd.h" #define SH_MEM_CONFIG_GFX_DEFAULT \ ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) @@ -5684,10 +5683,9 @@ int cik_vm_init(struct radeon_device *rdev) /* * number of VMs * VMID 0 is reserved for System - * radeon graphics/compute will use VMIDs 1-7 - * amdkfd will use VMIDs 8-15 + * radeon graphics/compute will use VMIDs 1-15 */ - rdev->vm_manager.nvm = RADEON_NUM_OF_VMIDS; + rdev->vm_manager.nvm = 16; /* base offset of vram pages */ if (rdev->flags & RADEON_IS_IGP) { u64 tmp = RREG32(MC_VM_FB_OFFSET); @@ -7589,9 +7587,6 @@ restart_ih: /* wptr/rptr are in bytes! 
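The radeon_atom_copy_swap() rewrite above simplifies the big-endian path by leaning on the contract spelled out in its new comment: callers must provide ALIGN(num_bytes, 4) bytes of room, so the copy can always round up to whole dwords and swap through fixed 20-byte bounce buffers. A host-side model of the to_le direction (__builtin_bswap32 standing in for cpu_to_le32 on a big-endian machine):

#include <stdint.h>
#include <string.h>

static void copy_swap_to_le(uint8_t *dst, const uint8_t *src,
                            uint8_t num_bytes)
{
        uint32_t src_tmp[5], dst_tmp[5];        /* up to 20 bytes, as in the driver */
        uint8_t align_num_bytes = (num_bytes + 3) & ~3;
        unsigned int i;

        memcpy(src_tmp, src, num_bytes);        /* pad bytes stay unspecified */
        for (i = 0; i < align_num_bytes / 4u; i++)
                dst_tmp[i] = __builtin_bswap32(src_tmp[i]);
        memcpy(dst, dst_tmp, align_num_bytes);
}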
*/ ring_index = rptr / 4; - radeon_kfd_interrupt(rdev, - (const void *) &rdev->ih.ring[ring_index]); - src_id = le32_to_cpu(rdev->ih.ring[ring_index]) & 0xff; src_data = le32_to_cpu(rdev->ih.ring[ring_index + 1]) & 0xfffffff; ring_id = le32_to_cpu(rdev->ih.ring[ring_index + 2]) & 0xff; @@ -8486,10 +8481,6 @@ static int cik_startup(struct radeon_device *rdev) if (r) return r; - r = radeon_kfd_resume(rdev); - if (r) - return r; - return 0; } @@ -8538,7 +8529,6 @@ int cik_resume(struct radeon_device *rdev) */ int cik_suspend(struct radeon_device *rdev) { - radeon_kfd_suspend(rdev); radeon_pm_suspend(rdev); radeon_audio_fini(rdev); radeon_vm_manager_fini(rdev); diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h index e21015475ed5..cda16fcd43bb 100644 --- a/drivers/gpu/drm/radeon/cikd.h +++ b/drivers/gpu/drm/radeon/cikd.h @@ -30,8 +30,6 @@ #define CIK_RB_BITMAP_WIDTH_PER_SH 2 #define HAWAII_RB_BITMAP_WIDTH_PER_SH 4 -#define RADEON_NUM_OF_VMIDS 8 - /* DIDT IND registers */ #define DIDT_SQ_CTRL0 0x0 # define DIDT_CTRL_EN (1 << 0) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 8cbaeec090c9..a8e546569858 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2456,9 +2456,6 @@ struct radeon_device { u64 vram_pin_size; u64 gart_pin_size; - /* amdkfd interface */ - struct kfd_dev *kfd; - struct mutex mn_lock; DECLARE_HASHTABLE(mn_hash, 7); }; diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index f4becad0a78c..31dd04f6baa1 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -43,7 +43,6 @@ #include <drm/drm_fb_helper.h> #include <drm/drm_crtc_helper.h> -#include "radeon_kfd.h" /* * KMS wrapper. @@ -338,14 +337,6 @@ static int radeon_pci_probe(struct pci_dev *pdev, { int ret; - /* - * Initialize amdkfd before starting radeon. If it was not loaded yet, - * defer radeon probing - */ - ret = radeon_kfd_init(); - if (ret == -EPROBE_DEFER) - return ret; - if (vga_switcheroo_client_probe_defer(pdev)) return -EPROBE_DEFER; @@ -645,7 +636,6 @@ static int __init radeon_init(void) static void __exit radeon_exit(void) { - radeon_kfd_fini(); pci_unregister_driver(pdriver); radeon_unregister_atpx_handler(); } diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c deleted file mode 100644 index 385b4d76956d..000000000000 --- a/drivers/gpu/drm/radeon/radeon_kfd.c +++ /dev/null @@ -1,901 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include <linux/module.h> -#include <linux/fdtable.h> -#include <linux/uaccess.h> -#include <drm/drmP.h> -#include "radeon.h" -#include "cikd.h" -#include "cik_reg.h" -#include "radeon_kfd.h" -#include "radeon_ucode.h" -#include <linux/firmware.h> -#include "cik_structs.h" - -#define CIK_PIPE_PER_MEC (4) - -static const uint32_t watchRegs[MAX_WATCH_ADDRESSES * ADDRESS_WATCH_REG_MAX] = { - TCP_WATCH0_ADDR_H, TCP_WATCH0_ADDR_L, TCP_WATCH0_CNTL, - TCP_WATCH1_ADDR_H, TCP_WATCH1_ADDR_L, TCP_WATCH1_CNTL, - TCP_WATCH2_ADDR_H, TCP_WATCH2_ADDR_L, TCP_WATCH2_CNTL, - TCP_WATCH3_ADDR_H, TCP_WATCH3_ADDR_L, TCP_WATCH3_CNTL -}; - -struct kgd_mem { - struct radeon_bo *bo; - uint64_t gpu_addr; - void *cpu_ptr; -}; - - -static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr); - -static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); - -static uint64_t get_vmem_size(struct kgd_dev *kgd); -static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); - -static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); - -static int alloc_pasid(unsigned int bits); -static void free_pasid(unsigned int pasid); - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); - -/* - * Register access functions - */ - -static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases); - -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid); - -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr); -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id); - -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id); -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int timeout); -static int kgd_address_watch_disable(struct kgd_dev *kgd); -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo); -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd); -static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset); - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid); -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid); -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid); - -static const struct kfd2kgd_calls kfd2kgd = { - .init_gtt_mem_allocation = alloc_gtt_mem, - .free_gtt_mem = free_gtt_mem, - 
.get_vmem_size = get_vmem_size, - .get_gpu_clock_counter = get_gpu_clock_counter, - .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, - .alloc_pasid = alloc_pasid, - .free_pasid = free_pasid, - .program_sh_mem_settings = kgd_program_sh_mem_settings, - .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, - .init_pipeline = kgd_init_pipeline, - .init_interrupts = kgd_init_interrupts, - .hqd_load = kgd_hqd_load, - .hqd_sdma_load = kgd_hqd_sdma_load, - .hqd_is_occupied = kgd_hqd_is_occupied, - .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, - .hqd_destroy = kgd_hqd_destroy, - .hqd_sdma_destroy = kgd_hqd_sdma_destroy, - .address_watch_disable = kgd_address_watch_disable, - .address_watch_execute = kgd_address_watch_execute, - .wave_control_execute = kgd_wave_control_execute, - .address_watch_get_offset = kgd_address_watch_get_offset, - .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid, - .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid, - .write_vmid_invalidate_request = write_vmid_invalidate_request, - .get_fw_version = get_fw_version -}; - -static const struct kgd2kfd_calls *kgd2kfd; - -int radeon_kfd_init(void) -{ - int ret; - -#if defined(CONFIG_HSA_AMD_MODULE) - int (*kgd2kfd_init_p)(unsigned, const struct kgd2kfd_calls**); - - kgd2kfd_init_p = symbol_request(kgd2kfd_init); - - if (kgd2kfd_init_p == NULL) - return -ENOENT; - - ret = kgd2kfd_init_p(KFD_INTERFACE_VERSION, &kgd2kfd); - if (ret) { - symbol_put(kgd2kfd_init); - kgd2kfd = NULL; - } - -#elif defined(CONFIG_HSA_AMD) - ret = kgd2kfd_init(KFD_INTERFACE_VERSION, &kgd2kfd); - if (ret) - kgd2kfd = NULL; - -#else - ret = -ENOENT; -#endif - - return ret; -} - -void radeon_kfd_fini(void) -{ - if (kgd2kfd) { - kgd2kfd->exit(); - symbol_put(kgd2kfd_init); - } -} - -void radeon_kfd_device_probe(struct radeon_device *rdev) -{ - if (kgd2kfd) - rdev->kfd = kgd2kfd->probe((struct kgd_dev *)rdev, - rdev->pdev, &kfd2kgd); -} - -void radeon_kfd_device_init(struct radeon_device *rdev) -{ - int i, queue, pipe, mec; - - if (rdev->kfd) { - struct kgd2kfd_shared_resources gpu_resources = { - .compute_vmid_bitmap = 0xFF00, - .num_pipe_per_mec = 4, - .num_queue_per_pipe = 8 - }; - - bitmap_zero(gpu_resources.queue_bitmap, KGD_MAX_QUEUES); - - for (i = 0; i < KGD_MAX_QUEUES; ++i) { - queue = i % gpu_resources.num_queue_per_pipe; - pipe = (i / gpu_resources.num_queue_per_pipe) - % gpu_resources.num_pipe_per_mec; - mec = (i / gpu_resources.num_queue_per_pipe) - / gpu_resources.num_pipe_per_mec; - - if (mec == 0 && pipe > 0) - set_bit(i, gpu_resources.queue_bitmap); - } - - radeon_doorbell_get_kfd_info(rdev, - &gpu_resources.doorbell_physical_address, - &gpu_resources.doorbell_aperture_size, - &gpu_resources.doorbell_start_offset); - - kgd2kfd->device_init(rdev->kfd, &gpu_resources); - } -} - -void radeon_kfd_device_fini(struct radeon_device *rdev) -{ - if (rdev->kfd) { - kgd2kfd->device_exit(rdev->kfd); - rdev->kfd = NULL; - } -} - -void radeon_kfd_interrupt(struct radeon_device *rdev, const void *ih_ring_entry) -{ - if (rdev->kfd) - kgd2kfd->interrupt(rdev->kfd, ih_ring_entry); -} - -void radeon_kfd_suspend(struct radeon_device *rdev) -{ - if (rdev->kfd) - kgd2kfd->suspend(rdev->kfd); -} - -int radeon_kfd_resume(struct radeon_device *rdev) -{ - int r = 0; - - if (rdev->kfd) - r = kgd2kfd->resume(rdev->kfd); - - return r; -} - -static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, - void **mem_obj, uint64_t *gpu_addr, - void **cpu_ptr) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - 
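The deleted radeon_kfd_device_init above carved the flat KGD queue index into (mec, pipe, queue) coordinates, then handed amdkfd every queue on MEC0 whose pipe was not pipe 0. The decomposition, restated as a standalone sketch with the same constants:

/* 8 queues per pipe, 4 pipes per MEC, as in the deleted code. */
static void decompose_queue_index(int i, int *mec, int *pipe, int *queue)
{
        const int num_queue_per_pipe = 8;
        const int num_pipe_per_mec = 4;

        *queue = i % num_queue_per_pipe;
        *pipe = (i / num_queue_per_pipe) % num_pipe_per_mec;
        *mec = (i / num_queue_per_pipe) / num_pipe_per_mec;
}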
struct kgd_mem **mem = (struct kgd_mem **) mem_obj; - int r; - - BUG_ON(kgd == NULL); - BUG_ON(gpu_addr == NULL); - BUG_ON(cpu_ptr == NULL); - - *mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL); - if ((*mem) == NULL) - return -ENOMEM; - - r = radeon_bo_create(rdev, size, PAGE_SIZE, true, RADEON_GEM_DOMAIN_GTT, - RADEON_GEM_GTT_WC, NULL, NULL, &(*mem)->bo); - if (r) { - dev_err(rdev->dev, - "failed to allocate BO for amdkfd (%d)\n", r); - return r; - } - - /* map the buffer */ - r = radeon_bo_reserve((*mem)->bo, true); - if (r) { - dev_err(rdev->dev, "(%d) failed to reserve bo for amdkfd\n", r); - goto allocate_mem_reserve_bo_failed; - } - - r = radeon_bo_pin((*mem)->bo, RADEON_GEM_DOMAIN_GTT, - &(*mem)->gpu_addr); - if (r) { - dev_err(rdev->dev, "(%d) failed to pin bo for amdkfd\n", r); - goto allocate_mem_pin_bo_failed; - } - *gpu_addr = (*mem)->gpu_addr; - - r = radeon_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr); - if (r) { - dev_err(rdev->dev, - "(%d) failed to map bo to kernel for amdkfd\n", r); - goto allocate_mem_kmap_bo_failed; - } - *cpu_ptr = (*mem)->cpu_ptr; - - radeon_bo_unreserve((*mem)->bo); - - return 0; - -allocate_mem_kmap_bo_failed: - radeon_bo_unpin((*mem)->bo); -allocate_mem_pin_bo_failed: - radeon_bo_unreserve((*mem)->bo); -allocate_mem_reserve_bo_failed: - radeon_bo_unref(&(*mem)->bo); - - return r; -} - -static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) -{ - struct kgd_mem *mem = (struct kgd_mem *) mem_obj; - - BUG_ON(mem == NULL); - - radeon_bo_reserve(mem->bo, true); - radeon_bo_kunmap(mem->bo); - radeon_bo_unpin(mem->bo); - radeon_bo_unreserve(mem->bo); - radeon_bo_unref(&(mem->bo)); - kfree(mem); -} - -static uint64_t get_vmem_size(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - BUG_ON(kgd == NULL); - - return rdev->mc.real_vram_size; -} - -static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - return rdev->asic->get_gpu_clock_counter(rdev); -} - -static uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = (struct radeon_device *)kgd; - - /* The sclk is in quantas of 10kHz */ - return rdev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; -} - -/* - * PASID manager - */ -static DEFINE_IDA(pasid_ida); - -static int alloc_pasid(unsigned int bits) -{ - int pasid = -EINVAL; - - for (bits = min(bits, 31U); bits > 0; bits--) { - pasid = ida_simple_get(&pasid_ida, - 1U << (bits - 1), 1U << bits, - GFP_KERNEL); - if (pasid != -ENOSPC) - break; - } - - return pasid; -} - -static void free_pasid(unsigned int pasid) -{ - ida_simple_remove(&pasid_ida, pasid); -} - -static inline struct radeon_device *get_radeon_device(struct kgd_dev *kgd) -{ - return (struct radeon_device *)kgd; -} - -static void write_register(struct kgd_dev *kgd, uint32_t offset, uint32_t value) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - writel(value, (void __iomem *)(rdev->rmmio + offset)); -} - -static uint32_t read_register(struct kgd_dev *kgd, uint32_t offset) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - return readl((void __iomem *)(rdev->rmmio + offset)); -} - -static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, - uint32_t queue, uint32_t vmid) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - uint32_t value = PIPEID(pipe) | MEID(mec) | VMID(vmid) | QUEUEID(queue); - - mutex_lock(&rdev->srbm_mutex); - write_register(kgd, SRBM_GFX_CNTL, value); -} - -static void 
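alloc_pasid() above is a compact shrinking-range ID search: it first tries the widest requested power-of-two window, [2^(bits-1), 2^bits), and halves the window until the allocator finds a free ID or fails with something other than "range full". The control flow, with a hypothetical stand-in for ida_simple_get():

#include <errno.h>

static int alloc_pasid_sketch(unsigned int bits,
                              int (*get_id)(unsigned int lo, unsigned int hi))
{
        int pasid = -EINVAL;

        if (bits > 31)
                bits = 31;
        for (; bits > 0; bits--) {
                pasid = get_id(1u << (bits - 1), 1u << bits);
                if (pasid != -ENOSPC)
                        break;  /* found one, or a hard error */
        }
        return pasid;
}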
unlock_srbm(struct kgd_dev *kgd) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - - write_register(kgd, SRBM_GFX_CNTL, 0); - mutex_unlock(&rdev->srbm_mutex); -} - -static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t queue_id) -{ - uint32_t mec = (++pipe_id / CIK_PIPE_PER_MEC) + 1; - uint32_t pipe = (pipe_id % CIK_PIPE_PER_MEC); - - lock_srbm(kgd, mec, pipe, queue_id, 0); -} - -static void release_queue(struct kgd_dev *kgd) -{ - unlock_srbm(kgd); -} - -static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, - uint32_t sh_mem_config, - uint32_t sh_mem_ape1_base, - uint32_t sh_mem_ape1_limit, - uint32_t sh_mem_bases) -{ - lock_srbm(kgd, 0, 0, 0, vmid); - - write_register(kgd, SH_MEM_CONFIG, sh_mem_config); - write_register(kgd, SH_MEM_APE1_BASE, sh_mem_ape1_base); - write_register(kgd, SH_MEM_APE1_LIMIT, sh_mem_ape1_limit); - write_register(kgd, SH_MEM_BASES, sh_mem_bases); - - unlock_srbm(kgd); -} - -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, - unsigned int vmid) -{ - /* - * We have to assume that there is no outstanding mapping. - * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 - * because a mapping is in progress or because a mapping finished and - * the SW cleared it. - * So the protocol is to always wait & clear. - */ - uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | - ATC_VMID_PASID_MAPPING_VALID_MASK; - - write_register(kgd, ATC_VMID0_PASID_MAPPING + vmid*sizeof(uint32_t), - pasid_mapping); - - while (!(read_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS) & - (1U << vmid))) - cpu_relax(); - write_register(kgd, ATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid); - - /* Mapping vmid to pasid also for IH block */ - write_register(kgd, IH_VMID_0_LUT + vmid * sizeof(uint32_t), - pasid_mapping); - - return 0; -} - -static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, - uint32_t hpd_size, uint64_t hpd_gpu_addr) -{ - /* nothing to do here */ - return 0; -} - -static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) -{ - uint32_t mec; - uint32_t pipe; - - mec = (pipe_id / CIK_PIPE_PER_MEC) + 1; - pipe = (pipe_id % CIK_PIPE_PER_MEC); - - lock_srbm(kgd, mec, pipe, 0, 0); - - write_register(kgd, CPC_INT_CNTL, - TIME_STAMP_INT_ENABLE | OPCODE_ERROR_INT_ENABLE); - - unlock_srbm(kgd); - - return 0; -} - -static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) -{ - uint32_t retval; - - retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + - m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; - - pr_debug("kfd: sdma base address: 0x%x\n", retval); - - return retval; -} - -static inline struct cik_mqd *get_mqd(void *mqd) -{ - return (struct cik_mqd *)mqd; -} - -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - return (struct cik_sdma_rlc_registers *)mqd; -} - -static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, - uint32_t queue_id, uint32_t __user *wptr, - uint32_t wptr_shift, uint32_t wptr_mask, - struct mm_struct *mm) -{ - uint32_t wptr_shadow, is_wptr_shadow_valid; - struct cik_mqd *m; - - m = get_mqd(mqd); - - is_wptr_shadow_valid = !get_user(wptr_shadow, wptr); - - acquire_queue(kgd, pipe_id, queue_id); - write_register(kgd, CP_MQD_BASE_ADDR, m->cp_mqd_base_addr_lo); - write_register(kgd, CP_MQD_BASE_ADDR_HI, m->cp_mqd_base_addr_hi); - write_register(kgd, CP_MQD_CONTROL, m->cp_mqd_control); - - write_register(kgd, CP_HQD_PQ_BASE, m->cp_hqd_pq_base_lo); - write_register(kgd, CP_HQD_PQ_BASE_HI, 
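kgd_set_pasid_vmid_mapping() above documents its own protocol: the UPDATE_STATUS bit may be clear either because no update has landed yet or because software already consumed a previous one, so the only safe sequence is write the mapping, wait for the per-VMID status bit to latch, then write-one-to-clear it (the same value is mirrored into the IH block's LUT afterwards). A sketch with hypothetical register accessors; the VALID bit position is assumed for illustration:

#include <stdint.h>

#define PASID_MAPPING_VALID     (1u << 31)      /* assumed position */

static void set_pasid_mapping(unsigned int pasid, unsigned int vmid,
                              uint32_t mapping_reg, uint32_t status_reg,
                              void (*wr)(uint32_t reg, uint32_t val),
                              uint32_t (*rd)(uint32_t reg))
{
        uint32_t mapping = pasid ? (pasid | PASID_MAPPING_VALID) : 0;

        wr(mapping_reg + vmid * 4, mapping);    /* 1. publish the mapping */
        /* 2. wait for the latch; unbounded, as in the driver */
        while (!(rd(status_reg) & (1u << vmid)))
                ;
        wr(status_reg, 1u << vmid);             /* 3. write-1-to-clear */
}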
m->cp_hqd_pq_base_hi); - write_register(kgd, CP_HQD_PQ_CONTROL, m->cp_hqd_pq_control); - - write_register(kgd, CP_HQD_IB_CONTROL, m->cp_hqd_ib_control); - write_register(kgd, CP_HQD_IB_BASE_ADDR, m->cp_hqd_ib_base_addr_lo); - write_register(kgd, CP_HQD_IB_BASE_ADDR_HI, m->cp_hqd_ib_base_addr_hi); - - write_register(kgd, CP_HQD_IB_RPTR, m->cp_hqd_ib_rptr); - - write_register(kgd, CP_HQD_PERSISTENT_STATE, - m->cp_hqd_persistent_state); - write_register(kgd, CP_HQD_SEMA_CMD, m->cp_hqd_sema_cmd); - write_register(kgd, CP_HQD_MSG_TYPE, m->cp_hqd_msg_type); - - write_register(kgd, CP_HQD_ATOMIC0_PREOP_LO, - m->cp_hqd_atomic0_preop_lo); - - write_register(kgd, CP_HQD_ATOMIC0_PREOP_HI, - m->cp_hqd_atomic0_preop_hi); - - write_register(kgd, CP_HQD_ATOMIC1_PREOP_LO, - m->cp_hqd_atomic1_preop_lo); - - write_register(kgd, CP_HQD_ATOMIC1_PREOP_HI, - m->cp_hqd_atomic1_preop_hi); - - write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR, - m->cp_hqd_pq_rptr_report_addr_lo); - - write_register(kgd, CP_HQD_PQ_RPTR_REPORT_ADDR_HI, - m->cp_hqd_pq_rptr_report_addr_hi); - - write_register(kgd, CP_HQD_PQ_RPTR, m->cp_hqd_pq_rptr); - - write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR, - m->cp_hqd_pq_wptr_poll_addr_lo); - - write_register(kgd, CP_HQD_PQ_WPTR_POLL_ADDR_HI, - m->cp_hqd_pq_wptr_poll_addr_hi); - - write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL, - m->cp_hqd_pq_doorbell_control); - - write_register(kgd, CP_HQD_VMID, m->cp_hqd_vmid); - - write_register(kgd, CP_HQD_QUANTUM, m->cp_hqd_quantum); - - write_register(kgd, CP_HQD_PIPE_PRIORITY, m->cp_hqd_pipe_priority); - write_register(kgd, CP_HQD_QUEUE_PRIORITY, m->cp_hqd_queue_priority); - - write_register(kgd, CP_HQD_IQ_RPTR, m->cp_hqd_iq_rptr); - - if (is_wptr_shadow_valid) - write_register(kgd, CP_HQD_PQ_WPTR, wptr_shadow); - - write_register(kgd, CP_HQD_ACTIVE, m->cp_hqd_active); - release_queue(kgd); - - return 0; -} - -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_BASE, - m->sdma_rlc_rb_base); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_BASE_HI, - m->sdma_rlc_rb_base_hi); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_LO, - m->sdma_rlc_rb_rptr_addr_lo); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_HI, - m->sdma_rlc_rb_rptr_addr_hi); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - - write_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl); - - return 0; -} - -static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, - uint32_t pipe_id, uint32_t queue_id) -{ - uint32_t act; - bool retval = false; - uint32_t low, high; - - acquire_queue(kgd, pipe_id, queue_id); - act = read_register(kgd, CP_HQD_ACTIVE); - if (act) { - low = lower_32_bits(queue_address >> 8); - high = upper_32_bits(queue_address >> 8); - - if (low == read_register(kgd, CP_HQD_PQ_BASE) && - high == read_register(kgd, CP_HQD_PQ_BASE_HI)) - retval = true; - } - release_queue(kgd); - return retval; -} - -static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t sdma_rlc_rb_cntl; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - sdma_rlc_rb_cntl = 
read_register(kgd, - sdma_base_addr + SDMA0_RLC0_RB_CNTL); - - if (sdma_rlc_rb_cntl & SDMA_RB_ENABLE) - return true; - - return false; -} - -static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t reset_type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - uint32_t temp; - - acquire_queue(kgd, pipe_id, queue_id); - write_register(kgd, CP_HQD_PQ_DOORBELL_CONTROL, 0); - - write_register(kgd, CP_HQD_DEQUEUE_REQUEST, reset_type); - - while (true) { - temp = read_register(kgd, CP_HQD_ACTIVE); - if (temp & 0x1) - break; - if (timeout == 0) { - pr_err("kfd: cp queue preemption time out (%dms)\n", - temp); - release_queue(kgd); - return -ETIME; - } - msleep(20); - timeout -= 20; - } - - release_queue(kgd); - return 0; -} - -static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, - unsigned int timeout) -{ - struct cik_sdma_rlc_registers *m; - uint32_t sdma_base_addr; - uint32_t temp; - - m = get_sdma_mqd(mqd); - sdma_base_addr = get_sdma_base_addr(m); - - temp = read_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL); - temp = temp & ~SDMA_RB_ENABLE; - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL, temp); - - while (true) { - temp = read_register(kgd, sdma_base_addr + - SDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA_RLC_IDLE) - break; - if (timeout == 0) - return -ETIME; - msleep(20); - timeout -= 20; - } - - write_register(kgd, sdma_base_addr + SDMA0_RLC0_DOORBELL, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_RPTR, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_WPTR, 0); - write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_BASE, 0); - - return 0; -} - -static int kgd_address_watch_disable(struct kgd_dev *kgd) -{ - union TCP_WATCH_CNTL_BITS cntl; - unsigned int i; - - cntl.u32All = 0; - - cntl.bitfields.valid = 0; - cntl.bitfields.mask = ADDRESS_WATCH_REG_CNTL_DEFAULT_MASK; - cntl.bitfields.atc = 1; - - /* Turning off this address until we set all the registers */ - for (i = 0; i < MAX_WATCH_ADDRESSES; i++) - write_register(kgd, - watchRegs[i * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_address_watch_execute(struct kgd_dev *kgd, - unsigned int watch_point_id, - uint32_t cntl_val, - uint32_t addr_hi, - uint32_t addr_lo) -{ - union TCP_WATCH_CNTL_BITS cntl; - - cntl.u32All = cntl_val; - - /* Turning off this watch point until we set all the registers */ - cntl.bitfields.valid = 0; - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_HI], - addr_hi); - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_ADDR_LO], - addr_lo); - - /* Enable the watch point */ - cntl.bitfields.valid = 1; - - write_register(kgd, - watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + - ADDRESS_WATCH_REG_CNTL], - cntl.u32All); - - return 0; -} - -static int kgd_wave_control_execute(struct kgd_dev *kgd, - uint32_t gfx_index_val, - uint32_t sq_cmd) -{ - struct radeon_device *rdev = get_radeon_device(kgd); - uint32_t data; - - mutex_lock(&rdev->grbm_idx_mutex); - - write_register(kgd, GRBM_GFX_INDEX, gfx_index_val); - write_register(kgd, SQ_CMD, sq_cmd); - - /* Restore the GRBM_GFX_INDEX register */ - - data = INSTANCE_BROADCAST_WRITES | SH_BROADCAST_WRITES | - SE_BROADCAST_WRITES; - - write_register(kgd, GRBM_GFX_INDEX, data); - - mutex_unlock(&rdev->grbm_idx_mutex); - - return 0; -} - -static 
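kgd_hqd_sdma_destroy() above follows the canonical ring-teardown order: clear the enable bit, poll the context status until the engine reports idle in 20 ms steps against the caller's timeout budget, and only then zero the doorbell and ring registers. The skeleton, with the hardware pokes abstracted into hypothetical callbacks:

#include <errno.h>
#include <stdbool.h>

static int sdma_destroy_sketch(unsigned int timeout_ms,
                               void (*stop_ring)(void),       /* clear RB_ENABLE */
                               bool (*ring_idle)(void),       /* SDMA_RLC_IDLE set? */
                               void (*clear_ring_regs)(void), /* doorbell/rptr/wptr/base */
                               void (*sleep_ms)(unsigned int))
{
        stop_ring();
        while (!ring_idle()) {
                if (timeout_ms < 20)
                        return -ETIME;
                sleep_ms(20);
                timeout_ms -= 20;
        }
        clear_ring_regs();
        return 0;
}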
uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, - unsigned int watch_point_id, - unsigned int reg_offset) -{ - return watchRegs[watch_point_id * ADDRESS_WATCH_REG_MAX + reg_offset] - / 4; -} - -static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, uint8_t vmid) -{ - uint32_t reg; - struct radeon_device *rdev = (struct radeon_device *) kgd; - - reg = RREG32(ATC_VMID0_PASID_MAPPING + vmid*4); - return reg & ATC_VMID_PASID_MAPPING_VALID_MASK; -} - -static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, - uint8_t vmid) -{ - uint32_t reg; - struct radeon_device *rdev = (struct radeon_device *) kgd; - - reg = RREG32(ATC_VMID0_PASID_MAPPING + vmid*4); - return reg & ATC_VMID_PASID_MAPPING_PASID_MASK; -} - -static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) -{ - struct radeon_device *rdev = (struct radeon_device *) kgd; - - return WREG32(VM_INVALIDATE_REQUEST, 1 << vmid); -} - -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) -{ - struct radeon_device *rdev = (struct radeon_device *) kgd; - const union radeon_firmware_header *hdr; - - BUG_ON(kgd == NULL || rdev->mec_fw == NULL); - - switch (type) { - case KGD_ENGINE_PFP: - hdr = (const union radeon_firmware_header *) rdev->pfp_fw->data; - break; - - case KGD_ENGINE_ME: - hdr = (const union radeon_firmware_header *) rdev->me_fw->data; - break; - - case KGD_ENGINE_CE: - hdr = (const union radeon_firmware_header *) rdev->ce_fw->data; - break; - - case KGD_ENGINE_MEC1: - hdr = (const union radeon_firmware_header *) rdev->mec_fw->data; - break; - - case KGD_ENGINE_MEC2: - hdr = (const union radeon_firmware_header *) - rdev->mec2_fw->data; - break; - - case KGD_ENGINE_RLC: - hdr = (const union radeon_firmware_header *) rdev->rlc_fw->data; - break; - - case KGD_ENGINE_SDMA1: - case KGD_ENGINE_SDMA2: - hdr = (const union radeon_firmware_header *) - rdev->sdma_fw->data; - break; - - default: - return 0; - } - - if (hdr == NULL) - return 0; - - /* Only 12 bit in use*/ - return hdr->common.ucode_version; -} diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index dfee8f7d94ae..cde037f213d7 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -34,8 +34,6 @@ #include <linux/slab.h> #include <linux/pm_runtime.h> -#include "radeon_kfd.h" - #if defined(CONFIG_VGA_SWITCHEROO) bool radeon_has_atpx(void); #else @@ -68,8 +66,6 @@ void radeon_driver_unload_kms(struct drm_device *dev) pm_runtime_forbid(dev->dev); } - radeon_kfd_device_fini(rdev); - radeon_acpi_fini(rdev); radeon_modeset_fini(rdev); @@ -174,9 +170,6 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) "Error during ACPI methods call\n"); } - radeon_kfd_device_probe(rdev); - radeon_kfd_device_init(rdev); - if (radeon_is_px(dev)) { pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); diff --git a/drivers/gpu/drm/rockchip/Kconfig b/drivers/gpu/drm/rockchip/Kconfig index 3c70c6224bd2..0ccc76217ee4 100644 --- a/drivers/gpu/drm/rockchip/Kconfig +++ b/drivers/gpu/drm/rockchip/Kconfig @@ -60,7 +60,7 @@ config ROCKCHIP_INNO_HDMI config ROCKCHIP_LVDS bool "Rockchip LVDS support" depends on DRM_ROCKCHIP - depends on PINCTRL + depends on PINCTRL && OF help Choose this option to enable support for Rockchip LVDS controllers. 
Rockchip rk3288 SoC has LVDS TX Controller can be used, and it diff --git a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c index 4d3f6ad0abdd..93b7102dd008 100644 --- a/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c +++ b/drivers/gpu/drm/rockchip/analogix_dp-rockchip.c @@ -72,7 +72,7 @@ struct rockchip_dp_device { struct reset_control *rst; struct work_struct psr_work; - spinlock_t psr_lock; + struct mutex psr_lock; unsigned int psr_state; const struct rockchip_dp_chip_data *data; @@ -83,21 +83,20 @@ struct rockchip_dp_device { static void analogix_dp_psr_set(struct drm_encoder *encoder, bool enabled) { struct rockchip_dp_device *dp = to_dp(encoder); - unsigned long flags; if (!analogix_dp_psr_supported(dp->dev)) return; DRM_DEV_DEBUG(dp->dev, "%s PSR...\n", enabled ? "Entry" : "Exit"); - spin_lock_irqsave(&dp->psr_lock, flags); + mutex_lock(&dp->psr_lock); if (enabled) dp->psr_state = EDP_VSC_PSR_STATE_ACTIVE; else dp->psr_state = ~EDP_VSC_PSR_STATE_ACTIVE; schedule_work(&dp->psr_work); - spin_unlock_irqrestore(&dp->psr_lock, flags); + mutex_unlock(&dp->psr_lock); } static void analogix_dp_psr_work(struct work_struct *work) @@ -105,7 +104,6 @@ static void analogix_dp_psr_work(struct work_struct *work) struct rockchip_dp_device *dp = container_of(work, typeof(*dp), psr_work); int ret; - unsigned long flags; ret = rockchip_drm_wait_vact_end(dp->encoder.crtc, PSR_WAIT_LINE_FLAG_TIMEOUT_MS); @@ -114,12 +112,12 @@ static void analogix_dp_psr_work(struct work_struct *work) return; } - spin_lock_irqsave(&dp->psr_lock, flags); + mutex_lock(&dp->psr_lock); if (dp->psr_state == EDP_VSC_PSR_STATE_ACTIVE) analogix_dp_enable_psr(dp->dev); else analogix_dp_disable_psr(dp->dev); - spin_unlock_irqrestore(&dp->psr_lock, flags); + mutex_unlock(&dp->psr_lock); } static int rockchip_dp_pre_init(struct rockchip_dp_device *dp) @@ -381,7 +379,7 @@ static int rockchip_dp_bind(struct device *dev, struct device *master, dp->plat_data.power_off = rockchip_dp_powerdown; dp->plat_data.get_modes = rockchip_dp_get_modes; - spin_lock_init(&dp->psr_lock); + mutex_init(&dp->psr_lock); dp->psr_state = ~EDP_VSC_PSR_STATE_ACTIVE; INIT_WORK(&dp->psr_work, analogix_dp_psr_work); diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index d79607a1187c..c088703777e2 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -150,8 +150,7 @@ static void ttm_bo_release_list(struct kref *list_kref) ttm_tt_destroy(bo->ttm); atomic_dec(&bo->glob->bo_count); dma_fence_put(bo->moving); - if (bo->resv == &bo->ttm_resv) - reservation_object_fini(&bo->ttm_resv); + reservation_object_fini(&bo->ttm_resv); mutex_destroy(&bo->wu_mutex); if (bo->destroy) bo->destroy(bo); @@ -402,14 +401,11 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo) if (bo->resv == &bo->ttm_resv) return 0; - reservation_object_init(&bo->ttm_resv); BUG_ON(!reservation_object_trylock(&bo->ttm_resv)); r = reservation_object_copy_fences(&bo->ttm_resv, bo->resv); - if (r) { + if (r) reservation_object_unlock(&bo->ttm_resv); - reservation_object_fini(&bo->ttm_resv); - } return r; } @@ -459,6 +455,7 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) spin_unlock(&glob->lru_lock); if (bo->resv != &bo->ttm_resv) reservation_object_unlock(&bo->ttm_resv); + ttm_bo_cleanup_memtype_use(bo); return; } @@ -557,8 +554,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, } ttm_bo_del_from_lru(bo); - if (!list_empty(&bo->ddestroy) && 
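The analogix_dp change above swaps the PSR spinlock for a mutex. Both lock sites run in process context (the encoder callback and the worker), and the worker ends up in analogix_dp_enable_psr()/analogix_dp_disable_psr(), which talk to the panel and may sleep, so a sleeping lock is the appropriate primitive and the irqsave dance was never needed. The resulting shape, sketched:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(psr_lock);          /* was: spinlock_t + irqsave */
static unsigned int psr_state;
static struct work_struct psr_work;

static void psr_set(unsigned int state)
{
        mutex_lock(&psr_lock);
        psr_state = state;
        schedule_work(&psr_work);       /* the worker takes psr_lock too
                                         * and may sleep while holding it */
        mutex_unlock(&psr_lock);
}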
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index d79607a1187c..c088703777e2 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -150,8 +150,7 @@ static void ttm_bo_release_list(struct kref *list_kref)
 	ttm_tt_destroy(bo->ttm);
 	atomic_dec(&bo->glob->bo_count);
 	dma_fence_put(bo->moving);
-	if (bo->resv == &bo->ttm_resv)
-		reservation_object_fini(&bo->ttm_resv);
+	reservation_object_fini(&bo->ttm_resv);
 	mutex_destroy(&bo->wu_mutex);
 	if (bo->destroy)
 		bo->destroy(bo);
@@ -402,14 +401,11 @@ static int ttm_bo_individualize_resv(struct ttm_buffer_object *bo)
 	if (bo->resv == &bo->ttm_resv)
 		return 0;
 
-	reservation_object_init(&bo->ttm_resv);
 	BUG_ON(!reservation_object_trylock(&bo->ttm_resv));
 
 	r = reservation_object_copy_fences(&bo->ttm_resv, bo->resv);
-	if (r) {
+	if (r)
 		reservation_object_unlock(&bo->ttm_resv);
-		reservation_object_fini(&bo->ttm_resv);
-	}
 
 	return r;
 }
@@ -459,6 +455,7 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo)
 			spin_unlock(&glob->lru_lock);
 			if (bo->resv != &bo->ttm_resv)
 				reservation_object_unlock(&bo->ttm_resv);
+
 			ttm_bo_cleanup_memtype_use(bo);
 			return;
 		}
@@ -557,8 +554,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo,
 	}
 
 	ttm_bo_del_from_lru(bo);
-	if (!list_empty(&bo->ddestroy) && (bo->resv != &bo->ttm_resv))
-		reservation_object_fini(&bo->ttm_resv);
 	list_del_init(&bo->ddestroy);
 	kref_put(&bo->list_kref, ttm_bo_ref_bug);
 
@@ -1207,8 +1202,8 @@ int ttm_bo_init_reserved(struct ttm_bo_device *bdev,
 		lockdep_assert_held(&bo->resv->lock.base);
 	} else {
 		bo->resv = &bo->ttm_resv;
-		reservation_object_init(&bo->ttm_resv);
 	}
+	reservation_object_init(&bo->ttm_resv);
 	atomic_inc(&bo->glob->bo_count);
 	drm_vma_node_reset(&bo->vma_node);
 	bo->priority = 0;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 78cb99be7146..e7a519f1849b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -474,6 +474,7 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo,
 	INIT_LIST_HEAD(&fbo->lru);
 	INIT_LIST_HEAD(&fbo->swap);
 	INIT_LIST_HEAD(&fbo->io_reserve_lru);
+	mutex_init(&fbo->wu_mutex);
 	fbo->moving = NULL;
 	drm_vma_node_reset(&fbo->vma_node);
 	atomic_set(&fbo->cpu_writers, 0);
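Taken together, the ttm_bo.c hunks make the embedded reservation object's lifetime unconditional: ttm_bo_init_reserved() now initializes bo->ttm_resv whether or not it is the resv actually in use, ttm_bo_release_list() finalizes it unconditionally, and the individualize/cleanup paths drop their conditional init/fini. The ttm_bo_util.c hunk applies the same always-initialize rule to the transfer ("ghost") object's wu_mutex, which release-path code such as mutex_destroy() otherwise operates on uninitialized. A standalone sketch of the invariant, in plain C with pthreads and hypothetical names (obj, embedded, cur), not TTM's own code:

/*
 * Sketch only: initialize the embedded member exactly once at
 * creation and finalize it exactly once at destruction, whether or
 * not it was ever the member in use.
 */
#include <stddef.h>
#include <pthread.h>

struct obj {
	pthread_mutex_t embedded;	/* plays the role of bo->ttm_resv */
	pthread_mutex_t *cur;		/* plays the role of bo->resv */
};

static void obj_init(struct obj *o, pthread_mutex_t *external)
{
	/* point at the caller's lock if one was supplied ... */
	o->cur = external ? external : &o->embedded;
	/* ... but initialize the embedded one unconditionally */
	pthread_mutex_init(&o->embedded, NULL);
}

static void obj_fini(struct obj *o)
{
	/* unconditional: pairs with the unconditional init above */
	pthread_mutex_destroy(&o->embedded);
}

The payoff is that intermediate states stop mattering: every teardown path runs the same fini, so no caller has to remember which resv the BO ended up using.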
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 4d688c8d7853..316f831ad5f0 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -329,7 +329,7 @@ static int ttm_page_pool_free(struct ttm_page_pool *pool, unsigned nr_free,
 	pages_to_free = kmalloc(npages_to_free * sizeof(struct page *),
 			GFP_KERNEL);
 	if (!pages_to_free) {
-		pr_err("Failed to allocate memory for pool free operation\n");
+		pr_debug("Failed to allocate memory for pool free operation\n");
 		return 0;
 	}
 
@@ -517,7 +517,7 @@ static int ttm_alloc_new_pages(struct list_head *pages, gfp_t gfp_flags,
 	caching_array = kmalloc(max_cpages*sizeof(struct page *), GFP_KERNEL);
 
 	if (!caching_array) {
-		pr_err("Unable to allocate table for new pages\n");
+		pr_debug("Unable to allocate table for new pages\n");
 		return -ENOMEM;
 	}
 
@@ -525,7 +525,7 @@ static int ttm_alloc_new_pages(struct list_head *pages, gfp_t gfp_flags,
 		p = alloc_pages(gfp_flags, order);
 
 		if (!p) {
-			pr_err("Unable to get page %u\n", i);
+			pr_debug("Unable to get page %u\n", i);
 
 			/* store already allocated pages in the pool after
 			 * setting the caching state */
@@ -625,7 +625,7 @@ static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool, int ttm_flags,
 			++pool->nrefills;
 			pool->npages += alloc_size;
 		} else {
-			pr_err("Failed to fill pool (%p)\n", pool);
+			pr_debug("Failed to fill pool (%p)\n", pool);
 			/* If we have any pages left put them to the pool. */
 			list_for_each_entry(p, &new_pages, lru) {
 				++cpages;
@@ -885,8 +885,7 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		while (npages) {
 			p = alloc_page(gfp_flags);
 			if (!p) {
-
-				pr_err("Unable to allocate page\n");
+				pr_debug("Unable to allocate page\n");
 				return -ENOMEM;
 			}
 
@@ -925,7 +924,7 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		/* If there is any pages in the list put them back to
 		 * the pool.
 		 */
-		pr_err("Failed to allocate extra pages for large request\n");
+		pr_debug("Failed to allocate extra pages for large request\n");
 		ttm_put_pages(pages, count, flags, cstate);
 		return r;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 96ad12906621..6b2627fe9bc1 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -463,7 +463,7 @@ static unsigned ttm_dma_page_pool_free(struct dma_pool *pool, unsigned nr_free,
 			GFP_KERNEL);
 
 	if (!pages_to_free) {
-		pr_err("%s: Failed to allocate memory for pool free operation\n",
+		pr_debug("%s: Failed to allocate memory for pool free operation\n",
 			pool->dev_name);
 		return 0;
 	}
@@ -755,7 +755,7 @@ static int ttm_dma_pool_alloc_new_pages(struct dma_pool *pool,
 	caching_array = kmalloc(max_cpages*sizeof(struct page *), GFP_KERNEL);
 	if (!caching_array) {
-		pr_err("%s: Unable to allocate table for new pages\n",
+		pr_debug("%s: Unable to allocate table for new pages\n",
 			pool->dev_name);
 		return -ENOMEM;
 	}
 
@@ -768,8 +768,8 @@ static int ttm_dma_pool_alloc_new_pages(struct dma_pool *pool,
 	for (i = 0, cpages = 0; i < count; ++i) {
 		dma_p = __ttm_dma_alloc_page(pool);
 		if (!dma_p) {
-			pr_err("%s: Unable to get page %u\n",
-			       pool->dev_name, i);
+			pr_debug("%s: Unable to get page %u\n",
+				 pool->dev_name, i);
 
 			/* store already allocated pages in the pool after
 			 * setting the caching state */
@@ -855,8 +855,8 @@ static int ttm_dma_page_pool_fill_locked(struct dma_pool *pool,
 		struct dma_page *d_page;
 		unsigned cpages = 0;
 
-		pr_err("%s: Failed to fill %s pool (r:%d)!\n",
-		       pool->dev_name, pool->name, r);
+		pr_debug("%s: Failed to fill %s pool (r:%d)!\n",
+			 pool->dev_name, pool->name, r);
 
 		list_for_each_entry(d_page, &d_pages, page_list) {
 			cpages++;
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 01a53ba304f8..98a6cb9f44fc 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -88,11 +88,11 @@ int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
 
 	mutex_lock(&vc4->purgeable.lock);
 	if (vc4->purgeable.num)
-		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "userspace BO cache",
+		seq_printf(m, "%30s: %6zdkb BOs (%d)\n", "userspace BO cache",
 			   vc4->purgeable.size / 1024, vc4->purgeable.num);
 
 	if (vc4->purgeable.purged_num)
-		seq_printf(m, "%30s: %6dkb BOs (%d)\n", "total purged BO",
+		seq_printf(m, "%30s: %6zdkb BOs (%d)\n", "total purged BO",
 			   vc4->purgeable.purged_size / 1024,
 			   vc4->purgeable.purged_num);
 	mutex_unlock(&vc4->purgeable.lock);
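Two recurring fixes close out this range. The ttm_page_alloc*.c hunks demote allocation-failure messages from pr_err() to pr_debug(): the callers already propagate -ENOMEM or fall back to the pool, the page allocator prints its own warning for non-__GFP_NOWARN failures, and under memory pressure the old errors only flooded the log, while pr_debug() keeps the strings reachable through dynamic debug. The vc4_bo.c hunks fix a printf-style mismatch: purgeable.size and purgeable.purged_size are size_t, so they need the z length modifier rather than plain %d, which is undefined (and truncates) wherever size_t is wider than int. A userspace sketch of the same rule (hypothetical variable names):

/*
 * Sketch only: size_t takes the z length modifier (%zu unsigned,
 * %zd for the signed ssize_t); plain %d is an int and mismatches
 * size_t on 64-bit targets.
 */
#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t cached_bytes = 37 * 4096;	/* arbitrary example value */

	/* printf("%6dkb\n", cached_bytes / 1024);  wrong: %d vs. size_t */
	printf("%6zukb\n", cached_bytes / 1024);	/* matches size_t */
	return 0;
}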