22 files changed, 407 insertions, 212 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index 3858820a0055..fbf0ee5201c3 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -3,7 +3,7 @@
 #
 
 config HSA_AMD
-	tristate "HSA kernel driver for AMD GPU devices"
+	bool "HSA kernel driver for AMD GPU devices"
 	depends on DRM_AMDGPU && X86_64
 	imply AMD_IOMMU_V2
 	select MMU_NOTIFIER
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index ffd096fffc1c..69ec96998bb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -23,26 +23,41 @@
 # Makefile for Heterogenous System Architecture support for AMD GPU devices
 #
 
-ccflags-y := -Idrivers/gpu/drm/amd/include/  \
-		-Idrivers/gpu/drm/amd/include/asic_reg
-
-amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
-		kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
-		kfd_process.o kfd_queue.o kfd_mqd_manager.o \
-		kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
-		kfd_mqd_manager_v9.o \
-		kfd_kernel_queue.o kfd_kernel_queue_cik.o \
-		kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
-		kfd_packet_manager.o kfd_process_queue_manager.o \
-		kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
-		kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
-		kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-		kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
+		$(AMDKFD_PATH)/kfd_device.o \
+		$(AMDKFD_PATH)/kfd_chardev.o \
+		$(AMDKFD_PATH)/kfd_topology.o \
+		$(AMDKFD_PATH)/kfd_pasid.o \
+		$(AMDKFD_PATH)/kfd_doorbell.o \
+		$(AMDKFD_PATH)/kfd_flat_memory.o \
+		$(AMDKFD_PATH)/kfd_process.o \
+		$(AMDKFD_PATH)/kfd_queue.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+		$(AMDKFD_PATH)/kfd_packet_manager.o \
+		$(AMDKFD_PATH)/kfd_process_queue_manager.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_interrupt.o \
+		$(AMDKFD_PATH)/kfd_events.o \
+		$(AMDKFD_PATH)/cik_event_interrupt.o \
+		$(AMDKFD_PATH)/kfd_int_process_v9.o \
+		$(AMDKFD_PATH)/kfd_dbgdev.o \
+		$(AMDKFD_PATH)/kfd_dbgmgr.o \
+		$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
-amdkfd-y += kfd_iommu.o
+AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
 endif
 
-amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o
-
-obj-$(CONFIG_HSA_AMD)	+= amdkfd.o
+ifneq ($(CONFIG_DEBUG_FS),)
+AMDKFD_FILES += $(AMDKFD_PATH)/kfd_debugfs.o
+endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 297b36c26a05..14d5b5fa822d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -447,6 +447,24 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 	return retval;
 }
 
+static int kfd_ioctl_get_queue_wave_state(struct file *filep,
+					  struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_get_queue_wave_state_args *args = data;
+	int r;
+
+	mutex_lock(&p->mutex);
+
+	r = pqm_get_wave_state(&p->pqm, args->queue_id,
+			       (void __user *)args->ctl_stack_address,
+			       &args->ctl_stack_used_size,
+			       &args->save_area_used_size);
+
+	mutex_unlock(&p->mutex);
+
+	return r;
+}
+
 static int kfd_ioctl_set_memory_policy(struct file *filep,
 					struct kfd_process *p, void *data)
 {
@@ -1210,7 +1228,7 @@ err_unlock:
 	return ret;
 }
 
-static bool kfd_dev_is_large_bar(struct kfd_dev *dev)
+bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
 	struct kfd_local_mem_info mem_info;
 
@@ -1615,6 +1633,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
 			kfd_ioctl_set_cu_mask, 0),
 
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
+			kfd_ioctl_get_queue_wave_state, 0)
+
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index ee4996029a86..56412b0e7e1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -346,15 +346,15 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 					struct list_head *device_list)
 {
 	struct kfd_iolink_properties *props = NULL, *props2;
-	struct kfd_topology_device *dev, *cpu_dev;
+	struct kfd_topology_device *dev, *to_dev;
 	uint32_t id_from;
 	uint32_t id_to;
 
 	id_from = iolink->proximity_domain_from;
 	id_to = iolink->proximity_domain_to;
 
-	pr_debug("Found IO link entry in CRAT table with id_from=%d\n",
-			id_from);
+	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
+			id_from, id_to);
 	list_for_each_entry(dev, device_list, list) {
 		if (id_from == dev->proximity_domain) {
 			props = kfd_alloc_struct(props);
@@ -369,6 +369,8 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 
 			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
 				props->weight = 20;
+			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
+				props->weight = 15;
 			else
 				props->weight = node_distance(id_from, id_to);
 
@@ -389,20 +391,23 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 	/* CPU topology is created before GPUs are detected, so CPU->GPU
 	 * links are not built at that time. If a PCIe type is discovered, it
 	 * means a GPU is detected and we are adding GPU->CPU to the topology.
-	 * At this time, also add the corresponded CPU->GPU link.
+	 * At this time, also add the corresponded CPU->GPU link if GPU
+	 * is large bar.
+	 * For xGMI, we only added the link with one direction in the crat
+	 * table, add corresponded reversed direction link now.
 	 */
-	if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
-		cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
-		if (!cpu_dev)
+	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
+		to_dev = kfd_topology_device_by_proximity_domain(id_to);
+		if (!to_dev)
 			return -ENODEV;
 		/* same everything but the other direction */
 		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
 		props2->node_from = id_to;
 		props2->node_to = id_from;
 		props2->kobj = NULL;
-		cpu_dev->io_link_count++;
-		cpu_dev->node_props.io_links_count++;
-		list_add_tail(&props2->list, &cpu_dev->io_link_props);
+		to_dev->io_link_count++;
+		to_dev->node_props.io_links_count++;
+		list_add_tail(&props2->list, &to_dev->io_link_props);
 	}
 
 	return 0;
@@ -642,6 +647,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 		pcache_info = vega10_cache_info;
 		num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
 		break;
@@ -1037,7 +1043,7 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
  *
  *	Return 0 if successful else return -ve value
  */
-static int kfd_fill_gpu_direct_io_link(int *avail_size,
+static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
 			struct kfd_dev *kdev,
 			struct crat_subtype_iolink *sub_type_hdr,
 			uint32_t proximity_domain)
@@ -1052,6 +1058,8 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
 	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
 	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
 	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+	if (kfd_dev_is_large_bar(kdev))
+		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
 
 	/* Fill in IOLINK subtype.
 	 * TODO: Fill-in other fields of iolink subtype
@@ -1069,6 +1077,29 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
 	return 0;
 }
 
+static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
+			struct kfd_dev *kdev,
+			struct crat_subtype_iolink *sub_type_hdr,
+			uint32_t proximity_domain_from,
+			uint32_t proximity_domain_to)
+{
+	*avail_size -= sizeof(struct crat_subtype_iolink);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
+			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
+
+	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
+	sub_type_hdr->proximity_domain_from = proximity_domain_from;
+	sub_type_hdr->proximity_domain_to = proximity_domain_to;
+	return 0;
+}
+
 /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU
  *
  *	@pcrat_image: Fill in VCRAT for GPU
@@ -1081,14 +1112,16 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 {
 	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
 	struct crat_subtype_generic *sub_type_hdr;
+	struct kfd_local_mem_info local_mem_info;
+	struct kfd_topology_device *peer_dev;
 	struct crat_subtype_computeunit *cu;
 	struct kfd_cu_info cu_info;
 	int avail_size = *size;
 	uint32_t total_num_of_cu;
 	int num_of_cache_entries = 0;
 	int cache_mem_filled = 0;
+	uint32_t nid = 0;
 	int ret = 0;
-	struct kfd_local_mem_info local_mem_info;
 
 	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
 		return -EINVAL;
@@ -1212,7 +1245,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	 */
 	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
 		cache_mem_filled);
-	ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
 		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
 
 	if (ret < 0)
@@ -1221,6 +1254,35 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	crat_table->length += sub_type_hdr->length;
 	crat_table->total_entries++;
 
+
+	/* Fill in Subtype: IO_LINKS
+	 * Direct links from GPU to other GPUs through xGMI.
+	 * We will loop GPUs that already be processed (with lower value
+	 * of proximity_domain), add the link for the GPUs with same
+	 * hive id (from this GPU to other GPU) . The reversed iolink
+	 * (from other GPU to this GPU) will be added
+	 * in kfd_parse_subtype_iolink.
+	 */
+	if (kdev->hive_id) {
+		for (nid = 0; nid < proximity_domain; ++nid) {
+			peer_dev = kfd_topology_device_by_proximity_domain(nid);
+			if (!peer_dev->gpu)
+				continue;
+			if (peer_dev->gpu->hive_id != kdev->hive_id)
+				continue;
+			sub_type_hdr = (typeof(sub_type_hdr))(
+				(char *)sub_type_hdr +
+				sizeof(struct crat_subtype_iolink));
+			ret = kfd_fill_gpu_xgmi_link_to_gpu(
+				&avail_size, kdev,
+				(struct crat_subtype_iolink *)sub_type_hdr,
+				proximity_domain, nid);
+			if (ret < 0)
+				return ret;
+			crat_table->length += sub_type_hdr->length;
+			crat_table->total_entries++;
+		}
+	}
 	*size = crat_table->length;
 	pr_info("Virtual CRAT table created for GPU\n");
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index b5cd182b9edd..7c3f192fe25f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -232,7 +232,8 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1 << 2)
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1 << 3)
 #define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1 << 4)
-#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0xffffffe0
+#define CRAT_IOLINK_FLAGS_BI_DIRECTIONAL 	(1 << 31)
+#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0x7fffffe0
 
 /*
  * IO interface types
@@ -248,7 +249,12 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_TYPE_RAPID_IO	8
 #define CRAT_IOLINK_TYPE_INFINIBAND	9
 #define CRAT_IOLINK_TYPE_RESERVED3	10
-#define CRAT_IOLINK_TYPE_OTHER		11
+#define CRAT_IOLINK_TYPE_XGMI		11
+#define CRAT_IOLINK_TYPE_XGOP		12
+#define CRAT_IOLINK_TYPE_GZ		13
+#define CRAT_IOLINK_TYPE_ETHERNET_RDMA	14
+#define CRAT_IOLINK_TYPE_RDMA_OTHER	15
+#define CRAT_IOLINK_TYPE_OTHER		16
 #define CRAT_IOLINK_TYPE_MAX		255
 
 #define CRAT_IOLINK_RESERVED_LENGTH	24
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 29ac74f40dce..a9f18ea7e354 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -53,6 +53,7 @@ static const struct kfd_device_info kaveri_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info carrizo_device_info = {
@@ -69,6 +70,7 @@ static const struct kfd_device_info carrizo_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info raven_device_info = {
@@ -84,6 +86,7 @@ static const struct kfd_device_info raven_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 1,
+	.num_sdma_queues_per_engine = 2,
 };
 #endif
 
@@ -101,6 +104,7 @@ static const struct kfd_device_info hawaii_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info tonga_device_info = {
@@ -116,21 +120,7 @@ static const struct kfd_device_info tonga_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
-};
-
-static const struct kfd_device_info tonga_vf_device_info = {
-	.asic_family = CHIP_TONGA,
-	.max_pasid_bits = 16,
-	.max_no_of_hqd  = 24,
-	.doorbell_size  = 4,
-	.ih_ring_entry_size = 4 * sizeof(uint32_t),
-	.event_interrupt_class = &event_interrupt_class_cik,
-	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED,
-	.supports_cwsr = false,
-	.needs_iommu_device = false,
-	.needs_pci_atomics = false,
-	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_device_info = {
@@ -146,6 +136,7 @@ static const struct kfd_device_info fiji_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_vf_device_info = {
@@ -161,6 +152,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 
@@ -177,6 +169,7 @@ static const struct kfd_device_info polaris10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris10_vf_device_info = {
@@ -192,6 +185,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris11_device_info = {
@@ -207,6 +201,7 @@ static const struct kfd_device_info polaris11_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_device_info = {
@@ -222,6 +217,7 @@ static const struct kfd_device_info vega10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_vf_device_info = {
@@ -237,8 +233,24 @@ static const struct kfd_device_info vega10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
+static const struct kfd_device_info vega20_device_info = {
+	.asic_family = CHIP_VEGA20,
+	.max_pasid_bits = 16,
+	.max_no_of_hqd	= 24,
+	.doorbell_size	= 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
+	.needs_iommu_device = false,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 8,
+};
 
 struct kfd_deviceid {
 	unsigned short did;
@@ -293,7 +305,6 @@ static const struct kfd_deviceid supported_devices[] = {
 	{ 0x6928, &tonga_device_info },		/* Tonga */
 	{ 0x6929, &tonga_device_info },		/* Tonga */
 	{ 0x692B, &tonga_device_info },		/* Tonga */
-	{ 0x692F, &tonga_vf_device_info },	/* Tonga vf */
 	{ 0x6938, &tonga_device_info },		/* Tonga */
 	{ 0x6939, &tonga_device_info },		/* Tonga */
 	{ 0x7300, &fiji_device_info },		/* Fiji */
@@ -328,6 +339,12 @@ static const struct kfd_deviceid supported_devices[] = {
 	{ 0x6868, &vega10_device_info },	/* Vega10 */
 	{ 0x686C, &vega10_vf_device_info },	/* Vega10  vf*/
 	{ 0x687F, &vega10_device_info },	/* Vega10 */
+	{ 0x66a0, &vega20_device_info },	/* Vega20 */
+	{ 0x66a1, &vega20_device_info },	/* Vega20 */
+	{ 0x66a2, &vega20_device_info },	/* Vega20 */
+	{ 0x66a3, &vega20_device_info },	/* Vega20 */
+	{ 0x66a7, &vega20_device_info },	/* Vega20 */
+	{ 0x66af, &vega20_device_info }		/* Vega20 */
 };
 
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -366,6 +383,10 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 		return NULL;
 	}
 
+	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
+	if (!kfd)
+		return NULL;
+
 	/* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
 	 * 32 and 64-bit requests are possible and must be
 	 * supported.
@@ -377,12 +398,10 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 		dev_info(kfd_device,
 			 "skipped device %x:%x, PCI rejects atomics\n",
 			 pdev->vendor, pdev->device);
+		kfree(kfd);
 		return NULL;
-	}
-
-	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
-	if (!kfd)
-		return NULL;
+	} else if (!ret)
+		kfd->pci_atomic_requested = true;
 
 	kfd->kgd = kgd;
 	kfd->device_info = device_info;
@@ -419,6 +438,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 {
 	unsigned int size;
 
+	kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+			KGD_ENGINE_MEC1);
+	kfd->sdma_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+			KGD_ENGINE_SDMA1);
 	kfd->shared_resources = *gpu_resources;
 
 	kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
@@ -477,6 +500,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_doorbell_error;
 	}
 
+	if (kfd->kfd2kgd->get_hive_id)
+		kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
+
 	if (kfd_topology_add_device(kfd)) {
 		dev_err(kfd_device, "Error adding device to topology\n");
 		goto kfd_topology_add_device_error;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f22e745df51..a3b933967171 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -109,7 +109,7 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
 {
 	return dqm->dev->device_info->num_sdma_engines
-			* KFD_SDMA_QUEUES_PER_ENGINE;
+			* dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
 void program_sh_mem_settings(struct device_queue_manager *dqm,
@@ -667,7 +667,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	struct queue *q;
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
-	uint32_t pd_base;
+	uint64_t pd_base;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
@@ -687,7 +687,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 
 	/* Update PD Base in QPD */
 	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%08x\n", pd_base);
+	pr_debug("Updated PD address to 0x%llx\n", pd_base);
 
 	if (!list_empty(&qpd->queues_list)) {
 		dqm->dev->kfd2kgd->set_vm_context_page_table_base(
@@ -738,7 +738,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 {
 	struct queue *q;
 	struct kfd_process_device *pdd;
-	uint32_t pd_base;
+	uint64_t pd_base;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
@@ -758,7 +758,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 
 	/* Update PD Base in QPD */
 	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%08x\n", pd_base);
+	pr_debug("Updated PD address to 0x%llx\n", pd_base);
 
 	/* activate all active queues on the qpd */
 	list_for_each_entry(q, &qpd->queues_list, list) {
@@ -782,7 +782,7 @@ static int register_process(struct device_queue_manager *dqm,
 {
 	struct device_process_node *n;
 	struct kfd_process_device *pdd;
-	uint32_t pd_base;
+	uint64_t pd_base;
 	int retval;
 
 	n = kzalloc(sizeof(*n), GFP_KERNEL);
@@ -800,6 +800,7 @@ static int register_process(struct device_queue_manager *dqm,
 
 	/* Update PD Base in QPD */
 	qpd->page_table_base = pd_base;
+	pr_debug("Updated PD address to 0x%llx\n", pd_base);
 
 	retval = dqm->asic_ops.update_qpd(dqm, qpd);
 
@@ -1363,9 +1364,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 	struct mqd_manager *mqd_mgr;
-	bool preempt_all_queues;
-
-	preempt_all_queues = false;
 
 	retval = 0;
 
@@ -1549,6 +1547,41 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 	return retval;
 }
 
+static int get_wave_state(struct device_queue_manager *dqm,
+			  struct queue *q,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct mqd_manager *mqd;
+	int r;
+
+	dqm_lock(dqm);
+
+	if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE ||
+	    q->properties.is_active || !q->device->cwsr_enabled) {
+		r = -EINVAL;
+		goto dqm_unlock;
+	}
+
+	mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+	if (!mqd) {
+		r = -ENOMEM;
+		goto dqm_unlock;
+	}
+
+	if (!mqd->get_wave_state) {
+		r = -EINVAL;
+		goto dqm_unlock;
+	}
+
+	r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size,
+				save_area_used_size);
+
+dqm_unlock:
+	dqm_unlock(dqm);
+	return r;
+}
 
 static int process_termination_cpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
@@ -1670,6 +1703,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.process_termination = process_termination_cpsch;
 		dqm->ops.evict_process_queues = evict_process_queues_cpsch;
 		dqm->ops.restore_process_queues = restore_process_queues_cpsch;
+		dqm->ops.get_wave_state = get_wave_state;
 		break;
 	case KFD_SCHED_POLICY_NO_HWS:
 		/* initialize dqm for no cp scheduling */
@@ -1689,6 +1723,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.evict_process_queues = evict_process_queues_nocpsch;
 		dqm->ops.restore_process_queues =
 			restore_process_queues_nocpsch;
+		dqm->ops.get_wave_state = get_wave_state;
 		break;
 	default:
 		pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
@@ -1716,6 +1751,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		break;
 
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		device_queue_manager_init_v9(&dqm->asic_ops);
 		break;
@@ -1827,7 +1863,9 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 	}
 
 	for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
-		for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
+		for (queue = 0;
+		     queue < dqm->dev->device_info->num_sdma_queues_per_engine;
+		     queue++) {
 			r = dqm->dev->kfd2kgd->hqd_sdma_dump(
 				dqm->dev->kgd, pipe, queue, &dump, &n_regs);
 			if (r)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 00da3169a004..70e38a2e23b9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -33,7 +33,6 @@
 
 #define KFD_UNMAP_LATENCY_MS			(4000)
 #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
-#define KFD_SDMA_QUEUES_PER_ENGINE		(2)
 
 struct device_process_node {
 	struct qcm_process_device *qpd;
@@ -82,6 +81,8 @@ struct device_process_node {
  *
  * @restore_process_queues: Restore all evicted queues queues of a process
  *
+ * @get_wave_state: Retrieves context save state and optionally copies the
+ * control stack, if kept in the MQD, to the given userspace address.
  */
 
 struct device_queue_manager_ops {
@@ -137,6 +138,12 @@ struct device_queue_manager_ops {
 				    struct qcm_process_device *qpd);
 	int (*restore_process_queues)(struct device_queue_manager *dqm,
 				      struct qcm_process_device *qpd);
+
+	int	(*get_wave_state)(struct device_queue_manager *dqm,
+				  struct queue *q,
+				  void __user *ctl_stack,
+				  u32 *ctl_stack_used_size,
+				  u32 *save_area_used_size);
 };
 
 struct device_queue_manager_asic_ops {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 97d5423c5673..3d66cec414af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -400,6 +400,7 @@ int kfd_init_apertures(struct kfd_process *process)
 				kfd_init_apertures_vi(pdd, id);
 				break;
 			case CHIP_VEGA10:
+			case CHIP_VEGA20:
 			case CHIP_RAVEN:
 				kfd_init_apertures_v9(pdd, id);
 				break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 9f84b4d9fb88..6c31f7370193 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -322,6 +322,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 		break;
 
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		kernel_queue_init_v9(&kq->ops_asic_specific);
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 684a3bf07efd..33830b1a5a54 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -71,8 +71,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
 		uint32_t *buffer, struct qcm_process_device *qpd)
 {
 	struct pm4_mes_map_process *packet;
-	uint64_t vm_page_table_base_addr =
-		(uint64_t)(qpd->page_table_base) << 12;
+	uint64_t vm_page_table_base_addr = qpd->page_table_base;
 
 	packet = (struct pm4_mes_map_process *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 6e1f5c7c2d4b..8018163414ff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -20,21 +20,10 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/moduleparam.h>
 #include <linux/device.h>
-#include <linux/printk.h>
 #include "kfd_priv.h"
 
-#define KFD_DRIVER_AUTHOR	"AMD Inc. and others"
-
-#define KFD_DRIVER_DESC		"Standalone HSA driver for AMD's GPUs"
-#define KFD_DRIVER_DATE		"20150421"
-#define KFD_DRIVER_MAJOR	0
-#define KFD_DRIVER_MINOR	7
-#define KFD_DRIVER_PATCHLEVEL	2
-
 static const struct kgd2kfd_calls kgd2kfd = {
 	.exit		= kgd2kfd_exit,
 	.probe		= kgd2kfd_probe,
@@ -51,77 +40,7 @@ static const struct kgd2kfd_calls kgd2kfd = {
 	.post_reset	= kgd2kfd_post_reset,
 };
 
-int sched_policy = KFD_SCHED_POLICY_HWS;
-module_param(sched_policy, int, 0444);
-MODULE_PARM_DESC(sched_policy,
-	"Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
-
-int hws_max_conc_proc = 8;
-module_param(hws_max_conc_proc, int, 0444);
-MODULE_PARM_DESC(hws_max_conc_proc,
-	"Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
-
-int cwsr_enable = 1;
-module_param(cwsr_enable, int, 0444);
-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))");
-
-int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
-module_param(max_num_of_queues_per_device, int, 0444);
-MODULE_PARM_DESC(max_num_of_queues_per_device,
-	"Maximum number of supported queues per device (1 = Minimum, 4096 = default)");
-
-int send_sigterm;
-module_param(send_sigterm, int, 0444);
-MODULE_PARM_DESC(send_sigterm,
-	"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)");
-
-int debug_largebar;
-module_param(debug_largebar, int, 0444);
-MODULE_PARM_DESC(debug_largebar,
-	"Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)");
-
-int ignore_crat;
-module_param(ignore_crat, int, 0444);
-MODULE_PARM_DESC(ignore_crat,
-	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
-
-int noretry;
-module_param(noretry, int, 0644);
-MODULE_PARM_DESC(noretry,
-	"Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry enabled (default), 1 = retry disabled)");
-
-int halt_if_hws_hang;
-module_param(halt_if_hws_hang, int, 0644);
-MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
-
-
-static int amdkfd_init_completed;
-
-
-int kgd2kfd_init(unsigned int interface_version,
-		const struct kgd2kfd_calls **g2f)
-{
-	if (!amdkfd_init_completed)
-		return -EPROBE_DEFER;
-
-	/*
-	 * Only one interface version is supported,
-	 * no kfd/kgd version skew allowed.
-	 */
-	if (interface_version != KFD_INTERFACE_VERSION)
-		return -EINVAL;
-
-	*g2f = &kgd2kfd;
-
-	return 0;
-}
-EXPORT_SYMBOL(kgd2kfd_init);
-
-void kgd2kfd_exit(void)
-{
-}
-
-static int __init kfd_module_init(void)
+static int kfd_init(void)
 {
 	int err;
 
@@ -129,7 +48,7 @@ static int __init kfd_module_init(void)
 	if ((sched_policy < KFD_SCHED_POLICY_HWS) ||
 		(sched_policy > KFD_SCHED_POLICY_NO_HWS)) {
 		pr_err("sched_policy has invalid value\n");
-		return -1;
+		return -EINVAL;
 	}
 
 	/* Verify module parameters */
@@ -137,7 +56,7 @@ static int __init kfd_module_init(void)
 		(max_num_of_queues_per_device >
 			KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) {
 		pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n");
-		return -1;
+		return -EINVAL;
 	}
 
 	err = kfd_chardev_init();
@@ -154,10 +73,6 @@ static int __init kfd_module_init(void)
 
 	kfd_debugfs_init();
 
-	amdkfd_init_completed = 1;
-
-	dev_info(kfd_device, "Initialized module\n");
-
 	return 0;
 
 err_create_wq:
@@ -168,23 +83,30 @@ err_ioctl:
 	return err;
 }
 
-static void __exit kfd_module_exit(void)
+static void kfd_exit(void)
 {
-	amdkfd_init_completed = 0;
-
 	kfd_debugfs_fini();
 	kfd_process_destroy_wq();
 	kfd_topology_shutdown();
 	kfd_chardev_exit();
-	pr_info("amdkfd: Removed module\n");
 }
 
-module_init(kfd_module_init);
-module_exit(kfd_module_exit);
+int kgd2kfd_init(unsigned int interface_version,
+		const struct kgd2kfd_calls **g2f)
+{
+	int err;
+
+	err = kfd_init();
+	if (err)
+		return err;
+
+	*g2f = &kgd2kfd;
+
+	return 0;
+}
+EXPORT_SYMBOL(kgd2kfd_init);
 
-MODULE_AUTHOR(KFD_DRIVER_AUTHOR);
-MODULE_DESCRIPTION(KFD_DRIVER_DESC);
-MODULE_LICENSE("GPL and additional rights");
-MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "."
-	       __stringify(KFD_DRIVER_MINOR) "."
-	       __stringify(KFD_DRIVER_PATCHLEVEL));
+void kgd2kfd_exit(void)
+{
+	kfd_exit();
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 3bc25ab84f34..e33019a7a883 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -39,6 +39,7 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 	case CHIP_POLARIS11:
 		return mqd_manager_init_vi_tonga(type, dev);
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		return mqd_manager_init_v9(type, dev);
 	default:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index 4e84052d4e21..f8261313ae7b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -43,6 +43,9 @@
  *
  * @is_occupied: Checks if the relevant HQD slot is occupied.
  *
+ * @get_wave_state: Retrieves context save state and optionally copies the
+ * control stack, if kept in the MQD, to the given userspace address.
+ *
  * @mqd_mutex: Mqd manager mutex.
  *
  * @dev: The kfd device structure coupled with this module.
@@ -85,6 +88,11 @@ struct mqd_manager {
 				uint64_t queue_address,	uint32_t pipe_id,
 				uint32_t queue_id);
 
+	int	(*get_wave_state)(struct mqd_manager *mm, void *mqd,
+				  void __user *ctl_stack,
+				  u32 *ctl_stack_used_size,
+				  u32 *save_area_used_size);
+
 #if defined(CONFIG_DEBUG_FS)
 	int	(*debugfs_show_mqd)(struct seq_file *m, void *data);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 0cedb37cf513..f381c1cb27bd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -266,6 +266,28 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
 		pipe_id, queue_id);
 }
 
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct v9_mqd *m;
+
+	/* Control stack is located one page after MQD. */
+	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+
+	m = get_mqd(mqd);
+
+	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+		m->cp_hqd_cntl_stack_offset;
+	*save_area_used_size = m->cp_hqd_wg_state_offset;
+
+	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
@@ -435,6 +457,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->get_wave_state = get_wave_state;
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index b81fda3754da..6469b3456f00 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -269,6 +269,28 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
 		pipe_id, queue_id);
 }
 
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct vi_mqd *m;
+
+	m = get_mqd(mqd);
+
+	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+		m->cp_hqd_cntl_stack_offset;
+	*save_area_used_size = m->cp_hqd_wg_state_offset -
+		m->cp_hqd_cntl_stack_size;
+
+	/* Control stack is not copied to user mode for GFXv8 because
+	 * it's part of the context save area that is already
+	 * accessible to user mode
+	 */
+
+	return 0;
+}
+
 static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
@@ -436,6 +458,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->get_wave_state = get_wave_state;
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 1092631765cb..c6080ed3b6a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -229,6 +229,7 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 		pm->pmf = &kfd_vi_pm_funcs;
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		pm->pmf = &kfd_v9_pm_funcs;
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 92b285ca73aa..53ff86d45d91 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -103,7 +103,6 @@
  */
 extern int max_num_of_queues_per_device;
 
-#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT 4096
 #define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE		\
 	(KFD_MAX_NUM_OF_PROCESSES *			\
 			KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
@@ -149,33 +148,6 @@ extern int noretry;
  */
 extern int halt_if_hws_hang;
 
-/**
- * enum kfd_sched_policy
- *
- * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp)
- * scheduling. In this scheduling mode we're using the firmware code to
- * schedule the user mode queues and kernel queues such as HIQ and DIQ.
- * the HIQ queue is used as a special queue that dispatches the configuration
- * to the cp and the user mode queues list that are currently running.
- * the DIQ queue is a debugging queue that dispatches debugging commands to the
- * firmware.
- * in this scheduling mode user mode queues over subscription feature is
- * enabled.
- *
- * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over
- * subscription feature disabled.
- *
- * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly
- * set the command processor registers and sets the queues "manually". This
- * mode is used *ONLY* for debugging proposes.
- *
- */
-enum kfd_sched_policy {
-	KFD_SCHED_POLICY_HWS = 0,
-	KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION,
-	KFD_SCHED_POLICY_NO_HWS
-};
-
 enum cache_policy {
 	cache_policy_coherent,
 	cache_policy_noncoherent
@@ -204,6 +176,7 @@ struct kfd_device_info {
 	bool needs_iommu_device;
 	bool needs_pci_atomics;
 	unsigned int num_sdma_engines;
+	unsigned int num_sdma_queues_per_engine;
 };
 
 struct kfd_mem_obj {
@@ -275,6 +248,10 @@ struct kfd_dev {
 	/* Debug manager */
 	struct kfd_dbgmgr           *dbgmgr;
 
+	/* Firmware versions */
+	uint16_t mec_fw_version;
+	uint16_t sdma_fw_version;
+
 	/* Maximum process number mapped to HW scheduler */
 	unsigned int max_proc_per_quantum;
 
@@ -282,6 +259,11 @@ struct kfd_dev {
 	bool cwsr_enabled;
 	const void *cwsr_isa;
 	unsigned int cwsr_isa_size;
+
+	/* xGMI */
+	uint64_t hive_id;
+
+	bool pci_atomic_requested;
 };
 
 /* KGD2KFD callbacks */
@@ -525,11 +507,11 @@ struct qcm_process_device {
 	 * All the memory management data should be here too
 	 */
 	uint64_t gds_context_area;
+	uint64_t page_table_base;
 	uint32_t sh_mem_config;
 	uint32_t sh_mem_bases;
 	uint32_t sh_mem_ape1_base;
 	uint32_t sh_mem_ape1_limit;
-	uint32_t page_table_base;
 	uint32_t gds_size;
 	uint32_t num_gws;
 	uint32_t num_oac;
@@ -721,6 +703,7 @@ struct amdkfd_ioctl_desc {
 	unsigned int cmd_drv;
 	const char *name;
 };
+bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
@@ -880,6 +863,11 @@ int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p);
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 						unsigned int qid);
+int pqm_get_wave_state(struct process_queue_manager *pqm,
+		       unsigned int qid,
+		       void __user *ctl_stack,
+		       u32 *ctl_stack_used_size,
+		       u32 *save_area_used_size);
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 				unsigned int fence_value,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4694386cc623..0039e451d9af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -322,8 +322,10 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
 				pdd->dev->id, p->pasid);
 
-		if (pdd->drm_file)
+		if (pdd->drm_file) {
+			pdd->dev->kfd2kgd->release_process_vm(pdd->dev->kgd, pdd->vm);
 			fput(pdd->drm_file);
+		}
 		else if (pdd->vm)
 			pdd->dev->kfd2kgd->destroy_process_vm(
 				pdd->dev->kgd, pdd->vm);
@@ -687,11 +689,11 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 
 	if (drm_file)
 		ret = dev->kfd2kgd->acquire_process_vm(
-			dev->kgd, drm_file,
+			dev->kgd, drm_file, p->pasid,
 			&pdd->vm, &p->kgd_process_info, &p->ef);
 	else
 		ret = dev->kfd2kgd->create_process_vm(
-			dev->kgd, &pdd->vm, &p->kgd_process_info, &p->ef);
+			dev->kgd, p->pasid, &pdd->vm, &p->kgd_process_info, &p->ef);
 	if (ret) {
 		pr_err("Failed to create process VM object\n");
 		return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index c8cad9c078ae..fcaaf93681ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -408,6 +408,28 @@ struct kernel_queue *pqm_get_kernel_queue(
 	return NULL;
 }
 
+int pqm_get_wave_state(struct process_queue_manager *pqm,
+		       unsigned int qid,
+		       void __user *ctl_stack,
+		       u32 *ctl_stack_used_size,
+		       u32 *save_area_used_size)
+{
+	struct process_queue_node *pqn;
+
+	pqn = get_queue_by_qid(pqm, qid);
+	if (!pqn) {
+		pr_debug("amdkfd: No queue %d exists for operation\n",
+			 qid);
+		return -EFAULT;
+	}
+
+	return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
+						       pqn->q,
+						       ctl_stack,
+						       ctl_stack_used_size,
+						       save_area_used_size);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int pqm_debugfs_mqds(struct seq_file *m, void *data)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 80f5db4ef75f..e3843c5929ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -454,6 +454,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.location_id);
 	sysfs_show_32bit_prop(buffer, "drm_render_minor",
 			dev->node_props.drm_render_minor);
+	sysfs_show_64bit_prop(buffer, "hive_id",
+			dev->node_props.hive_id);
 
 	if (dev->gpu) {
 		log_max_watch_addr =
@@ -480,11 +482,11 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 				(unsigned long long int) 0);
 
 		sysfs_show_32bit_prop(buffer, "fw_version",
-			dev->gpu->kfd2kgd->get_fw_version(
-						dev->gpu->kgd,
-						KGD_ENGINE_MEC1));
+				dev->gpu->mec_fw_version);
 		sysfs_show_32bit_prop(buffer, "capability",
 				dev->node_props.capability);
+		sysfs_show_32bit_prop(buffer, "sdma_fw_version",
+				dev->gpu->sdma_fw_version);
 	}
 
 	return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute",
@@ -1125,17 +1127,40 @@ static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
 
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
-	struct kfd_iolink_properties *link;
+	struct kfd_iolink_properties *link, *cpu_link;
+	struct kfd_topology_device *cpu_dev;
+	uint32_t cap;
+	uint32_t cpu_flag = CRAT_IOLINK_FLAGS_ENABLED;
+	uint32_t flag = CRAT_IOLINK_FLAGS_ENABLED;
 
 	if (!dev || !dev->gpu)
 		return;
 
-	/* GPU only creates direck links so apply flags setting to all */
-	if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
-		list_for_each_entry(link, &dev->io_link_props, list)
-			link->flags = CRAT_IOLINK_FLAGS_ENABLED |
-				CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
-				CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+	pcie_capability_read_dword(dev->gpu->pdev,
+			PCI_EXP_DEVCAP2, &cap);
+
+	if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+		     PCI_EXP_DEVCAP2_ATOMIC_COMP64)))
+		cpu_flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+			CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+
+	if (!dev->gpu->pci_atomic_requested ||
+	    dev->gpu->device_info->asic_family == CHIP_HAWAII)
+		flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+			CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+
+	/* GPU only creates direct links so apply flags setting to all */
+	list_for_each_entry(link, &dev->io_link_props, list) {
+		link->flags = flag;
+		cpu_dev = kfd_topology_device_by_proximity_domain(
+				link->node_to);
+		if (cpu_dev) {
+			list_for_each_entry(cpu_link,
+					    &cpu_dev->io_link_props, list)
+				if (cpu_link->node_to == link->node_from)
+					cpu_link->flags = cpu_flag;
+		}
+	}
 }
 
 int kfd_topology_add_device(struct kfd_dev *gpu)
@@ -1230,6 +1255,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	dev->node_props.drm_render_minor =
 		gpu->shared_resources.drm_render_minor;
 
+	dev->node_props.hive_id = gpu->hive_id;
+
 	kfd_fill_mem_clk_max_info(dev);
 	kfd_fill_iolink_non_crat_info(dev);
 
@@ -1251,6 +1278,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 7d9c3f948dff..92a19be07344 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -49,6 +49,7 @@
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
 
 struct kfd_node_properties {
+	uint64_t hive_id;
 	uint32_t cpu_cores_count;
 	uint32_t simd_count;
 	uint32_t mem_banks_count;