summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2018-05-15 15:59:10 +1000
committerDave Airlie <airlied@redhat.com>2018-05-15 16:06:08 +1000
commitc76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb (patch)
tree1aeeb74795b2951952aa443f7104d6c090c58141 /drivers/gpu/drm/amd/amdkfd
parent444ac87becd8a2ff76f9e4194dd98da4f5d5586d (diff)
parentaf47b390273f1068bdb1d01263a81948c4e2f97a (diff)
downloadtalos-obmc-linux-c76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb.tar.gz
talos-obmc-linux-c76f0b2cc2f1be1a8a20f0fe2c0f30919bc559fb.zip
Merge tag 'drm-amdkfd-next-2018-05-14' of git://people.freedesktop.org/~gabbayo/linux into drm-next
This is amdkfd pull for 4.18. The major new features are: - Add support for GFXv9 dGPUs (VEGA) - Add support for userptr memory mapping In addition, there are a couple of small fixes and improvements, such as: - Fix lock handling - Fix rollback packet in kernel kfd_queue - Optimize kfd signal handling - Fix CP hang in APU Signed-off-by: Dave Airlie <airlied@redhat.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180514070126.GA1827@odedg-x270
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Makefile10
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c20
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_regs.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h560
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm274
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm1214
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c52
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_crat.c11
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c131
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c114
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c84
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c65
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c119
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c92
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c8
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c39
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h7
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c9
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c340
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c319
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_module.c7
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c443
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c392
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h583
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h112
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c50
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c22
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_queue.c8
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c6
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/soc15_int.h47
36 files changed, 4409 insertions, 750 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 0d0242240c47..ffd096fffc1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -30,12 +30,14 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
kfd_process.o kfd_queue.o kfd_mqd_manager.o \
kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
+ kfd_mqd_manager_v9.o \
kfd_kernel_queue.o kfd_kernel_queue_cik.o \
- kfd_kernel_queue_vi.o kfd_packet_manager.o \
- kfd_process_queue_manager.o kfd_device_queue_manager.o \
- kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
+ kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
+ kfd_packet_manager.o kfd_process_queue_manager.o \
+ kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
+ kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
- kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+ kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
ifneq ($(CONFIG_AMD_IOMMU_V2),)
amdkfd-y += kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 3d5ccb3755d4..49df6c791cfc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -27,18 +27,28 @@
static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
- unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
+ unsigned int vmid, pasid;
+
+ /* Only handle interrupts from KFD VMIDs */
+ vmid = (ihre->ring_id & 0x0000ff00) >> 8;
+ if (vmid < dev->vm_info.first_vmid_kfd ||
+ vmid > dev->vm_info.last_vmid_kfd)
+ return 0;
+ /* If there is no valid PASID, it's likely a firmware bug */
pasid = (ihre->ring_id & 0xffff0000) >> 16;
+ if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
+ return 0;
- /* Do not process in ISR, just request it to be forwarded to WQ. */
- return (pasid != 0) &&
- (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
+ /* Interrupt types we care about: various signals and faults.
+ * They will be forwarded to a work queue (see below).
+ */
+ return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE);
+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
}
static void cik_event_interrupt_wq(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
index 48769d12dd7b..37ce6dd65391 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
@@ -33,7 +33,8 @@
#define APE1_MTYPE(x) ((x) << 7)
/* valid for both DEFAULT_MTYPE and APE1_MTYPE */
-#define MTYPE_CACHED 0
+#define MTYPE_CACHED_NV 0
+#define MTYPE_CACHED 1
#define MTYPE_NONCACHED 3
#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
new file mode 100644
index 000000000000..f68aef02fc1f
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -0,0 +1,560 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+static const uint32_t cwsr_trap_gfx8_hex[] = {
+ 0xbf820001, 0xbf820125,
+ 0xb8f4f802, 0x89748674,
+ 0xb8f5f803, 0x8675ff75,
+ 0x00000400, 0xbf850011,
+ 0xc00a1e37, 0x00000000,
+ 0xbf8c007f, 0x87777978,
+ 0xbf840002, 0xb974f802,
+ 0xbe801d78, 0xb8f5f803,
+ 0x8675ff75, 0x000001ff,
+ 0xbf850002, 0x80708470,
+ 0x82718071, 0x8671ff71,
+ 0x0000ffff, 0xb974f802,
+ 0xbe801f70, 0xb8f5f803,
+ 0x8675ff75, 0x00000100,
+ 0xbf840006, 0xbefa0080,
+ 0xb97a0203, 0x8671ff71,
+ 0x0000ffff, 0x80f08870,
+ 0x82f18071, 0xbefa0080,
+ 0xb97a0283, 0xbef60068,
+ 0xbef70069, 0xb8fa1c07,
+ 0x8e7a9c7a, 0x87717a71,
+ 0xb8fa03c7, 0x8e7a9b7a,
+ 0x87717a71, 0xb8faf807,
+ 0x867aff7a, 0x00007fff,
+ 0xb97af807, 0xbef2007e,
+ 0xbef3007f, 0xbefe0180,
+ 0xbf900004, 0x877a8474,
+ 0xb97af802, 0xbf8e0002,
+ 0xbf88fffe, 0xbef8007e,
+ 0x8679ff7f, 0x0000ffff,
+ 0x8779ff79, 0x00040000,
+ 0xbefa0080, 0xbefb00ff,
+ 0x00807fac, 0x867aff7f,
+ 0x08000000, 0x8f7a837a,
+ 0x877b7a7b, 0x867aff7f,
+ 0x70000000, 0x8f7a817a,
+ 0x877b7a7b, 0xbeef007c,
+ 0xbeee0080, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8fa1605, 0x807a817a,
+ 0x8e7a867a, 0x806e7a6e,
+ 0xbefa0084, 0xbefa00ff,
+ 0x01000000, 0xbefe007c,
+ 0xbefc006e, 0xc0611bfc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611c3c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611c7c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611cbc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611cfc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611d3c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xb8f5f803,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0611d7c, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0611dbc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xbefe007c, 0xbefc006e,
+ 0xc0611dfc, 0x0000007c,
+ 0x806e846e, 0xbefc007e,
+ 0xb8eff801, 0xbefe007c,
+ 0xbefc006e, 0xc0611bfc,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611b3c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc006e, 0xc0611b7c,
+ 0x0000007c, 0x806e846e,
+ 0xbefc007e, 0x867aff7f,
+ 0x04000000, 0xbef30080,
+ 0x8773737a, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8f51605, 0x80758175,
+ 0x8e758475, 0x8e7a8275,
+ 0xbefa00ff, 0x01000000,
+ 0xbef60178, 0x80786e78,
+ 0x82798079, 0xbefc0080,
+ 0xbe802b00, 0xbe822b02,
+ 0xbe842b04, 0xbe862b06,
+ 0xbe882b08, 0xbe8a2b0a,
+ 0xbe8c2b0c, 0xbe8e2b0e,
+ 0xc06b003c, 0x00000000,
+ 0xc06b013c, 0x00000010,
+ 0xc06b023c, 0x00000020,
+ 0xc06b033c, 0x00000030,
+ 0x8078c078, 0x82798079,
+ 0x807c907c, 0xbf0a757c,
+ 0xbf85ffeb, 0xbef80176,
+ 0xbeee0080, 0xbefe00c1,
+ 0xbeff00c1, 0xbefa00ff,
+ 0x01000000, 0xe0724000,
+ 0x6e1e0000, 0xe0724100,
+ 0x6e1e0100, 0xe0724200,
+ 0x6e1e0200, 0xe0724300,
+ 0x6e1e0300, 0xbefe00c1,
+ 0xbeff00c1, 0xb8f54306,
+ 0x8675c175, 0xbf84002c,
+ 0xbf8a0000, 0x867aff73,
+ 0x04000000, 0xbf840028,
+ 0x8e758675, 0x8e758275,
+ 0xbefa0075, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8fa1605, 0x807a817a,
+ 0x8e7a867a, 0x806e7a6e,
+ 0x806eff6e, 0x00000080,
+ 0xbefa00ff, 0x01000000,
+ 0xbefc0080, 0xd28c0002,
+ 0x000100c1, 0xd28d0003,
+ 0x000204c1, 0xd1060002,
+ 0x00011103, 0x7e0602ff,
+ 0x00000200, 0xbefc00ff,
+ 0x00010000, 0xbe80007b,
+ 0x867bff7b, 0xff7fffff,
+ 0x877bff7b, 0x00058000,
+ 0xd8ec0000, 0x00000002,
+ 0xbf8c007f, 0xe0765000,
+ 0x6e1e0002, 0x32040702,
+ 0xd0c9006a, 0x0000eb02,
+ 0xbf87fff7, 0xbefb0000,
+ 0xbeee00ff, 0x00000400,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8f52a05, 0x80758175,
+ 0x8e758275, 0x8e7a8875,
+ 0xbefa00ff, 0x01000000,
+ 0xbefc0084, 0xbf0a757c,
+ 0xbf840015, 0xbf11017c,
+ 0x8075ff75, 0x00001000,
+ 0x7e000300, 0x7e020301,
+ 0x7e040302, 0x7e060303,
+ 0xe0724000, 0x6e1e0000,
+ 0xe0724100, 0x6e1e0100,
+ 0xe0724200, 0x6e1e0200,
+ 0xe0724300, 0x6e1e0300,
+ 0x807c847c, 0x806eff6e,
+ 0x00000400, 0xbf0a757c,
+ 0xbf85ffef, 0xbf9c0000,
+ 0xbf8200ca, 0xbef8007e,
+ 0x8679ff7f, 0x0000ffff,
+ 0x8779ff79, 0x00040000,
+ 0xbefa0080, 0xbefb00ff,
+ 0x00807fac, 0x8676ff7f,
+ 0x08000000, 0x8f768376,
+ 0x877b767b, 0x8676ff7f,
+ 0x70000000, 0x8f768176,
+ 0x877b767b, 0x8676ff7f,
+ 0x04000000, 0xbf84001e,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8f34306, 0x8673c173,
+ 0xbf840019, 0x8e738673,
+ 0x8e738273, 0xbefa0073,
+ 0xb8f22a05, 0x80728172,
+ 0x8e728a72, 0xb8f61605,
+ 0x80768176, 0x8e768676,
+ 0x80727672, 0x8072ff72,
+ 0x00000080, 0xbefa00ff,
+ 0x01000000, 0xbefc0080,
+ 0xe0510000, 0x721e0000,
+ 0xe0510100, 0x721e0000,
+ 0x807cff7c, 0x00000200,
+ 0x8072ff72, 0x00000200,
+ 0xbf0a737c, 0xbf85fff6,
+ 0xbef20080, 0xbefe00c1,
+ 0xbeff00c1, 0xb8f32a05,
+ 0x80738173, 0x8e738273,
+ 0x8e7a8873, 0xbefa00ff,
+ 0x01000000, 0xbef60072,
+ 0x8072ff72, 0x00000400,
+ 0xbefc0084, 0xbf11087c,
+ 0x8073ff73, 0x00008000,
+ 0xe0524000, 0x721e0000,
+ 0xe0524100, 0x721e0100,
+ 0xe0524200, 0x721e0200,
+ 0xe0524300, 0x721e0300,
+ 0xbf8c0f70, 0x7e000300,
+ 0x7e020301, 0x7e040302,
+ 0x7e060303, 0x807c847c,
+ 0x8072ff72, 0x00000400,
+ 0xbf0a737c, 0xbf85ffee,
+ 0xbf9c0000, 0xe0524000,
+ 0x761e0000, 0xe0524100,
+ 0x761e0100, 0xe0524200,
+ 0x761e0200, 0xe0524300,
+ 0x761e0300, 0xb8f22a05,
+ 0x80728172, 0x8e728a72,
+ 0xb8f61605, 0x80768176,
+ 0x8e768676, 0x80727672,
+ 0x80f2c072, 0xb8f31605,
+ 0x80738173, 0x8e738473,
+ 0x8e7a8273, 0xbefa00ff,
+ 0x01000000, 0xbefc0073,
+ 0xc031003c, 0x00000072,
+ 0x80f2c072, 0xbf8c007f,
+ 0x80fc907c, 0xbe802d00,
+ 0xbe822d02, 0xbe842d04,
+ 0xbe862d06, 0xbe882d08,
+ 0xbe8a2d0a, 0xbe8c2d0c,
+ 0xbe8e2d0e, 0xbf06807c,
+ 0xbf84fff1, 0xb8f22a05,
+ 0x80728172, 0x8e728a72,
+ 0xb8f61605, 0x80768176,
+ 0x8e768676, 0x80727672,
+ 0xbefa0084, 0xbefa00ff,
+ 0x01000000, 0xc0211cfc,
+ 0x00000072, 0x80728472,
+ 0xc0211c3c, 0x00000072,
+ 0x80728472, 0xc0211c7c,
+ 0x00000072, 0x80728472,
+ 0xc0211bbc, 0x00000072,
+ 0x80728472, 0xc0211bfc,
+ 0x00000072, 0x80728472,
+ 0xc0211d3c, 0x00000072,
+ 0x80728472, 0xc0211d7c,
+ 0x00000072, 0x80728472,
+ 0xc0211a3c, 0x00000072,
+ 0x80728472, 0xc0211a7c,
+ 0x00000072, 0x80728472,
+ 0xc0211dfc, 0x00000072,
+ 0x80728472, 0xc0211b3c,
+ 0x00000072, 0x80728472,
+ 0xc0211b7c, 0x00000072,
+ 0x80728472, 0xbf8c007f,
+ 0xbefc0073, 0xbefe006e,
+ 0xbeff006f, 0x867375ff,
+ 0x000003ff, 0xb9734803,
+ 0x867375ff, 0xfffff800,
+ 0x8f738b73, 0xb973a2c3,
+ 0xb977f801, 0x8673ff71,
+ 0xf0000000, 0x8f739c73,
+ 0x8e739073, 0xbef60080,
+ 0x87767376, 0x8673ff71,
+ 0x08000000, 0x8f739b73,
+ 0x8e738f73, 0x87767376,
+ 0x8673ff74, 0x00800000,
+ 0x8f739773, 0xb976f807,
+ 0x8671ff71, 0x0000ffff,
+ 0x86fe7e7e, 0x86ea6a6a,
+ 0xb974f802, 0xbf8a0000,
+ 0x95807370, 0xbf810000,
+};
+
+
+static const uint32_t cwsr_trap_gfx9_hex[] = {
+ 0xbf820001, 0xbf82015a,
+ 0xb8f8f802, 0x89788678,
+ 0xb8f1f803, 0x866eff71,
+ 0x00000400, 0xbf850034,
+ 0x866eff71, 0x00000800,
+ 0xbf850003, 0x866eff71,
+ 0x00000100, 0xbf840008,
+ 0x866eff78, 0x00002000,
+ 0xbf840001, 0xbf810000,
+ 0x8778ff78, 0x00002000,
+ 0x80ec886c, 0x82ed806d,
+ 0xb8eef807, 0x866fff6e,
+ 0x001f8000, 0x8e6f8b6f,
+ 0x8977ff77, 0xfc000000,
+ 0x87776f77, 0x896eff6e,
+ 0x001f8000, 0xb96ef807,
+ 0xb8f0f812, 0xb8f1f813,
+ 0x8ef08870, 0xc0071bb8,
+ 0x00000000, 0xbf8cc07f,
+ 0xc0071c38, 0x00000008,
+ 0xbf8cc07f, 0x86ee6e6e,
+ 0xbf840001, 0xbe801d6e,
+ 0xb8f1f803, 0x8671ff71,
+ 0x000001ff, 0xbf850002,
+ 0x806c846c, 0x826d806d,
+ 0x866dff6d, 0x0000ffff,
+ 0x8f6e8b77, 0x866eff6e,
+ 0x001f8000, 0xb96ef807,
+ 0x86fe7e7e, 0x86ea6a6a,
+ 0xb978f802, 0xbe801f6c,
+ 0x866dff6d, 0x0000ffff,
+ 0xbef00080, 0xb9700283,
+ 0xb8f02407, 0x8e709c70,
+ 0x876d706d, 0xb8f003c7,
+ 0x8e709b70, 0x876d706d,
+ 0xb8f0f807, 0x8670ff70,
+ 0x00007fff, 0xb970f807,
+ 0xbeee007e, 0xbeef007f,
+ 0xbefe0180, 0xbf900004,
+ 0x87708478, 0xb970f802,
+ 0xbf8e0002, 0xbf88fffe,
+ 0xb8f02a05, 0x80708170,
+ 0x8e708a70, 0xb8f11605,
+ 0x80718171, 0x8e718671,
+ 0x80707170, 0x80707e70,
+ 0x8271807f, 0x8671ff71,
+ 0x0000ffff, 0xc0471cb8,
+ 0x00000040, 0xbf8cc07f,
+ 0xc04b1d38, 0x00000048,
+ 0xbf8cc07f, 0xc0431e78,
+ 0x00000058, 0xbf8cc07f,
+ 0xc0471eb8, 0x0000005c,
+ 0xbf8cc07f, 0xbef4007e,
+ 0x8675ff7f, 0x0000ffff,
+ 0x8775ff75, 0x00040000,
+ 0xbef60080, 0xbef700ff,
+ 0x00807fac, 0x8670ff7f,
+ 0x08000000, 0x8f708370,
+ 0x87777077, 0x8670ff7f,
+ 0x70000000, 0x8f708170,
+ 0x87777077, 0xbefb007c,
+ 0xbefa0080, 0xb8fa2a05,
+ 0x807a817a, 0x8e7a8a7a,
+ 0xb8f01605, 0x80708170,
+ 0x8e708670, 0x807a707a,
+ 0xbef60084, 0xbef600ff,
+ 0x01000000, 0xbefe007c,
+ 0xbefc007a, 0xc0611efa,
+ 0x0000007c, 0xbf8cc07f,
+ 0x807a847a, 0xbefc007e,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611b3a, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc007a, 0xc0611b7a,
+ 0x0000007c, 0xbf8cc07f,
+ 0x807a847a, 0xbefc007e,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611bba, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc007a, 0xc0611bfa,
+ 0x0000007c, 0xbf8cc07f,
+ 0x807a847a, 0xbefc007e,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611e3a, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0xb8f1f803,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611c7a, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0xbefe007c,
+ 0xbefc007a, 0xc0611a3a,
+ 0x0000007c, 0xbf8cc07f,
+ 0x807a847a, 0xbefc007e,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611a7a, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0xb8fbf801,
+ 0xbefe007c, 0xbefc007a,
+ 0xc0611efa, 0x0000007c,
+ 0xbf8cc07f, 0x807a847a,
+ 0xbefc007e, 0x8670ff7f,
+ 0x04000000, 0xbeef0080,
+ 0x876f6f70, 0xb8fa2a05,
+ 0x807a817a, 0x8e7a8a7a,
+ 0xb8f11605, 0x80718171,
+ 0x8e718471, 0x8e768271,
+ 0xbef600ff, 0x01000000,
+ 0xbef20174, 0x80747a74,
+ 0x82758075, 0xbefc0080,
+ 0xbf800000, 0xbe802b00,
+ 0xbe822b02, 0xbe842b04,
+ 0xbe862b06, 0xbe882b08,
+ 0xbe8a2b0a, 0xbe8c2b0c,
+ 0xbe8e2b0e, 0xc06b003a,
+ 0x00000000, 0xbf8cc07f,
+ 0xc06b013a, 0x00000010,
+ 0xbf8cc07f, 0xc06b023a,
+ 0x00000020, 0xbf8cc07f,
+ 0xc06b033a, 0x00000030,
+ 0xbf8cc07f, 0x8074c074,
+ 0x82758075, 0x807c907c,
+ 0xbf0a717c, 0xbf85ffe7,
+ 0xbef40172, 0xbefa0080,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xbee80080, 0xbee90080,
+ 0xbef600ff, 0x01000000,
+ 0xe0724000, 0x7a1d0000,
+ 0xe0724100, 0x7a1d0100,
+ 0xe0724200, 0x7a1d0200,
+ 0xe0724300, 0x7a1d0300,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8f14306, 0x8671c171,
+ 0xbf84002c, 0xbf8a0000,
+ 0x8670ff6f, 0x04000000,
+ 0xbf840028, 0x8e718671,
+ 0x8e718271, 0xbef60071,
+ 0xb8fa2a05, 0x807a817a,
+ 0x8e7a8a7a, 0xb8f01605,
+ 0x80708170, 0x8e708670,
+ 0x807a707a, 0x807aff7a,
+ 0x00000080, 0xbef600ff,
+ 0x01000000, 0xbefc0080,
+ 0xd28c0002, 0x000100c1,
+ 0xd28d0003, 0x000204c1,
+ 0xd1060002, 0x00011103,
+ 0x7e0602ff, 0x00000200,
+ 0xbefc00ff, 0x00010000,
+ 0xbe800077, 0x8677ff77,
+ 0xff7fffff, 0x8777ff77,
+ 0x00058000, 0xd8ec0000,
+ 0x00000002, 0xbf8cc07f,
+ 0xe0765000, 0x7a1d0002,
+ 0x68040702, 0xd0c9006a,
+ 0x0000e302, 0xbf87fff7,
+ 0xbef70000, 0xbefa00ff,
+ 0x00000400, 0xbefe00c1,
+ 0xbeff00c1, 0xb8f12a05,
+ 0x80718171, 0x8e718271,
+ 0x8e768871, 0xbef600ff,
+ 0x01000000, 0xbefc0084,
+ 0xbf0a717c, 0xbf840015,
+ 0xbf11017c, 0x8071ff71,
+ 0x00001000, 0x7e000300,
+ 0x7e020301, 0x7e040302,
+ 0x7e060303, 0xe0724000,
+ 0x7a1d0000, 0xe0724100,
+ 0x7a1d0100, 0xe0724200,
+ 0x7a1d0200, 0xe0724300,
+ 0x7a1d0300, 0x807c847c,
+ 0x807aff7a, 0x00000400,
+ 0xbf0a717c, 0xbf85ffef,
+ 0xbf9c0000, 0xbf8200d9,
+ 0xbef4007e, 0x8675ff7f,
+ 0x0000ffff, 0x8775ff75,
+ 0x00040000, 0xbef60080,
+ 0xbef700ff, 0x00807fac,
+ 0x866eff7f, 0x08000000,
+ 0x8f6e836e, 0x87776e77,
+ 0x866eff7f, 0x70000000,
+ 0x8f6e816e, 0x87776e77,
+ 0x866eff7f, 0x04000000,
+ 0xbf84001e, 0xbefe00c1,
+ 0xbeff00c1, 0xb8ef4306,
+ 0x866fc16f, 0xbf840019,
+ 0x8e6f866f, 0x8e6f826f,
+ 0xbef6006f, 0xb8f82a05,
+ 0x80788178, 0x8e788a78,
+ 0xb8ee1605, 0x806e816e,
+ 0x8e6e866e, 0x80786e78,
+ 0x8078ff78, 0x00000080,
+ 0xbef600ff, 0x01000000,
+ 0xbefc0080, 0xe0510000,
+ 0x781d0000, 0xe0510100,
+ 0x781d0000, 0x807cff7c,
+ 0x00000200, 0x8078ff78,
+ 0x00000200, 0xbf0a6f7c,
+ 0xbf85fff6, 0xbef80080,
+ 0xbefe00c1, 0xbeff00c1,
+ 0xb8ef2a05, 0x806f816f,
+ 0x8e6f826f, 0x8e76886f,
+ 0xbef600ff, 0x01000000,
+ 0xbeee0078, 0x8078ff78,
+ 0x00000400, 0xbefc0084,
+ 0xbf11087c, 0x806fff6f,
+ 0x00008000, 0xe0524000,
+ 0x781d0000, 0xe0524100,
+ 0x781d0100, 0xe0524200,
+ 0x781d0200, 0xe0524300,
+ 0x781d0300, 0xbf8c0f70,
+ 0x7e000300, 0x7e020301,
+ 0x7e040302, 0x7e060303,
+ 0x807c847c, 0x8078ff78,
+ 0x00000400, 0xbf0a6f7c,
+ 0xbf85ffee, 0xbf9c0000,
+ 0xe0524000, 0x6e1d0000,
+ 0xe0524100, 0x6e1d0100,
+ 0xe0524200, 0x6e1d0200,
+ 0xe0524300, 0x6e1d0300,
+ 0xb8f82a05, 0x80788178,
+ 0x8e788a78, 0xb8ee1605,
+ 0x806e816e, 0x8e6e866e,
+ 0x80786e78, 0x80f8c078,
+ 0xb8ef1605, 0x806f816f,
+ 0x8e6f846f, 0x8e76826f,
+ 0xbef600ff, 0x01000000,
+ 0xbefc006f, 0xc031003a,
+ 0x00000078, 0x80f8c078,
+ 0xbf8cc07f, 0x80fc907c,
+ 0xbf800000, 0xbe802d00,
+ 0xbe822d02, 0xbe842d04,
+ 0xbe862d06, 0xbe882d08,
+ 0xbe8a2d0a, 0xbe8c2d0c,
+ 0xbe8e2d0e, 0xbf06807c,
+ 0xbf84fff0, 0xb8f82a05,
+ 0x80788178, 0x8e788a78,
+ 0xb8ee1605, 0x806e816e,
+ 0x8e6e866e, 0x80786e78,
+ 0xbef60084, 0xbef600ff,
+ 0x01000000, 0xc0211bfa,
+ 0x00000078, 0x80788478,
+ 0xc0211b3a, 0x00000078,
+ 0x80788478, 0xc0211b7a,
+ 0x00000078, 0x80788478,
+ 0xc0211eba, 0x00000078,
+ 0x80788478, 0xc0211efa,
+ 0x00000078, 0x80788478,
+ 0xc0211c3a, 0x00000078,
+ 0x80788478, 0xc0211c7a,
+ 0x00000078, 0x80788478,
+ 0xc0211a3a, 0x00000078,
+ 0x80788478, 0xc0211a7a,
+ 0x00000078, 0x80788478,
+ 0xc0211cfa, 0x00000078,
+ 0x80788478, 0xbf8cc07f,
+ 0xbefc006f, 0xbefe007a,
+ 0xbeff007b, 0x866f71ff,
+ 0x000003ff, 0xb96f4803,
+ 0x866f71ff, 0xfffff800,
+ 0x8f6f8b6f, 0xb96fa2c3,
+ 0xb973f801, 0xb8ee2a05,
+ 0x806e816e, 0x8e6e8a6e,
+ 0xb8ef1605, 0x806f816f,
+ 0x8e6f866f, 0x806e6f6e,
+ 0x806e746e, 0x826f8075,
+ 0x866fff6f, 0x0000ffff,
+ 0xc0071cb7, 0x00000040,
+ 0xc00b1d37, 0x00000048,
+ 0xc0031e77, 0x00000058,
+ 0xc0071eb7, 0x0000005c,
+ 0xbf8cc07f, 0x866fff6d,
+ 0xf0000000, 0x8f6f9c6f,
+ 0x8e6f906f, 0xbeee0080,
+ 0x876e6f6e, 0x866fff6d,
+ 0x08000000, 0x8f6f9b6f,
+ 0x8e6f8f6f, 0x876e6f6e,
+ 0x866fff70, 0x00800000,
+ 0x8f6f976f, 0xb96ef807,
+ 0x866dff6d, 0x0000ffff,
+ 0x86fe7e7e, 0x86ea6a6a,
+ 0xb970f802, 0xbf8a0000,
+ 0x95806f6c, 0xbf810000,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
index 997a383dcb8b..a2a04bb64096 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
@@ -20,9 +20,12 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#if 0
-HW (VI) source code for CWSR trap handler
-#Version 18 + multiple trap handler
+/* To compile this assembly code:
+ * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
+ */
+
+/* HW (VI) source code for CWSR trap handler */
+/* Version 18 + multiple trap handler */
// this performance-optimal version was originally from Seven Xu at SRDC
@@ -98,6 +101,7 @@ var SWIZZLE_EN = 0 //whether we use swi
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
@@ -149,7 +153,7 @@ var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
//tba_lo and tba_hi need to be saved/restored
-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
@@ -319,6 +323,10 @@ end
s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
end
+ // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
+ s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
+
L_SLEEP:
s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
@@ -1007,8 +1015,6 @@ end
s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
-
//for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
@@ -1044,6 +1050,7 @@ end
s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
@@ -1127,258 +1134,3 @@ end
function get_hwreg_size_bytes
return 128 //HWREG size 128 bytes
end
-
-
-#endif
-
-static const uint32_t cwsr_trap_gfx8_hex[] = {
- 0xbf820001, 0xbf820123,
- 0xb8f4f802, 0x89748674,
- 0xb8f5f803, 0x8675ff75,
- 0x00000400, 0xbf850011,
- 0xc00a1e37, 0x00000000,
- 0xbf8c007f, 0x87777978,
- 0xbf840002, 0xb974f802,
- 0xbe801d78, 0xb8f5f803,
- 0x8675ff75, 0x000001ff,
- 0xbf850002, 0x80708470,
- 0x82718071, 0x8671ff71,
- 0x0000ffff, 0xb974f802,
- 0xbe801f70, 0xb8f5f803,
- 0x8675ff75, 0x00000100,
- 0xbf840006, 0xbefa0080,
- 0xb97a0203, 0x8671ff71,
- 0x0000ffff, 0x80f08870,
- 0x82f18071, 0xbefa0080,
- 0xb97a0283, 0xbef60068,
- 0xbef70069, 0xb8fa1c07,
- 0x8e7a9c7a, 0x87717a71,
- 0xb8fa03c7, 0x8e7a9b7a,
- 0x87717a71, 0xb8faf807,
- 0x867aff7a, 0x00007fff,
- 0xb97af807, 0xbef2007e,
- 0xbef3007f, 0xbefe0180,
- 0xbf900004, 0xbf8e0002,
- 0xbf88fffe, 0xbef8007e,
- 0x8679ff7f, 0x0000ffff,
- 0x8779ff79, 0x00040000,
- 0xbefa0080, 0xbefb00ff,
- 0x00807fac, 0x867aff7f,
- 0x08000000, 0x8f7a837a,
- 0x877b7a7b, 0x867aff7f,
- 0x70000000, 0x8f7a817a,
- 0x877b7a7b, 0xbeef007c,
- 0xbeee0080, 0xb8ee2a05,
- 0x806e816e, 0x8e6e8a6e,
- 0xb8fa1605, 0x807a817a,
- 0x8e7a867a, 0x806e7a6e,
- 0xbefa0084, 0xbefa00ff,
- 0x01000000, 0xbefe007c,
- 0xbefc006e, 0xc0611bfc,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611c3c,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611c7c,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611cbc,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611cfc,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611d3c,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xb8f5f803,
- 0xbefe007c, 0xbefc006e,
- 0xc0611d7c, 0x0000007c,
- 0x806e846e, 0xbefc007e,
- 0xbefe007c, 0xbefc006e,
- 0xc0611dbc, 0x0000007c,
- 0x806e846e, 0xbefc007e,
- 0xbefe007c, 0xbefc006e,
- 0xc0611dfc, 0x0000007c,
- 0x806e846e, 0xbefc007e,
- 0xb8eff801, 0xbefe007c,
- 0xbefc006e, 0xc0611bfc,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611b3c,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0xbefe007c,
- 0xbefc006e, 0xc0611b7c,
- 0x0000007c, 0x806e846e,
- 0xbefc007e, 0x867aff7f,
- 0x04000000, 0xbef30080,
- 0x8773737a, 0xb8ee2a05,
- 0x806e816e, 0x8e6e8a6e,
- 0xb8f51605, 0x80758175,
- 0x8e758475, 0x8e7a8275,
- 0xbefa00ff, 0x01000000,
- 0xbef60178, 0x80786e78,
- 0x82798079, 0xbefc0080,
- 0xbe802b00, 0xbe822b02,
- 0xbe842b04, 0xbe862b06,
- 0xbe882b08, 0xbe8a2b0a,
- 0xbe8c2b0c, 0xbe8e2b0e,
- 0xc06b003c, 0x00000000,
- 0xc06b013c, 0x00000010,
- 0xc06b023c, 0x00000020,
- 0xc06b033c, 0x00000030,
- 0x8078c078, 0x82798079,
- 0x807c907c, 0xbf0a757c,
- 0xbf85ffeb, 0xbef80176,
- 0xbeee0080, 0xbefe00c1,
- 0xbeff00c1, 0xbefa00ff,
- 0x01000000, 0xe0724000,
- 0x6e1e0000, 0xe0724100,
- 0x6e1e0100, 0xe0724200,
- 0x6e1e0200, 0xe0724300,
- 0x6e1e0300, 0xbefe00c1,
- 0xbeff00c1, 0xb8f54306,
- 0x8675c175, 0xbf84002c,
- 0xbf8a0000, 0x867aff73,
- 0x04000000, 0xbf840028,
- 0x8e758675, 0x8e758275,
- 0xbefa0075, 0xb8ee2a05,
- 0x806e816e, 0x8e6e8a6e,
- 0xb8fa1605, 0x807a817a,
- 0x8e7a867a, 0x806e7a6e,
- 0x806eff6e, 0x00000080,
- 0xbefa00ff, 0x01000000,
- 0xbefc0080, 0xd28c0002,
- 0x000100c1, 0xd28d0003,
- 0x000204c1, 0xd1060002,
- 0x00011103, 0x7e0602ff,
- 0x00000200, 0xbefc00ff,
- 0x00010000, 0xbe80007b,
- 0x867bff7b, 0xff7fffff,
- 0x877bff7b, 0x00058000,
- 0xd8ec0000, 0x00000002,
- 0xbf8c007f, 0xe0765000,
- 0x6e1e0002, 0x32040702,
- 0xd0c9006a, 0x0000eb02,
- 0xbf87fff7, 0xbefb0000,
- 0xbeee00ff, 0x00000400,
- 0xbefe00c1, 0xbeff00c1,
- 0xb8f52a05, 0x80758175,
- 0x8e758275, 0x8e7a8875,
- 0xbefa00ff, 0x01000000,
- 0xbefc0084, 0xbf0a757c,
- 0xbf840015, 0xbf11017c,
- 0x8075ff75, 0x00001000,
- 0x7e000300, 0x7e020301,
- 0x7e040302, 0x7e060303,
- 0xe0724000, 0x6e1e0000,
- 0xe0724100, 0x6e1e0100,
- 0xe0724200, 0x6e1e0200,
- 0xe0724300, 0x6e1e0300,
- 0x807c847c, 0x806eff6e,
- 0x00000400, 0xbf0a757c,
- 0xbf85ffef, 0xbf9c0000,
- 0xbf8200ca, 0xbef8007e,
- 0x8679ff7f, 0x0000ffff,
- 0x8779ff79, 0x00040000,
- 0xbefa0080, 0xbefb00ff,
- 0x00807fac, 0x8676ff7f,
- 0x08000000, 0x8f768376,
- 0x877b767b, 0x8676ff7f,
- 0x70000000, 0x8f768176,
- 0x877b767b, 0x8676ff7f,
- 0x04000000, 0xbf84001e,
- 0xbefe00c1, 0xbeff00c1,
- 0xb8f34306, 0x8673c173,
- 0xbf840019, 0x8e738673,
- 0x8e738273, 0xbefa0073,
- 0xb8f22a05, 0x80728172,
- 0x8e728a72, 0xb8f61605,
- 0x80768176, 0x8e768676,
- 0x80727672, 0x8072ff72,
- 0x00000080, 0xbefa00ff,
- 0x01000000, 0xbefc0080,
- 0xe0510000, 0x721e0000,
- 0xe0510100, 0x721e0000,
- 0x807cff7c, 0x00000200,
- 0x8072ff72, 0x00000200,
- 0xbf0a737c, 0xbf85fff6,
- 0xbef20080, 0xbefe00c1,
- 0xbeff00c1, 0xb8f32a05,
- 0x80738173, 0x8e738273,
- 0x8e7a8873, 0xbefa00ff,
- 0x01000000, 0xbef60072,
- 0x8072ff72, 0x00000400,
- 0xbefc0084, 0xbf11087c,
- 0x8073ff73, 0x00008000,
- 0xe0524000, 0x721e0000,
- 0xe0524100, 0x721e0100,
- 0xe0524200, 0x721e0200,
- 0xe0524300, 0x721e0300,
- 0xbf8c0f70, 0x7e000300,
- 0x7e020301, 0x7e040302,
- 0x7e060303, 0x807c847c,
- 0x8072ff72, 0x00000400,
- 0xbf0a737c, 0xbf85ffee,
- 0xbf9c0000, 0xe0524000,
- 0x761e0000, 0xe0524100,
- 0x761e0100, 0xe0524200,
- 0x761e0200, 0xe0524300,
- 0x761e0300, 0xb8f22a05,
- 0x80728172, 0x8e728a72,
- 0xb8f61605, 0x80768176,
- 0x8e768676, 0x80727672,
- 0x80f2c072, 0xb8f31605,
- 0x80738173, 0x8e738473,
- 0x8e7a8273, 0xbefa00ff,
- 0x01000000, 0xbefc0073,
- 0xc031003c, 0x00000072,
- 0x80f2c072, 0xbf8c007f,
- 0x80fc907c, 0xbe802d00,
- 0xbe822d02, 0xbe842d04,
- 0xbe862d06, 0xbe882d08,
- 0xbe8a2d0a, 0xbe8c2d0c,
- 0xbe8e2d0e, 0xbf06807c,
- 0xbf84fff1, 0xb8f22a05,
- 0x80728172, 0x8e728a72,
- 0xb8f61605, 0x80768176,
- 0x8e768676, 0x80727672,
- 0xbefa0084, 0xbefa00ff,
- 0x01000000, 0xc0211cfc,
- 0x00000072, 0x80728472,
- 0xc0211c3c, 0x00000072,
- 0x80728472, 0xc0211c7c,
- 0x00000072, 0x80728472,
- 0xc0211bbc, 0x00000072,
- 0x80728472, 0xc0211bfc,
- 0x00000072, 0x80728472,
- 0xc0211d3c, 0x00000072,
- 0x80728472, 0xc0211d7c,
- 0x00000072, 0x80728472,
- 0xc0211a3c, 0x00000072,
- 0x80728472, 0xc0211a7c,
- 0x00000072, 0x80728472,
- 0xc0211dfc, 0x00000072,
- 0x80728472, 0xc0211b3c,
- 0x00000072, 0x80728472,
- 0xc0211b7c, 0x00000072,
- 0x80728472, 0xbf8c007f,
- 0x8671ff71, 0x0000ffff,
- 0xbefc0073, 0xbefe006e,
- 0xbeff006f, 0x867375ff,
- 0x000003ff, 0xb9734803,
- 0x867375ff, 0xfffff800,
- 0x8f738b73, 0xb973a2c3,
- 0xb977f801, 0x8673ff71,
- 0xf0000000, 0x8f739c73,
- 0x8e739073, 0xbef60080,
- 0x87767376, 0x8673ff71,
- 0x08000000, 0x8f739b73,
- 0x8e738f73, 0x87767376,
- 0x8673ff74, 0x00800000,
- 0x8f739773, 0xb976f807,
- 0x86fe7e7e, 0x86ea6a6a,
- 0xb974f802, 0xbf8a0000,
- 0x95807370, 0xbf810000,
-};
-
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
new file mode 100644
index 000000000000..998be96be736
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -0,0 +1,1214 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* To compile this assembly code:
+ * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex
+ */
+
+/* HW (GFX9) source code for CWSR trap handler */
+/* Version 18 + multiple trap handler */
+
+// this performance-optimal version was originally from Seven Xu at SRDC
+
+// Revison #18 --...
+/* Rev History
+** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
+** #4. SR Memory Layout:
+** 1. VGPR-SGPR-HWREG-{LDS}
+** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
+** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+** #7. Update: 1. don't barrier if noLDS
+** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+** 2. Fix SQ issue by s_sleep 2
+** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
+** 2. optimize s_buffer save by burst 16sgprs...
+** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
+** #11. Update 1. Add 2 more timestamp for debug version
+** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
+** #13. Integ 1. Always use MUBUF for PV trap shader...
+** #14. Update 1. s_buffer_store soft clause...
+** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
+** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
+** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
+** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
+** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
+** 2. FUNC - Handle non-CWSR traps
+*/
+
+var G8SR_WDMEM_HWREG_OFFSET = 0
+var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
+
+// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
+
+var G8SR_DEBUG_TIMESTAMP = 0
+var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+var s_g8sr_ts_save_s = s[34:35] // save start
+var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
+var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
+var s_g8sr_ts_save_d = s[40:41] // save end
+var s_g8sr_ts_restore_s = s[42:43] // restore start
+var s_g8sr_ts_restore_d = s[44:45] // restore end
+
+var G8SR_VGPR_SR_IN_DWX4 = 0
+var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
+var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+
+
+/*************************************************************************/
+/* control on how to run the shader */
+/*************************************************************************/
+//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
+var EMU_RUN_HACK = 0
+var EMU_RUN_HACK_RESTORE_NORMAL = 0
+var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+var SAVE_LDS = 1
+var WG_BASE_ADDR_LO = 0x9000a000
+var WG_BASE_ADDR_HI = 0x0
+var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+var CTX_SAVE_CONTROL = 0x0
+var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
+var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
+
+/**************************************************************************/
+/* variables */
+/**************************************************************************/
+var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
+var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+var SQ_WAVE_STATUS_HALT_MASK = 0x2000
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000
+
+/* Save */
+var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+
+var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+
+var s_save_spi_init_lo = exec_lo
+var s_save_spi_init_hi = exec_hi
+
+var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+var s_save_pc_hi = ttmp1
+var s_save_exec_lo = ttmp2
+var s_save_exec_hi = ttmp3
+var s_save_tmp = ttmp4
+var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+var s_save_xnack_mask_lo = ttmp6
+var s_save_xnack_mask_hi = ttmp7
+var s_save_buf_rsrc0 = ttmp8
+var s_save_buf_rsrc1 = ttmp9
+var s_save_buf_rsrc2 = ttmp10
+var s_save_buf_rsrc3 = ttmp11
+var s_save_status = ttmp12
+var s_save_mem_offset = ttmp14
+var s_save_alloc_size = s_save_trapsts //conflict
+var s_save_m0 = ttmp15
+var s_save_ttmps_lo = s_save_tmp //no conflict
+var s_save_ttmps_hi = s_save_trapsts //no conflict
+
+/* Restore */
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+
+var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo = exec_lo
+var s_restore_spi_init_hi = exec_hi
+
+var s_restore_mem_offset = ttmp12
+var s_restore_alloc_size = ttmp3
+var s_restore_tmp = ttmp2
+var s_restore_mem_offset_save = s_restore_tmp //no conflict
+
+var s_restore_m0 = s_restore_alloc_size //no conflict
+
+var s_restore_mode = ttmp7
+
+var s_restore_pc_lo = ttmp0
+var s_restore_pc_hi = ttmp1
+var s_restore_exec_lo = ttmp14
+var s_restore_exec_hi = ttmp15
+var s_restore_status = ttmp4
+var s_restore_trapsts = ttmp5
+var s_restore_xnack_mask_lo = xnack_mask_lo
+var s_restore_xnack_mask_hi = xnack_mask_hi
+var s_restore_buf_rsrc0 = ttmp8
+var s_restore_buf_rsrc1 = ttmp9
+var s_restore_buf_rsrc2 = ttmp10
+var s_restore_buf_rsrc3 = ttmp11
+var s_restore_ttmps_lo = s_restore_tmp //no conflict
+var s_restore_ttmps_hi = s_restore_alloc_size //no conflict
+
+/**************************************************************************/
+/* trap handler entry points */
+/**************************************************************************/
+/* Shader Main*/
+
+shader main
+ asic(GFX9)
+ type(CS)
+
+
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+ else
+ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+ end
+
+L_JUMP_TO_RESTORE:
+ s_branch L_RESTORE //restore
+
+L_SKIP_RESTORE:
+
+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+ s_cbranch_scc1 L_SAVE //this is the operation for save
+
+ // ********* Handle non-CWSR traps *******************
+if (!EMU_RUN_HACK)
+ // Illegal instruction is a non-maskable exception which blocks context save.
+ // Halt the wavefront and return from the trap.
+ s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+ s_cbranch_scc1 L_HALT_WAVE
+
+ // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
+ // Instead, halt the wavefront and return from the trap.
+ s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+ s_cbranch_scc0 L_FETCH_2ND_TRAP
+
+L_HALT_WAVE:
+ // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
+ // We cannot prevent further faults so just terminate the wavefront.
+ s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+ s_cbranch_scc0 L_NOT_ALREADY_HALTED
+ s_endpgm
+L_NOT_ALREADY_HALTED:
+ s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+
+ // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
+ // Rewind the PC to prevent this from occurring. The debugger compensates for this.
+ s_sub_u32 ttmp0, ttmp0, 0x8
+ s_subb_u32 ttmp1, ttmp1, 0x0
+
+L_FETCH_2ND_TRAP:
+ // Preserve and clear scalar XNACK state before issuing scalar reads.
+ // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
+ s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
+ s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+ s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+ s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
+ s_or_b32 ttmp11, ttmp11, ttmp3
+
+ s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+ s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
+
+ // Read second-level TBA/TMA from first-level TMA and jump if available.
+ // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
+ // ttmp12 holds SQ_WAVE_STATUS
+ s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
+ s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
+ s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
+ s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
+ s_waitcnt lgkmcnt(0)
+ s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
+ s_waitcnt lgkmcnt(0)
+ s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
+ s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set
+ s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
+
+L_NO_NEXT_TRAP:
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+ s_addc_u32 ttmp1, ttmp1, 0
+L_EXCP_CASE:
+ s_and_b32 ttmp1, ttmp1, 0xFFFF
+
+ // Restore SQ_WAVE_IB_STS.
+ s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+ s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+ s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
+
+ // Restore SQ_WAVE_STATUS.
+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status
+
+ s_rfe_b64 [ttmp0, ttmp1]
+end
+ // ********* End handling of non-CWSR traps *******************
+
+/**************************************************************************/
+/* save routine */
+/**************************************************************************/
+
+L_SAVE:
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_save_s
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+end
+
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+
+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+
+ /* inform SPI the readiness and wait for SPI's go signal */
+ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+ s_mov_b32 s_save_exec_hi, exec_hi
+ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_sq_save_msg
+ s_waitcnt lgkmcnt(0)
+end
+
+ if (EMU_RUN_HACK)
+
+ else
+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+ end
+
+ // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
+ s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
+
+ L_SLEEP:
+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+
+ if (EMU_RUN_HACK)
+
+ else
+ s_cbranch_execz L_SLEEP
+ end
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_spi_wrexec
+ s_waitcnt lgkmcnt(0)
+end
+
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_save_tmp, v9, 0
+ s_lshr_b32 s_save_tmp, s_save_tmp, 6
+ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+ else
+ end
+
+ // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
+ get_vgpr_size_bytes(s_save_ttmps_lo)
+ get_sgpr_size_bytes(s_save_ttmps_hi)
+ s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
+ s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
+ s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
+ s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
+ s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
+ ack_sqc_store_workaround()
+ s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
+ ack_sqc_store_workaround()
+ s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
+ ack_sqc_store_workaround()
+ s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
+ ack_sqc_store_workaround()
+
+ /* setup Resource Contants */
+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+
+ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
+ s_mov_b32 s_save_m0, m0 //save M0
+
+ /* global mem offset */
+ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+
+
+
+
+ /* save HW registers */
+ //////////////////////////////
+
+ L_SAVE_HWREG:
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ get_sgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+
+
+ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
+
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ end
+
+ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
+ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
+ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
+
+ //s_save_trapsts conflicts with s_save_alloc_size
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+
+ write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+ write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+
+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+
+
+
+ /* the first wave in the threadgroup */
+ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit
+ s_mov_b32 s_save_exec_hi, 0x0
+ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
+
+
+ /* save SGPRs */
+ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+ //////////////////////////////
+
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+ s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+
+ s_mov_b32 m0, 0x0 //SGPR initial index value =0
+ s_nop 0x0 //Manually inserted wait states
+ L_SAVE_SGPR_LOOP:
+ // SGPR is allocated in 16 SGPR granularity
+ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+
+ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+ s_add_u32 m0, m0, 16 //next sgpr index
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
+ // restore s_save_buf_rsrc0,1
+ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+
+
+
+
+ /* save first 4 VGPR, then LDS save could use */
+ // each wave will alloc 4 vgprs at least...
+ /////////////////////////////////////////////////////////////////////////////////////
+
+ s_mov_b32 s_save_mem_offset, 0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+ s_mov_b32 xnack_mask_lo, 0x0
+ s_mov_b32 xnack_mask_hi, 0x0
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+end
+
+
+
+ /* save LDS */
+ //////////////////////////////
+
+ L_SAVE_LDS:
+
+ // Change EXEC to all threads...
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE
+
+ s_barrier //LDS is used? wait for other waves in the same TG
+ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+ s_cbranch_scc0 L_SAVE_LDS_DONE
+
+ // first wave do LDS save;
+
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_save_mem_offset)
+ get_sgpr_size_bytes(s_save_tmp)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+
+
+var LDS_DMA_ENABLE = 0
+var UNROLL = 0
+if UNROLL==0 && LDS_DMA_ENABLE==1
+ s_mov_b32 s3, 256*2
+ s_nop 0
+ s_nop 0
+ s_nop 0
+ L_SAVE_LDS_LOOP:
+ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
+ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+ end
+
+ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
+
+elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss
+ // store from higest LDS address to lowest
+ s_mov_b32 s3, 256*2
+ s_sub_u32 m0, s_save_alloc_size, s3
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks...
+ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest
+ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc
+ s_nop 0
+ s_nop 0
+ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
+ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
+ s_add_u32 s0, s0,s_save_alloc_size
+ s_addc_u32 s1, s1, 0
+ s_setpc_b64 s[0:1]
+
+
+ for var i =0; i< 128; i++
+ // be careful to make here a 64Byte aligned address, which could improve performance...
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
+ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+
+ if i!=127
+ s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline
+ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
+ end
+ end
+
+else // BUFFER_STORE
+ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+ v_mul_i32_i24 v2, v3, 8 // tid*8
+ v_mov_b32 v3, 256*2
+ s_mov_b32 m0, 0x10000
+ s_mov_b32 s0, s_save_buf_rsrc3
+ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
+ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
+
+L_SAVE_LDS_LOOP_VECTOR:
+ ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
+ s_waitcnt lgkmcnt(0)
+ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+// s_waitcnt vmcnt(0)
+// v_add_u32 v2, vcc[0:1], v2, v3
+ v_add_u32 v2, v2, v3
+ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+
+ // restore rsrc3
+ s_mov_b32 s_save_buf_rsrc3, s0
+
+end
+
+L_SAVE_LDS_DONE:
+
+
+ /* save VGPRs - set the Rest VGPRs */
+ //////////////////////////////////////////////////////////////////////////////////////
+ L_SAVE_VGPR:
+ // VGPR SR memory offset: 0
+ // TODO rearrange the RSRC words to use swizzle for VGPR save...
+
+ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+
+ // VGPR Allocated in 4-GPR granularity
+
+if G8SR_VGPR_SR_IN_DWX4
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ s_mov_b32 m0, 4 // skip first 4 VGPRs
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
+
+ s_set_gpr_idx_on m0, 0x1 // This will change M0
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
+L_SAVE_VGPR_LOOP:
+ v_mov_b32 v0, v0 // v0 = v[0+m0]
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+
+
+ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ s_add_u32 m0, m0, 4
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+ s_set_gpr_idx_off
+L_SAVE_VGPR_LOOP_END:
+
+ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+else
+ // VGPR store using dw burst
+ s_mov_b32 m0, 0x4 //VGPR initial index value =0
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc0 L_SAVE_VGPR_END
+
+
+ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
+
+ L_SAVE_VGPR_LOOP:
+ v_mov_b32 v0, v0 //v0 = v[0+m0]
+ v_mov_b32 v1, v1 //v0 = v[0+m0]
+ v_mov_b32 v2, v2 //v0 = v[0+m0]
+ v_mov_b32 v3, v3 //v0 = v[0+m0]
+
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+ end
+
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+ s_set_gpr_idx_off
+end
+
+L_SAVE_VGPR_END:
+
+
+
+
+
+
+ /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+ s_rfe_b64 s_save_pc_lo //Return to the main shader program
+ else
+ end
+
+// Save Done timestamp
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_save_d
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_save_mem_offset)
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+ // Need reset rsrc2??
+ s_mov_b32 m0, s_save_mem_offset
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+end
+
+
+ s_branch L_END_PGM
+
+
+
+/**************************************************************************/
+/* restore routine */
+/**************************************************************************/
+
+L_RESTORE:
+ /* Setup Resource Contants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_restore_tmp, v9, 0
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+ else
+ end
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_s
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+ // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
+ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored..
+end
+
+
+
+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+
+ /* global mem offset */
+// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+
+ /* the first wave in the threadgroup */
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_RESTORE_VGPR
+
+ /* restore LDS */
+ //////////////////////////////
+ L_RESTORE_LDS:
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+
+ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+ //
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
+
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+ s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+
+ L_RESTORE_LDS_LOOP:
+ if (SAVE_LDS)
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
+ end
+ s_add_u32 m0, m0, 256*2 // 128 DW
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
+
+
+ /* restore VGPRs */
+ //////////////////////////////
+ L_RESTORE_VGPR:
+ // VGPR SR memory offset : 0
+ s_mov_b32 s_restore_mem_offset, 0x0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+if G8SR_VGPR_SR_IN_DWX4
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+
+ // the const stride for DWx4 is 4*4 bytes
+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+
+ s_mov_b32 m0, s_restore_alloc_size
+ s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0
+
+L_RESTORE_VGPR_LOOP:
+ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ s_waitcnt vmcnt(0)
+ s_sub_u32 m0, m0, 4
+ v_mov_b32 v0, v0 // v[0+m0] = v0
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+ s_cmp_eq_u32 m0, 0x8000
+ s_cbranch_scc0 L_RESTORE_VGPR_LOOP
+ s_set_gpr_idx_off
+
+ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes
+
+else
+ // VGPR load using dw burst
+ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+ s_mov_b32 m0, 4 //VGPR initial index value = 1
+ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
+
+ L_RESTORE_VGPR_LOOP:
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+ end
+ s_waitcnt vmcnt(0) //ensure data ready
+ v_mov_b32 v0, v0 //v[0+m0] = v0
+ v_mov_b32 v1, v1
+ v_mov_b32 v2, v2
+ v_mov_b32 v3, v3
+ s_add_u32 m0, m0, 4 //next vgpr index
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
+ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
+ s_set_gpr_idx_off
+ /* VGPR restore on v0 */
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+ end
+
+end
+
+ /* restore SGPRs */
+ //////////////////////////////
+
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group
+ // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ s_mov_b32 m0, s_restore_alloc_size
+
+ L_RESTORE_SGPR_LOOP:
+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
+ s_waitcnt lgkmcnt(0) //ensure data ready
+
+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+ s_nop 0 // hazard SALU M0=> S_MOVREL
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+ s_movreld_b64 s4, s4
+ s_movreld_b64 s6, s6
+ s_movreld_b64 s8, s8
+ s_movreld_b64 s10, s10
+ s_movreld_b64 s12, s12
+ s_movreld_b64 s14, s14
+
+ s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
+
+ /* restore HW registers */
+ //////////////////////////////
+ L_RESTORE_HWREG:
+
+
+if G8SR_DEBUG_TIMESTAMP
+ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+end
+
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
+ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
+ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
+ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
+ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
+ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
+ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
+ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
+
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+
+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+ end
+
+ s_mov_b32 m0, s_restore_m0
+ s_mov_b32 exec_lo, s_restore_exec_lo
+ s_mov_b32 exec_hi, s_restore_exec_hi
+
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+
+ // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
+ get_vgpr_size_bytes(s_restore_ttmps_lo)
+ get_sgpr_size_bytes(s_restore_ttmps_hi)
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
+ s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
+ s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
+ s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
+ s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
+ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
+ s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
+ s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
+ s_waitcnt lgkmcnt(0)
+
+ //reuse s_restore_m0 as a temp register
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+
+ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+
+ s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_d
+ s_waitcnt lgkmcnt(0)
+end
+
+// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
+
+
+/**************************************************************************/
+/* the END */
+/**************************************************************************/
+L_END_PGM:
+ s_endpgm
+
+end
+
+
+/**************************************************************************/
+/* the helper functions */
+/**************************************************************************/
+
+//Only for save hwreg to mem
+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, s_mem_offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:1
+ ack_sqc_store_workaround()
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ s_mov_b32 m0, exec_lo
+end
+
+
+// HWREG are saved before SGPRs, so all HWREG could be use.
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+
+ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
+ ack_sqc_store_workaround()
+ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
+ ack_sqc_store_workaround()
+ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
+ ack_sqc_store_workaround()
+ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+ ack_sqc_store_workaround()
+ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
+ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
+end
+
+
+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+end
+
+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
+ s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+end
+
+
+
+function get_lds_size_bytes(s_lds_size_byte)
+ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
+ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
+end
+
+function get_vgpr_size_bytes(s_vgpr_size_byte)
+ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
+ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
+end
+
+function get_sgpr_size_bytes(s_sgpr_size_byte)
+ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
+ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value)
+end
+
+function get_hwreg_size_bytes
+ return 128 //HWREG size 128 bytes
+end
+
+function ack_sqc_store_workaround
+ if ACK_SQC_STORE
+ s_waitcnt lgkmcnt(0)
+ end
+end
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 59808a39ecf4..f64c5551cdba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -233,7 +233,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
pr_debug("Queue Size: 0x%llX, %u\n",
q_properties->queue_size, args->ring_size);
- pr_debug("Queue r/w Pointers: %p, %p\n",
+ pr_debug("Queue r/w Pointers: %px, %px\n",
q_properties->read_ptr,
q_properties->write_ptr);
@@ -292,8 +292,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
/* Return gpu_id as doorbell offset for mmap usage */
- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id);
+ args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL;
+ args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
args->doorbell_offset <<= PAGE_SHIFT;
+ if (KFD_IS_SOC15(dev->device_info->asic_family))
+ /* On SOC15 ASICs, doorbell allocation must be
+ * per-device, and independent from the per-process
+ * queue_id. Return the doorbell offset within the
+ * doorbell aperture to user mode.
+ */
+ args->doorbell_offset |= q_properties.doorbell_off;
mutex_unlock(&p->mutex);
@@ -1296,8 +1304,8 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
return -EINVAL;
}
- devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr),
- GFP_KERNEL);
+ devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
+ GFP_KERNEL);
if (!devices_arr)
return -ENOMEM;
@@ -1405,8 +1413,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
return -EINVAL;
}
- devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr),
- GFP_KERNEL);
+ devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
+ GFP_KERNEL);
if (!devices_arr)
return -ENOMEM;
@@ -1645,23 +1653,33 @@ err_i1:
static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct kfd_process *process;
+ struct kfd_dev *dev = NULL;
+ unsigned long vm_pgoff;
+ unsigned int gpu_id;
process = kfd_get_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) ==
- KFD_MMAP_DOORBELL_MASK) {
- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK;
- return kfd_doorbell_mmap(process, vma);
- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) ==
- KFD_MMAP_EVENTS_MASK) {
- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
+ vm_pgoff = vma->vm_pgoff;
+ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff);
+ gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff);
+ if (gpu_id)
+ dev = kfd_device_by_id(gpu_id);
+
+ switch (vm_pgoff & KFD_MMAP_TYPE_MASK) {
+ case KFD_MMAP_TYPE_DOORBELL:
+ if (!dev)
+ return -ENODEV;
+ return kfd_doorbell_mmap(dev, process, vma);
+
+ case KFD_MMAP_TYPE_EVENTS:
return kfd_event_mmap(process, vma);
- } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) ==
- KFD_MMAP_RESERVED_MEM_MASK) {
- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK;
- return kfd_reserved_mem_mmap(process, vma);
+
+ case KFD_MMAP_TYPE_RESERVED_MEM:
+ if (!dev)
+ return -ENODEV;
+ return kfd_reserved_mem_mmap(dev, process, vma);
}
return -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 4f126ef6139b..296b3f230280 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -132,6 +132,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
#define fiji_cache_info carrizo_cache_info
#define polaris10_cache_info carrizo_cache_info
#define polaris11_cache_info carrizo_cache_info
+/* TODO - check & update Vega10 cache details */
+#define vega10_cache_info carrizo_cache_info
+#define raven_cache_info carrizo_cache_info
static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
struct crat_subtype_computeunit *cu)
@@ -603,6 +606,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
pcache_info = polaris11_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
break;
+ case CHIP_VEGA10:
+ pcache_info = vega10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
+ break;
+ case CHIP_RAVEN:
+ pcache_info = raven_cache_info;
+ num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+ break;
default:
return -EINVAL;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 3346699960dd..7ee6cec2c060 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -20,16 +20,13 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
-#include <linux/amd-iommu.h>
-#endif
#include <linux/bsearch.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers_vi.h"
-#include "cwsr_trap_handler_gfx8.asm"
+#include "cwsr_trap_handler.h"
#include "kfd_iommu.h"
#define MQD_SIZE_ALIGNED 768
@@ -41,6 +38,7 @@ static const struct kfd_device_info kaveri_device_info = {
.max_pasid_bits = 16,
/* max num of queues for KV.TODO should be a dynamic value */
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -55,6 +53,7 @@ static const struct kfd_device_info carrizo_device_info = {
.max_pasid_bits = 16,
/* max num of queues for CZ.TODO should be a dynamic value */
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -70,6 +69,7 @@ static const struct kfd_device_info hawaii_device_info = {
.max_pasid_bits = 16,
/* max num of queues for KV.TODO should be a dynamic value */
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -83,6 +83,7 @@ static const struct kfd_device_info tonga_device_info = {
.asic_family = CHIP_TONGA,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -96,6 +97,7 @@ static const struct kfd_device_info tonga_vf_device_info = {
.asic_family = CHIP_TONGA,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -109,6 +111,7 @@ static const struct kfd_device_info fiji_device_info = {
.asic_family = CHIP_FIJI,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -122,6 +125,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
.asic_family = CHIP_FIJI,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -136,6 +140,7 @@ static const struct kfd_device_info polaris10_device_info = {
.asic_family = CHIP_POLARIS10,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -149,6 +154,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
.asic_family = CHIP_POLARIS10,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -162,6 +168,7 @@ static const struct kfd_device_info polaris11_device_info = {
.asic_family = CHIP_POLARIS11,
.max_pasid_bits = 16,
.max_no_of_hqd = 24,
+ .doorbell_size = 4,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.event_interrupt_class = &event_interrupt_class_cik,
.num_of_watch_points = 4,
@@ -171,6 +178,34 @@ static const struct kfd_device_info polaris11_device_info = {
.needs_pci_atomics = true,
};
+static const struct kfd_device_info vega10_device_info = {
+ .asic_family = CHIP_VEGA10,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .doorbell_size = 8,
+ .ih_ring_entry_size = 8 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_v9,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .supports_cwsr = true,
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+};
+
+static const struct kfd_device_info vega10_vf_device_info = {
+ .asic_family = CHIP_VEGA10,
+ .max_pasid_bits = 16,
+ .max_no_of_hqd = 24,
+ .doorbell_size = 8,
+ .ih_ring_entry_size = 8 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_v9,
+ .num_of_watch_points = 4,
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .supports_cwsr = true,
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+};
+
struct kfd_deviceid {
unsigned short did;
@@ -250,6 +285,15 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x67EB, &polaris11_device_info }, /* Polaris11 */
{ 0x67EF, &polaris11_device_info }, /* Polaris11 */
{ 0x67FF, &polaris11_device_info }, /* Polaris11 */
+ { 0x6860, &vega10_device_info }, /* Vega10 */
+ { 0x6861, &vega10_device_info }, /* Vega10 */
+ { 0x6862, &vega10_device_info }, /* Vega10 */
+ { 0x6863, &vega10_device_info }, /* Vega10 */
+ { 0x6864, &vega10_device_info }, /* Vega10 */
+ { 0x6867, &vega10_device_info }, /* Vega10 */
+ { 0x6868, &vega10_device_info }, /* Vega10 */
+ { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/
+ { 0x687F, &vega10_device_info }, /* Vega10 */
};
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -279,7 +323,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
{
struct kfd_dev *kfd;
-
+ int ret;
const struct kfd_device_info *device_info =
lookup_device_info(pdev->device);
@@ -288,19 +332,18 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
return NULL;
}
- if (device_info->needs_pci_atomics) {
- /* Allow BIF to recode atomics to PCIe 3.0
- * AtomicOps. 32 and 64-bit requests are possible and
- * must be supported.
- */
- if (pci_enable_atomic_ops_to_root(pdev,
- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
- PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) {
- dev_info(kfd_device,
- "skipped device %x:%x, PCI rejects atomics",
- pdev->vendor, pdev->device);
- return NULL;
- }
+ /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
+ * 32 and 64-bit requests are possible and must be
+ * supported.
+ */
+ ret = pci_enable_atomic_ops_to_root(pdev,
+ PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+ if (device_info->needs_pci_atomics && ret < 0) {
+ dev_info(kfd_device,
+ "skipped device %x:%x, PCI rejects atomics\n",
+ pdev->vendor, pdev->device);
+ return NULL;
}
kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
@@ -323,10 +366,16 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
static void kfd_cwsr_init(struct kfd_dev *kfd)
{
if (cwsr_enable && kfd->device_info->supports_cwsr) {
- BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+ if (kfd->device_info->asic_family < CHIP_VEGA10) {
+ BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+ kfd->cwsr_isa = cwsr_trap_gfx8_hex;
+ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
+ } else {
+ BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
+ kfd->cwsr_isa = cwsr_trap_gfx9_hex;
+ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
+ }
- kfd->cwsr_isa = cwsr_trap_gfx8_hex;
- kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
kfd->cwsr_enabled = true;
}
}
@@ -541,6 +590,44 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock(&kfd->interrupt_lock);
}
+int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+{
+ struct kfd_process *p;
+ int r;
+
+ /* Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+ * running so the lookup function increments the process ref count.
+ */
+ p = kfd_lookup_process_by_mm(mm);
+ if (!p)
+ return -ESRCH;
+
+ r = kfd_process_evict_queues(p);
+
+ kfd_unref_process(p);
+ return r;
+}
+
+int kgd2kfd_resume_mm(struct mm_struct *mm)
+{
+ struct kfd_process *p;
+ int r;
+
+ /* Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+ * running so the lookup function increments the process ref count.
+ */
+ p = kfd_lookup_process_by_mm(mm);
+ if (!p)
+ return -ESRCH;
+
+ r = kfd_process_restore_queues(p);
+
+ kfd_unref_process(p);
+ return r;
+}
+
/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
* prepare for safe eviction of KFD BOs that belong to the specified
* process.
@@ -652,7 +739,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
return -ENOMEM;
- *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+ *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
if ((*mem_obj) == NULL)
return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d55d29d31da4..668ad07ebe1f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -110,6 +110,57 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
qpd->sh_mem_bases);
}
+static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
+{
+ struct kfd_dev *dev = qpd->dqm->dev;
+
+ if (!KFD_IS_SOC15(dev->device_info->asic_family)) {
+ /* On pre-SOC15 chips we need to use the queue ID to
+ * preserve the user mode ABI.
+ */
+ q->doorbell_id = q->properties.queue_id;
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ /* For SDMA queues on SOC15, use static doorbell
+ * assignments based on the engine and queue.
+ */
+ q->doorbell_id = dev->shared_resources.sdma_doorbell
+ [q->properties.sdma_engine_id]
+ [q->properties.sdma_queue_id];
+ } else {
+ /* For CP queues on SOC15 reserve a free doorbell ID */
+ unsigned int found;
+
+ found = find_first_zero_bit(qpd->doorbell_bitmap,
+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
+ if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
+ pr_debug("No doorbells available");
+ return -EBUSY;
+ }
+ set_bit(found, qpd->doorbell_bitmap);
+ q->doorbell_id = found;
+ }
+
+ q->properties.doorbell_off =
+ kfd_doorbell_id_to_offset(dev, q->process,
+ q->doorbell_id);
+
+ return 0;
+}
+
+static void deallocate_doorbell(struct qcm_process_device *qpd,
+ struct queue *q)
+{
+ unsigned int old;
+ struct kfd_dev *dev = qpd->dqm->dev;
+
+ if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA)
+ return;
+
+ old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
+ WARN_ON(!old);
+}
+
static int allocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
@@ -145,15 +196,19 @@ static int allocate_vmid(struct device_queue_manager *dqm,
static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
struct qcm_process_device *qpd)
{
- uint32_t len;
+ const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf;
+ int ret;
if (!qpd->ib_kaddr)
return -ENOMEM;
- len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+ ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+ if (ret)
+ return ret;
return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
- qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+ qpd->ib_base, (uint32_t *)qpd->ib_kaddr,
+ pmf->release_mem_size / sizeof(uint32_t));
}
static void deallocate_vmid(struct device_queue_manager *dqm,
@@ -301,10 +356,14 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
if (retval)
return retval;
+ retval = allocate_doorbell(qpd, q);
+ if (retval)
+ goto out_deallocate_hqd;
+
retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
- goto out_deallocate_hqd;
+ goto out_deallocate_doorbell;
pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
q->pipe, q->queue);
@@ -324,6 +383,8 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
out_uninit_mqd:
mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
out_deallocate_hqd:
deallocate_hqd(dqm, q);
@@ -357,6 +418,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
}
dqm->total_queue_count--;
+ deallocate_doorbell(qpd, q);
+
retval = mqd->destroy_mqd(mqd, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
KFD_UNMAP_LATENCY_MS,
@@ -861,6 +924,10 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+ retval = allocate_doorbell(qpd, q);
+ if (retval)
+ goto out_deallocate_sdma_queue;
+
pr_debug("SDMA id is: %d\n", q->sdma_id);
pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
@@ -869,7 +936,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
- goto out_deallocate_sdma_queue;
+ goto out_deallocate_doorbell;
retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
if (retval)
@@ -879,6 +946,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
out_uninit_mqd:
mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
deallocate_sdma_queue(dqm, q->sdma_id);
@@ -1070,12 +1139,17 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
q->properties.sdma_engine_id =
q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
}
+
+ retval = allocate_doorbell(qpd, q);
+ if (retval)
+ goto out_deallocate_sdma_queue;
+
mqd = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
if (!mqd) {
retval = -ENOMEM;
- goto out_deallocate_sdma_queue;
+ goto out_deallocate_doorbell;
}
/*
* Eviction state logic: we only mark active queues as evicted
@@ -1093,7 +1167,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
- goto out_deallocate_sdma_queue;
+ goto out_deallocate_doorbell;
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
@@ -1117,6 +1191,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
mutex_unlock(&dqm->lock);
return retval;
+out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
deallocate_sdma_queue(dqm, q->sdma_id);
@@ -1257,6 +1333,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
goto failed;
}
+ deallocate_doorbell(qpd, q);
+
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
deallocate_sdma_queue(dqm, q->sdma_id);
@@ -1308,7 +1386,10 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
void __user *alternate_aperture_base,
uint64_t alternate_aperture_size)
{
- bool retval;
+ bool retval = true;
+
+ if (!dqm->asic_ops.set_cache_memory_policy)
+ return retval;
mutex_lock(&dqm->lock);
@@ -1577,6 +1658,11 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
case CHIP_POLARIS11:
device_queue_manager_init_vi_tonga(&dqm->asic_ops);
break;
+
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ device_queue_manager_init_v9(&dqm->asic_ops);
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
@@ -1627,6 +1713,18 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
int pipe, queue;
int r = 0;
+ r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
+ KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs);
+ if (!r) {
+ seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n",
+ KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
+ KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
+ KFD_CIK_HIQ_QUEUE);
+ seq_reg_dump(m, dump, n_regs);
+
+ kfree(dump);
+ }
+
for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
int pipe_offset = pipe * get_queues_per_pipe(dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 412beff3281d..59a6b1956932 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -200,6 +200,8 @@ void device_queue_manager_init_vi(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_vi_tonga(
struct device_queue_manager_asic_ops *asic_ops);
+void device_queue_manager_init_v9(
+ struct device_queue_manager_asic_ops *asic_ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
unsigned int get_queues_num(struct device_queue_manager *dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
new file mode 100644
index 000000000000..79e5bcf6367c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_device_queue_manager.h"
+#include "vega10_enum.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
+#include "sdma0/sdma0_4_0_sh_mask.h"
+
+static int update_qpd_v9(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+
+void device_queue_manager_init_v9(
+ struct device_queue_manager_asic_ops *asic_ops)
+{
+ asic_ops->update_qpd = update_qpd_v9;
+ asic_ops->init_sdma_vm = init_sdma_vm_v9;
+}
+
+static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
+{
+ uint32_t shared_base = pdd->lds_base >> 48;
+ uint32_t private_base = pdd->scratch_base >> 48;
+
+ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
+ private_base;
+}
+
+static int update_qpd_v9(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+{
+ struct kfd_process_device *pdd;
+
+ pdd = qpd_to_pdd(qpd);
+
+ /* check if sh_mem_config register already configured */
+ if (qpd->sh_mem_config == 0) {
+ qpd->sh_mem_config =
+ SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+ if (vega10_noretry &&
+ !dqm->dev->device_info->needs_iommu_device)
+ qpd->sh_mem_config |=
+ 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+
+ qpd->sh_mem_ape1_limit = 0;
+ qpd->sh_mem_ape1_base = 0;
+ }
+
+ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+
+ pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+
+ return 0;
+}
+
+static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+{
+ /* Not needed on SDMAv4 any more */
+ q->properties.sdma_vm_addr = 0;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index ebb4da14e3df..c3744d89352c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -33,7 +33,6 @@
static DEFINE_IDA(doorbell_ida);
static unsigned int max_doorbell_slices;
-#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4
/*
* Each device exposes a doorbell aperture, a PCI MMIO aperture that
@@ -50,9 +49,9 @@ static unsigned int max_doorbell_slices;
*/
/* # of doorbell bytes allocated for each process. */
-static inline size_t doorbell_process_allocation(void)
+size_t kfd_doorbell_process_slice(struct kfd_dev *kfd)
{
- return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES *
+ return roundup(kfd->device_info->doorbell_size *
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
PAGE_SIZE);
}
@@ -72,16 +71,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
doorbell_start_offset =
roundup(kfd->shared_resources.doorbell_start_offset,
- doorbell_process_allocation());
+ kfd_doorbell_process_slice(kfd));
doorbell_aperture_size =
rounddown(kfd->shared_resources.doorbell_aperture_size,
- doorbell_process_allocation());
+ kfd_doorbell_process_slice(kfd));
if (doorbell_aperture_size > doorbell_start_offset)
doorbell_process_limit =
(doorbell_aperture_size - doorbell_start_offset) /
- doorbell_process_allocation();
+ kfd_doorbell_process_slice(kfd);
else
return -ENOSPC;
@@ -95,7 +94,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32);
kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
- doorbell_process_allocation());
+ kfd_doorbell_process_slice(kfd));
if (!kfd->doorbell_kernel_ptr)
return -ENOMEM;
@@ -127,21 +126,16 @@ void kfd_doorbell_fini(struct kfd_dev *kfd)
iounmap(kfd->doorbell_kernel_ptr);
}
-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
+int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
+ struct vm_area_struct *vma)
{
phys_addr_t address;
- struct kfd_dev *dev;
/*
* For simplicitly we only allow mapping of the entire doorbell
* allocation of a single device & process.
*/
- if (vma->vm_end - vma->vm_start != doorbell_process_allocation())
- return -EINVAL;
-
- /* Find kfd device according to gpu id */
- dev = kfd_device_by_id(vma->vm_pgoff);
- if (!dev)
+ if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev))
return -EINVAL;
/* Calculate physical address of doorbell */
@@ -158,19 +152,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
" vm_flags == 0x%04lX\n"
" size == 0x%04lX\n",
(unsigned long long) vma->vm_start, address, vma->vm_flags,
- doorbell_process_allocation());
+ kfd_doorbell_process_slice(dev));
return io_remap_pfn_range(vma,
vma->vm_start,
address >> PAGE_SHIFT,
- doorbell_process_allocation(),
+ kfd_doorbell_process_slice(dev),
vma->vm_page_prot);
}
/* get kernel iomem pointer for a doorbell */
-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
unsigned int *doorbell_off)
{
u32 inx;
@@ -185,6 +179,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
return NULL;
+ inx *= kfd->device_info->doorbell_size / sizeof(u32);
+
/*
* Calculating the kernel doorbell offset using the first
* doorbell page.
@@ -210,7 +206,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
mutex_unlock(&kfd->doorbell_mutex);
}
-inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
+void write_kernel_doorbell(void __iomem *db, u32 value)
{
if (db) {
writel(value, db);
@@ -218,30 +214,37 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
}
}
-/*
- * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
- * to doorbells with the process's doorbell page
- */
-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
+void write_kernel_doorbell64(void __iomem *db, u64 value)
+{
+ if (db) {
+ WARN(((unsigned long)db & 7) != 0,
+ "Unaligned 64-bit doorbell");
+ writeq(value, (u64 __iomem *)db);
+ pr_debug("writing %llu to doorbell address %p\n", value, db);
+ }
+}
+
+unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
struct kfd_process *process,
- unsigned int queue_id)
+ unsigned int doorbell_id)
{
/*
* doorbell_id_offset accounts for doorbells taken by KGD.
- * index * doorbell_process_allocation/sizeof(u32) adjusts to
- * the process's doorbells.
+ * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to
+ * the process's doorbells. The offset returned is in dword
+ * units regardless of the ASIC-dependent doorbell size.
*/
return kfd->doorbell_id_offset +
process->doorbell_index
- * doorbell_process_allocation() / sizeof(u32) +
- queue_id;
+ * kfd_doorbell_process_slice(kfd) / sizeof(u32) +
+ doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
}
uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
{
uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size -
kfd->shared_resources.doorbell_start_offset) /
- doorbell_process_allocation() + 1;
+ kfd_doorbell_process_slice(kfd) + 1;
return num_of_elems;
@@ -251,7 +254,7 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
struct kfd_process *process)
{
return dev->doorbell_base +
- process->doorbell_index * doorbell_process_allocation();
+ process->doorbell_index * kfd_doorbell_process_slice(dev);
}
int kfd_alloc_process_doorbells(struct kfd_process *process)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 4890a90f1e44..5562e94e786a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -345,7 +345,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
case KFD_EVENT_TYPE_DEBUG:
ret = create_signal_event(devkfd, p, ev);
if (!ret) {
- *event_page_offset = KFD_MMAP_EVENTS_MASK;
+ *event_page_offset = KFD_MMAP_TYPE_EVENTS;
*event_page_offset <<= PAGE_SHIFT;
*event_slot_index = ev->event_id;
}
@@ -496,7 +496,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
partial_id, valid_id_bits);
- if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) {
+ if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) {
/* With relatively few events, it's faster to
* iterate over the event IDR
*/
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 66852de410c8..97d5423c5673 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -275,23 +275,35 @@
* for FLAT_* / S_LOAD operations.
*/
-#define MAKE_GPUVM_APP_BASE(gpu_num) \
+#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \
(((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
#define MAKE_GPUVM_APP_LIMIT(base, size) \
(((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
-#define MAKE_SCRATCH_APP_BASE() \
+#define MAKE_SCRATCH_APP_BASE_VI() \
(((uint64_t)(0x1UL) << 61) + 0x100000000L)
#define MAKE_SCRATCH_APP_LIMIT(base) \
(((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
-#define MAKE_LDS_APP_BASE() \
+#define MAKE_LDS_APP_BASE_VI() \
(((uint64_t)(0x1UL) << 61) + 0x0)
#define MAKE_LDS_APP_LIMIT(base) \
(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+/* On GFXv9 the LDS and scratch apertures are programmed independently
+ * using the high 16 bits of the 64-bit virtual address. They must be
+ * in the hole, which will be the case as long as the high 16 bits are
+ * not 0.
+ *
+ * The aperture sizes are still 4GB implicitly.
+ *
+ * A GPUVM aperture is not applicable on GFXv9.
+ */
+#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
+#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
+
/* User mode manages most of the SVM aperture address space. The low
* 16MB are reserved for kernel use (CWSR trap handler and kernel IB
* for now).
@@ -300,6 +312,55 @@
#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE)
#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE)
+static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
+{
+ /*
+ * node id couldn't be 0 - the three MSB bits of
+ * aperture shoudn't be 0
+ */
+ pdd->lds_base = MAKE_LDS_APP_BASE_VI();
+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+
+ if (!pdd->dev->device_info->needs_iommu_device) {
+ /* dGPUs: SVM aperture starting at 0
+ * with small reserved space for kernel.
+ * Set them to CANONICAL addresses.
+ */
+ pdd->gpuvm_base = SVM_USER_BASE;
+ pdd->gpuvm_limit =
+ pdd->dev->shared_resources.gpuvm_size - 1;
+ } else {
+ /* set them to non CANONICAL addresses, and no SVM is
+ * allocated.
+ */
+ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
+ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base,
+ pdd->dev->shared_resources.gpuvm_size);
+ }
+
+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+}
+
+static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
+{
+ pdd->lds_base = MAKE_LDS_APP_BASE_V9();
+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+
+ /* Raven needs SVM to support graphic handle, etc. Leave the small
+ * reserved space before SVM on Raven as well, even though we don't
+ * have to.
+ * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they
+ * are used in Thunk to reserve SVM.
+ */
+ pdd->gpuvm_base = SVM_USER_BASE;
+ pdd->gpuvm_limit =
+ pdd->dev->shared_resources.gpuvm_size - 1;
+
+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+}
+
int kfd_init_apertures(struct kfd_process *process)
{
uint8_t id = 0;
@@ -307,9 +368,7 @@ int kfd_init_apertures(struct kfd_process *process)
struct kfd_process_device *pdd;
/*Iterating over all devices*/
- while (kfd_topology_enum_kfd_devices(id, &dev) == 0 &&
- id < NUM_OF_SUPPORTED_GPUS) {
-
+ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
if (!dev) {
id++; /* Skip non GPU devices */
continue;
@@ -318,7 +377,7 @@ int kfd_init_apertures(struct kfd_process *process)
pdd = kfd_create_process_device_data(dev, process);
if (!pdd) {
pr_err("Failed to create process device data\n");
- return -1;
+ return -ENOMEM;
}
/*
* For 64 bit process apertures will be statically reserved in
@@ -330,32 +389,30 @@ int kfd_init_apertures(struct kfd_process *process)
pdd->gpuvm_base = pdd->gpuvm_limit = 0;
pdd->scratch_base = pdd->scratch_limit = 0;
} else {
- /* Same LDS and scratch apertures can be used
- * on all GPUs. This allows using more dGPUs
- * than placement options for apertures.
- */
- pdd->lds_base = MAKE_LDS_APP_BASE();
- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
-
- pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
- pdd->scratch_limit =
- MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+ switch (dev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ case CHIP_HAWAII:
+ case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
+ kfd_init_apertures_vi(pdd, id);
+ break;
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ kfd_init_apertures_v9(pdd, id);
+ break;
+ default:
+ WARN(1, "Unexpected ASIC family %u",
+ dev->device_info->asic_family);
+ return -EINVAL;
+ }
- if (dev->device_info->needs_iommu_device) {
- /* APUs: GPUVM aperture in
- * non-canonical address space
- */
- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
- pdd->gpuvm_base,
- dev->shared_resources.gpuvm_size);
- } else {
- /* dGPUs: SVM aperture starting at 0
- * with small reserved space for kernel
+ if (!dev->device_info->needs_iommu_device) {
+ /* dGPUs: the reserved space for kernel
+ * before SVM
*/
- pdd->gpuvm_base = SVM_USER_BASE;
- pdd->gpuvm_limit =
- dev->shared_resources.gpuvm_size - 1;
pdd->qpd.cwsr_base = SVM_CWSR_BASE;
pdd->qpd.ib_base = SVM_IB_BASE;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
new file mode 100644
index 000000000000..37029baa3346
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_priv.h"
+#include "kfd_events.h"
+#include "soc15_int.h"
+
+
+static bool event_interrupt_isr_v9(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry)
+{
+ uint16_t source_id, client_id, pasid, vmid;
+ const uint32_t *data = ih_ring_entry;
+
+ /* Only handle interrupts from KFD VMIDs */
+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+ if (vmid < dev->vm_info.first_vmid_kfd ||
+ vmid > dev->vm_info.last_vmid_kfd)
+ return 0;
+
+ /* If there is no valid PASID, it's likely a firmware bug */
+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+ if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
+ return 0;
+
+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+ pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, pasid);
+ pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3],
+ data[4], data[5], data[6], data[7]);
+
+ /* Interrupt types we care about: various signals and faults.
+ * They will be forwarded to a work queue (see below).
+ */
+ return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+ source_id == SOC15_INTSRC_SDMA_TRAP ||
+ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+ source_id == SOC15_INTSRC_CP_BAD_OPCODE;
+}
+
+static void event_interrupt_wq_v9(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry)
+{
+ uint16_t source_id, client_id, pasid, vmid;
+ uint32_t context_id;
+
+ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+ context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+
+ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
+ kfd_signal_event_interrupt(pasid, context_id, 32);
+ else if (source_id == SOC15_INTSRC_SDMA_TRAP)
+ kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28);
+ else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG)
+ kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24);
+ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+ kfd_signal_hw_exception_event(pasid);
+ else if (client_id == SOC15_IH_CLIENTID_VMC ||
+ client_id == SOC15_IH_CLIENTID_UTCL2) {
+ /* TODO */
+ }
+}
+
+const struct kfd_event_interrupt_class event_interrupt_class_v9 = {
+ .interrupt_isr = event_interrupt_isr_v9,
+ .interrupt_wq = event_interrupt_wq_v9,
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index 035c351f47c5..db6d9336b80d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -139,10 +139,12 @@ static void interrupt_wq(struct work_struct *work)
{
struct kfd_dev *dev = container_of(work, struct kfd_dev,
interrupt_work);
+ uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE];
- uint32_t ih_ring_entry[DIV_ROUND_UP(
- dev->device_info->ih_ring_entry_size,
- sizeof(uint32_t))];
+ if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) {
+ dev_err_once(kfd_chardev(), "Ring entry too small\n");
+ return;
+ }
while (dequeue_ih_ring_entry(dev, ih_ring_entry))
dev->device_info->event_interrupt_class->interrupt_wq(dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 69f496485331..476951d8c91c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
- retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel),
+ retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size,
&kq->wptr_mem);
if (retval != 0)
@@ -208,6 +208,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
size_t available_size;
size_t queue_size_dwords;
uint32_t wptr, rptr;
+ uint64_t wptr64;
unsigned int *queue_address;
/* When rptr == wptr, the buffer is empty.
@@ -216,7 +217,8 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
* the opposite. So we can only use up to queue_size_dwords - 1 dwords.
*/
rptr = *kq->rptr_kernel;
- wptr = *kq->wptr_kernel;
+ wptr = kq->pending_wptr;
+ wptr64 = kq->pending_wptr64;
queue_address = (unsigned int *)kq->pq_kernel_addr;
queue_size_dwords = kq->queue->properties.queue_size / 4;
@@ -232,29 +234,33 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
* make sure calling functions know
* acquire_packet_buffer() failed
*/
- *buffer_ptr = NULL;
- return -ENOMEM;
+ goto err_no_space;
}
if (wptr + packet_size_in_dwords >= queue_size_dwords) {
/* make sure after rolling back to position 0, there is
* still enough space.
*/
- if (packet_size_in_dwords >= rptr) {
- *buffer_ptr = NULL;
- return -ENOMEM;
- }
+ if (packet_size_in_dwords >= rptr)
+ goto err_no_space;
+
/* fill nops, roll back and start at position 0 */
while (wptr > 0) {
queue_address[wptr] = kq->nop_packet;
wptr = (wptr + 1) % queue_size_dwords;
+ wptr64++;
}
}
*buffer_ptr = &queue_address[wptr];
kq->pending_wptr = wptr + packet_size_in_dwords;
+ kq->pending_wptr64 = wptr64 + packet_size_in_dwords;
return 0;
+
+err_no_space:
+ *buffer_ptr = NULL;
+ return -ENOMEM;
}
static void submit_packet(struct kernel_queue *kq)
@@ -270,14 +276,18 @@ static void submit_packet(struct kernel_queue *kq)
pr_debug("\n");
#endif
- *kq->wptr_kernel = kq->pending_wptr;
- write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
- kq->pending_wptr);
+ kq->ops_asic_specific.submit_packet(kq);
}
static void rollback_packet(struct kernel_queue *kq)
{
- kq->pending_wptr = *kq->queue->properties.write_ptr;
+ if (kq->dev->device_info->doorbell_size == 8) {
+ kq->pending_wptr64 = *kq->wptr64_kernel;
+ kq->pending_wptr = *kq->wptr_kernel %
+ (kq->queue->properties.queue_size / 4);
+ } else {
+ kq->pending_wptr = *kq->wptr_kernel;
+ }
}
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -308,6 +318,11 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
case CHIP_HAWAII:
kernel_queue_init_cik(&kq->ops_asic_specific);
break;
+
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ kernel_queue_init_v9(&kq->ops_asic_specific);
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index 594053136ee4..97aff2041a5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -72,6 +72,7 @@ struct kernel_queue {
struct kfd_dev *dev;
struct mqd_manager *mqd;
struct queue *queue;
+ uint64_t pending_wptr64;
uint32_t pending_wptr;
unsigned int nop_packet;
@@ -79,7 +80,10 @@ struct kernel_queue {
uint32_t *rptr_kernel;
uint64_t rptr_gpu_addr;
struct kfd_mem_obj *wptr_mem;
- uint32_t *wptr_kernel;
+ union {
+ uint64_t *wptr64_kernel;
+ uint32_t *wptr_kernel;
+ };
uint64_t wptr_gpu_addr;
struct kfd_mem_obj *pq;
uint64_t pq_gpu_addr;
@@ -97,5 +101,6 @@ struct kernel_queue {
void kernel_queue_init_cik(struct kernel_queue_ops *ops);
void kernel_queue_init_vi(struct kernel_queue_ops *ops);
+void kernel_queue_init_v9(struct kernel_queue_ops *ops);
#endif /* KFD_KERNEL_QUEUE_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
index a90eb440b1fb..19e54acb4125 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
@@ -26,11 +26,13 @@
static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size);
static void uninitialize_cik(struct kernel_queue *kq);
+static void submit_packet_cik(struct kernel_queue *kq);
void kernel_queue_init_cik(struct kernel_queue_ops *ops)
{
ops->initialize = initialize_cik;
ops->uninitialize = uninitialize_cik;
+ ops->submit_packet = submit_packet_cik;
}
static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
@@ -42,3 +44,10 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
static void uninitialize_cik(struct kernel_queue *kq)
{
}
+
+static void submit_packet_cik(struct kernel_queue *kq)
+{
+ *kq->wptr_kernel = kq->pending_wptr;
+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
+ kq->pending_wptr);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
new file mode 100644
index 000000000000..684a3bf07efd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_kernel_queue.h"
+#include "kfd_device_queue_manager.h"
+#include "kfd_pm4_headers_ai.h"
+#include "kfd_pm4_opcodes.h"
+
+static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size);
+static void uninitialize_v9(struct kernel_queue *kq);
+static void submit_packet_v9(struct kernel_queue *kq);
+
+void kernel_queue_init_v9(struct kernel_queue_ops *ops)
+{
+ ops->initialize = initialize_v9;
+ ops->uninitialize = uninitialize_v9;
+ ops->submit_packet = submit_packet_v9;
+}
+
+static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size)
+{
+ int retval;
+
+ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
+ if (retval)
+ return false;
+
+ kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
+ kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
+
+ memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
+
+ return true;
+}
+
+static void uninitialize_v9(struct kernel_queue *kq)
+{
+ kfd_gtt_sa_free(kq->dev, kq->eop_mem);
+}
+
+static void submit_packet_v9(struct kernel_queue *kq)
+{
+ *kq->wptr64_kernel = kq->pending_wptr64;
+ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
+ kq->pending_wptr64);
+}
+
+static int pm_map_process_v9(struct packet_manager *pm,
+ uint32_t *buffer, struct qcm_process_device *qpd)
+{
+ struct pm4_mes_map_process *packet;
+ uint64_t vm_page_table_base_addr =
+ (uint64_t)(qpd->page_table_base) << 12;
+
+ packet = (struct pm4_mes_map_process *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+ sizeof(struct pm4_mes_map_process));
+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+ packet->bitfields2.process_quantum = 1;
+ packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields14.gds_size = qpd->gds_size;
+ packet->bitfields14.num_gws = qpd->num_gws;
+ packet->bitfields14.num_oac = qpd->num_oac;
+ packet->bitfields14.sdma_enable = 1;
+ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+
+ packet->sh_mem_config = qpd->sh_mem_config;
+ packet->sh_mem_bases = qpd->sh_mem_bases;
+ packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
+ packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8);
+ packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
+ packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
+
+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+ packet->vm_context_page_table_base_addr_lo32 =
+ lower_32_bits(vm_page_table_base_addr);
+ packet->vm_context_page_table_base_addr_hi32 =
+ upper_32_bits(vm_page_table_base_addr);
+
+ return 0;
+}
+
+static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t ib, size_t ib_size_in_dwords, bool chain)
+{
+ struct pm4_mes_runlist *packet;
+
+ int concurrent_proc_cnt = 0;
+ struct kfd_dev *kfd = pm->dqm->dev;
+
+ /* Determine the number of processes to map together to HW:
+ * it can not exceed the number of VMIDs available to the
+ * scheduler, and it is determined by the smaller of the number
+ * of processes in the runlist and kfd module parameter
+ * hws_max_conc_proc.
+ * Note: the arbitration between the number of VMIDs and
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
+ packet = (struct pm4_mes_runlist *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+ sizeof(struct pm4_mes_runlist));
+
+ packet->bitfields4.ib_size = ib_size_in_dwords;
+ packet->bitfields4.chain = chain ? 1 : 0;
+ packet->bitfields4.offload_polling = 0;
+ packet->bitfields4.valid = 1;
+ packet->bitfields4.process_cnt = concurrent_proc_cnt;
+ packet->ordinal2 = lower_32_bits(ib);
+ packet->ib_base_hi = upper_32_bits(ib);
+
+ return 0;
+}
+
+static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
+ struct queue *q, bool is_static)
+{
+ struct pm4_mes_map_queues *packet;
+ bool use_static = is_static;
+
+ packet = (struct pm4_mes_map_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+ sizeof(struct pm4_mes_map_queues));
+ packet->bitfields2.alloc_format =
+ alloc_format__mes_map_queues__one_per_pipe_vi;
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_map_queues__compute_vi;
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_compute_vi;
+
+ switch (q->properties.type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ if (use_static)
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_latency_static_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__debug_interface_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+ engine_sel__mes_map_queues__sdma0_vi;
+ use_static = false; /* no static queues under SDMA */
+ break;
+ default:
+ WARN(1, "queue type %d", q->properties.type);
+ return -EINVAL;
+ }
+ packet->bitfields3.doorbell_offset =
+ q->properties.doorbell_off;
+
+ packet->mqd_addr_lo =
+ lower_32_bits(q->gart_mqd_addr);
+
+ packet->mqd_addr_hi =
+ upper_32_bits(q->gart_mqd_addr);
+
+ packet->wptr_addr_lo =
+ lower_32_bits((uint64_t)q->properties.write_ptr);
+
+ packet->wptr_addr_hi =
+ upper_32_bits((uint64_t)q->properties.write_ptr);
+
+ return 0;
+}
+
+static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_queue_type type,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine)
+{
+ struct pm4_mes_unmap_queues *packet;
+
+ packet = (struct pm4_mes_unmap_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+ sizeof(struct pm4_mes_unmap_queues));
+ switch (type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__compute;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+ break;
+ default:
+ WARN(1, "queue type %d", type);
+ return -EINVAL;
+ }
+
+ if (reset)
+ packet->bitfields2.action =
+ action__mes_unmap_queues__reset_queues;
+ else
+ packet->bitfields2.action =
+ action__mes_unmap_queues__preempt_queues;
+
+ switch (filter) {
+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields3b.doorbell_offset0 = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+ packet->bitfields3a.pasid = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_queues;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+ /* in this case, we do not preempt static queues */
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+ break;
+ default:
+ WARN(1, "filter %d", filter);
+ return -EINVAL;
+ }
+
+ return 0;
+
+}
+
+static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value)
+{
+ struct pm4_mes_query_status *packet;
+
+ packet = (struct pm4_mes_query_status *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+
+
+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+ sizeof(struct pm4_mes_query_status));
+
+ packet->bitfields2.context_id = 0;
+ packet->bitfields2.interrupt_sel =
+ interrupt_sel__mes_query_status__completion_status;
+ packet->bitfields2.command =
+ command__mes_query_status__fence_only_after_write_ack;
+
+ packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+ packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+ packet->data_hi = upper_32_bits((uint64_t)fence_value);
+ packet->data_lo = lower_32_bits((uint64_t)fence_value);
+
+ return 0;
+}
+
+
+static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
+{
+ struct pm4_mec_release_mem *packet;
+
+ packet = (struct pm4_mec_release_mem *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+
+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+ sizeof(struct pm4_mec_release_mem));
+
+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
+ packet->bitfields2.tcl1_action_ena = 1;
+ packet->bitfields2.tc_action_ena = 1;
+ packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
+
+ packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
+ packet->bitfields3.int_sel =
+ int_sel__mec_release_mem__send_interrupt_after_write_confirm;
+
+ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+ packet->address_hi = upper_32_bits(gpu_addr);
+
+ packet->data_lo = 0;
+
+ return 0;
+}
+
+const struct packet_manager_funcs kfd_v9_pm_funcs = {
+ .map_process = pm_map_process_v9,
+ .runlist = pm_runlist_v9,
+ .set_resources = pm_set_resources_vi,
+ .map_queues = pm_map_queues_v9,
+ .unmap_queues = pm_unmap_queues_v9,
+ .query_status = pm_query_status_v9,
+ .release_mem = pm_release_mem_v9,
+ .map_process_size = sizeof(struct pm4_mes_map_process),
+ .runlist_size = sizeof(struct pm4_mes_runlist),
+ .set_resources_size = sizeof(struct pm4_mes_set_resources),
+ .map_queues_size = sizeof(struct pm4_mes_map_queues),
+ .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
+ .query_status_size = sizeof(struct pm4_mes_query_status),
+ .release_mem_size = sizeof(struct pm4_mec_release_mem)
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index f1d48281e322..bf20c6d32ef3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -22,15 +22,20 @@
*/
#include "kfd_kernel_queue.h"
+#include "kfd_device_queue_manager.h"
+#include "kfd_pm4_headers_vi.h"
+#include "kfd_pm4_opcodes.h"
static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
enum kfd_queue_type type, unsigned int queue_size);
static void uninitialize_vi(struct kernel_queue *kq);
+static void submit_packet_vi(struct kernel_queue *kq);
void kernel_queue_init_vi(struct kernel_queue_ops *ops)
{
ops->initialize = initialize_vi;
ops->uninitialize = uninitialize_vi;
+ ops->submit_packet = submit_packet_vi;
}
static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
@@ -54,3 +59,317 @@ static void uninitialize_vi(struct kernel_queue *kq)
{
kfd_gtt_sa_free(kq->dev, kq->eop_mem);
}
+
+static void submit_packet_vi(struct kernel_queue *kq)
+{
+ *kq->wptr_kernel = kq->pending_wptr;
+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
+ kq->pending_wptr);
+}
+
+unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
+{
+ union PM4_MES_TYPE_3_HEADER header;
+
+ header.u32All = 0;
+ header.opcode = opcode;
+ header.count = packet_size / 4 - 2;
+ header.type = PM4_TYPE_3;
+
+ return header.u32All;
+}
+
+static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct qcm_process_device *qpd)
+{
+ struct pm4_mes_map_process *packet;
+
+ packet = (struct pm4_mes_map_process *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+ sizeof(struct pm4_mes_map_process));
+ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+ packet->bitfields2.process_quantum = 1;
+ packet->bitfields2.pasid = qpd->pqm->process->pasid;
+ packet->bitfields3.page_table_base = qpd->page_table_base;
+ packet->bitfields10.gds_size = qpd->gds_size;
+ packet->bitfields10.num_gws = qpd->num_gws;
+ packet->bitfields10.num_oac = qpd->num_oac;
+ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+
+ packet->sh_mem_config = qpd->sh_mem_config;
+ packet->sh_mem_bases = qpd->sh_mem_bases;
+ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+
+ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
+
+ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+ return 0;
+}
+
+static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t ib, size_t ib_size_in_dwords, bool chain)
+{
+ struct pm4_mes_runlist *packet;
+ int concurrent_proc_cnt = 0;
+ struct kfd_dev *kfd = pm->dqm->dev;
+
+ if (WARN_ON(!ib))
+ return -EFAULT;
+
+ /* Determine the number of processes to map together to HW:
+ * it can not exceed the number of VMIDs available to the
+ * scheduler, and it is determined by the smaller of the number
+ * of processes in the runlist and kfd module parameter
+ * hws_max_conc_proc.
+ * Note: the arbitration between the number of VMIDs and
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
+ packet = (struct pm4_mes_runlist *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+ sizeof(struct pm4_mes_runlist));
+
+ packet->bitfields4.ib_size = ib_size_in_dwords;
+ packet->bitfields4.chain = chain ? 1 : 0;
+ packet->bitfields4.offload_polling = 0;
+ packet->bitfields4.valid = 1;
+ packet->bitfields4.process_cnt = concurrent_proc_cnt;
+ packet->ordinal2 = lower_32_bits(ib);
+ packet->bitfields3.ib_base_hi = upper_32_bits(ib);
+
+ return 0;
+}
+
+int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct scheduling_resources *res)
+{
+ struct pm4_mes_set_resources *packet;
+
+ packet = (struct pm4_mes_set_resources *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
+
+ packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
+ sizeof(struct pm4_mes_set_resources));
+
+ packet->bitfields2.queue_type =
+ queue_type__mes_set_resources__hsa_interface_queue_hiq;
+ packet->bitfields2.vmid_mask = res->vmid_mask;
+ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
+ packet->bitfields7.oac_mask = res->oac_mask;
+ packet->bitfields8.gds_heap_base = res->gds_heap_base;
+ packet->bitfields8.gds_heap_size = res->gds_heap_size;
+
+ packet->gws_mask_lo = lower_32_bits(res->gws_mask);
+ packet->gws_mask_hi = upper_32_bits(res->gws_mask);
+
+ packet->queue_mask_lo = lower_32_bits(res->queue_mask);
+ packet->queue_mask_hi = upper_32_bits(res->queue_mask);
+
+ return 0;
+}
+
+static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct queue *q, bool is_static)
+{
+ struct pm4_mes_map_queues *packet;
+ bool use_static = is_static;
+
+ packet = (struct pm4_mes_map_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+ sizeof(struct pm4_mes_map_queues));
+ packet->bitfields2.alloc_format =
+ alloc_format__mes_map_queues__one_per_pipe_vi;
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_map_queues__compute_vi;
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_compute_vi;
+
+ switch (q->properties.type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ if (use_static)
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__normal_latency_static_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.queue_type =
+ queue_type__mes_map_queues__debug_interface_queue_vi;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+ engine_sel__mes_map_queues__sdma0_vi;
+ use_static = false; /* no static queues under SDMA */
+ break;
+ default:
+ WARN(1, "queue type %d", q->properties.type);
+ return -EINVAL;
+ }
+ packet->bitfields3.doorbell_offset =
+ q->properties.doorbell_off;
+
+ packet->mqd_addr_lo =
+ lower_32_bits(q->gart_mqd_addr);
+
+ packet->mqd_addr_hi =
+ upper_32_bits(q->gart_mqd_addr);
+
+ packet->wptr_addr_lo =
+ lower_32_bits((uint64_t)q->properties.write_ptr);
+
+ packet->wptr_addr_hi =
+ upper_32_bits((uint64_t)q->properties.write_ptr);
+
+ return 0;
+}
+
+static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_queue_type type,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine)
+{
+ struct pm4_mes_unmap_queues *packet;
+
+ packet = (struct pm4_mes_unmap_queues *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+
+ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+ sizeof(struct pm4_mes_unmap_queues));
+ switch (type) {
+ case KFD_QUEUE_TYPE_COMPUTE:
+ case KFD_QUEUE_TYPE_DIQ:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__compute;
+ break;
+ case KFD_QUEUE_TYPE_SDMA:
+ packet->bitfields2.engine_sel =
+ engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+ break;
+ default:
+ WARN(1, "queue type %d", type);
+ return -EINVAL;
+ }
+
+ if (reset)
+ packet->bitfields2.action =
+ action__mes_unmap_queues__reset_queues;
+ else
+ packet->bitfields2.action =
+ action__mes_unmap_queues__preempt_queues;
+
+ switch (filter) {
+ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+ packet->bitfields2.num_queues = 1;
+ packet->bitfields3b.doorbell_offset0 = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+ packet->bitfields3a.pasid = filter_param;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_queues;
+ break;
+ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+ /* in this case, we do not preempt static queues */
+ packet->bitfields2.queue_sel =
+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+ break;
+ default:
+ WARN(1, "filter %d", filter);
+ return -EINVAL;
+ }
+
+ return 0;
+
+}
+
+static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value)
+{
+ struct pm4_mes_query_status *packet;
+
+ packet = (struct pm4_mes_query_status *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+
+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+ sizeof(struct pm4_mes_query_status));
+
+ packet->bitfields2.context_id = 0;
+ packet->bitfields2.interrupt_sel =
+ interrupt_sel__mes_query_status__completion_status;
+ packet->bitfields2.command =
+ command__mes_query_status__fence_only_after_write_ack;
+
+ packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+ packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+ packet->data_hi = upper_32_bits((uint64_t)fence_value);
+ packet->data_lo = lower_32_bits((uint64_t)fence_value);
+
+ return 0;
+}
+
+static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
+{
+ struct pm4_mec_release_mem *packet;
+
+ packet = (struct pm4_mec_release_mem *)buffer;
+ memset(buffer, 0, sizeof(*packet));
+
+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+ sizeof(*packet));
+
+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+ packet->bitfields2.tcl1_action_ena = 1;
+ packet->bitfields2.tc_action_ena = 1;
+ packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+ packet->bitfields2.atc = 0;
+
+ packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
+ packet->bitfields3.int_sel =
+ int_sel___release_mem__send_interrupt_after_write_confirm;
+
+ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+ packet->address_hi = upper_32_bits(gpu_addr);
+
+ packet->data_lo = 0;
+
+ return 0;
+}
+
+const struct packet_manager_funcs kfd_vi_pm_funcs = {
+ .map_process = pm_map_process_vi,
+ .runlist = pm_runlist_vi,
+ .set_resources = pm_set_resources_vi,
+ .map_queues = pm_map_queues_vi,
+ .unmap_queues = pm_unmap_queues_vi,
+ .query_status = pm_query_status_vi,
+ .release_mem = pm_release_mem_vi,
+ .map_process_size = sizeof(struct pm4_mes_map_process),
+ .runlist_size = sizeof(struct pm4_mes_runlist),
+ .set_resources_size = sizeof(struct pm4_mes_set_resources),
+ .map_queues_size = sizeof(struct pm4_mes_map_queues),
+ .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
+ .query_status_size = sizeof(struct pm4_mes_query_status),
+ .release_mem_size = sizeof(struct pm4_mec_release_mem)
+};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index e0c07d24d251..76bf2dc8aec4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -43,6 +43,8 @@ static const struct kgd2kfd_calls kgd2kfd = {
.interrupt = kgd2kfd_interrupt,
.suspend = kgd2kfd_suspend,
.resume = kgd2kfd_resume,
+ .quiesce_mm = kgd2kfd_quiesce_mm,
+ .resume_mm = kgd2kfd_resume_mm,
.schedule_evict_and_restore_process =
kgd2kfd_schedule_evict_and_restore_process,
};
@@ -81,6 +83,11 @@ module_param(ignore_crat, int, 0444);
MODULE_PARM_DESC(ignore_crat,
"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
+int vega10_noretry;
+module_param_named(noretry, vega10_noretry, int, 0644);
+MODULE_PARM_DESC(noretry,
+ "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
+
static int amdkfd_init_completed;
int kgd2kfd_init(unsigned int interface_version,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index ee7061e1c466..4b8eb506642b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -38,6 +38,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
case CHIP_POLARIS10:
case CHIP_POLARIS11:
return mqd_manager_init_vi_tonga(type, dev);
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ return mqd_manager_init_v9(type, dev);
default:
WARN(1, "Unexpected ASIC family %u",
dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index c00c325ed3c9..06eaa218eba6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -79,10 +79,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
m->cp_mqd_base_addr_lo = lower_32_bits(addr);
m->cp_mqd_base_addr_hi = upper_32_bits(addr);
- m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN;
- /* Although WinKFD writes this, I suspect it should not be necessary */
- m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE;
-
m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
QUANTUM_DURATION(10);
@@ -412,7 +408,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
new file mode 100644
index 000000000000..684054ff02cd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+#include "v9_structs.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
+#include "sdma0/sdma0_4_0_sh_mask.h"
+
+static inline struct v9_mqd *get_mqd(void *mqd)
+{
+ return (struct v9_mqd *)mqd;
+}
+
+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+ return (struct v9_sdma_mqd *)mqd;
+}
+
+static int init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ int retval;
+ uint64_t addr;
+ struct v9_mqd *m;
+ struct kfd_dev *kfd = mm->dev;
+
+ /* From V9, for CWSR, the control stack is located on the next page
+ * boundary after the mqd, we will use the gtt allocation function
+ * instead of sub-allocation function.
+ */
+ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
+ *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+ if (!*mqd_mem_obj)
+ return -ENOMEM;
+ retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd,
+ ALIGN(q->ctl_stack_size, PAGE_SIZE) +
+ ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
+ &((*mqd_mem_obj)->gtt_mem),
+ &((*mqd_mem_obj)->gpu_addr),
+ (void *)&((*mqd_mem_obj)->cpu_ptr));
+ } else
+ retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
+ mqd_mem_obj);
+ if (retval != 0)
+ return -ENOMEM;
+
+ m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
+ addr = (*mqd_mem_obj)->gpu_addr;
+
+ memset(m, 0, sizeof(struct v9_mqd));
+
+ m->header = 0xC0310800;
+ m->compute_pipelinestat_enable = 1;
+ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+
+ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
+ 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
+
+ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
+
+ m->cp_mqd_base_addr_lo = lower_32_bits(addr);
+ m->cp_mqd_base_addr_hi = upper_32_bits(addr);
+
+ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
+ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+
+ m->cp_hqd_pipe_priority = 1;
+ m->cp_hqd_queue_priority = 15;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->cp_hqd_aql_control =
+ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
+ }
+
+ if (q->tba_addr) {
+ m->compute_pgm_rsrc2 |=
+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+ }
+
+ if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+ lower_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_base_addr_hi =
+ upper_32_bits(q->ctx_save_restore_area_address);
+ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+ m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+ m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+ }
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+ retval = mm->update_mqd(mm, m, q);
+
+ return retval;
+}
+
+static int load_mqd(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+
+ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr,
+ wptr_shift, 0, mms);
+}
+
+static int update_mqd(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v9_mqd *m;
+
+ m = get_mqd(mqd);
+
+ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
+ m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
+ pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+ m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+
+ m->cp_hqd_pq_doorbell_control =
+ q->doorbell_off <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+ pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+ m->cp_hqd_pq_doorbell_control);
+
+ m->cp_hqd_ib_control =
+ 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
+ 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;
+
+ /*
+ * HW does not clamp this field correctly. Maximum EOP queue size
+ * is constrained by per-SE EOP done signal count, which is 8-bit.
+ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
+ * more than (EOP entry count - 1) so a queue size of 0x800 dwords
+ * is safe, giving a maximum field value of 0xA.
+ */
+ m->cp_hqd_eop_control = min(0xA,
+ order_base_2(q->eop_ring_buffer_size / 4) - 1);
+ m->cp_hqd_eop_base_addr_lo =
+ lower_32_bits(q->eop_ring_buffer_address >> 8);
+ m->cp_hqd_eop_base_addr_hi =
+ upper_32_bits(q->eop_ring_buffer_address >> 8);
+
+ m->cp_hqd_iq_timer = 0;
+
+ m->cp_hqd_vmid = q->vmid;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL) {
+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
+ m->cp_hqd_pq_doorbell_control |= 1 <<
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+ }
+ if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
+ m->cp_hqd_ctx_save_control = 0;
+
+ q->is_active = (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+ q->queue_percent > 0 &&
+ !q->is_evicted);
+
+ return 0;
+}
+
+
+static int destroy_mqd(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type,
+ unsigned int timeout, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_destroy
+ (mm->dev->kgd, mqd, type, timeout,
+ pipe_id, queue_id);
+}
+
+static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+ struct kfd_mem_obj *mqd_mem_obj)
+{
+ struct kfd_dev *kfd = mm->dev;
+
+ if (mqd_mem_obj->gtt_mem) {
+ kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
+ kfree(mqd_mem_obj);
+ } else {
+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+ }
+}
+
+static bool is_occupied(struct mqd_manager *mm, void *mqd,
+ uint64_t queue_address, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_is_occupied(
+ mm->dev->kgd, queue_address,
+ pipe_id, queue_id);
+}
+
+static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ struct v9_mqd *m;
+ int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+
+ if (retval != 0)
+ return retval;
+
+ m = get_mqd(*mqd);
+
+ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
+
+ return retval;
+}
+
+static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v9_mqd *m;
+ int retval = update_mqd(mm, mqd, q);
+
+ if (retval != 0)
+ return retval;
+
+ /* TODO: what's the point? update_mqd already does this. */
+ m = get_mqd(mqd);
+ m->cp_hqd_vmid = q->vmid;
+ return retval;
+}
+
+static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+{
+ int retval;
+ struct v9_sdma_mqd *m;
+
+
+ retval = kfd_gtt_sa_allocate(mm->dev,
+ sizeof(struct v9_sdma_mqd),
+ mqd_mem_obj);
+
+ if (retval != 0)
+ return -ENOMEM;
+
+ m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+
+ memset(m, 0, sizeof(struct v9_sdma_mqd));
+
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = (*mqd_mem_obj)->gpu_addr;
+
+ retval = mm->update_mqd(mm, m, q);
+
+ return retval;
+}
+
+static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ struct kfd_mem_obj *mqd_mem_obj)
+{
+ kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+}
+
+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+ (uint32_t __user *)p->write_ptr,
+ mms);
+}
+
+#define SDMA_RLC_DUMMY_DEFAULT 0xf
+
+static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+{
+ struct v9_sdma_mqd *m;
+
+ m = get_sdma_mqd(mqd);
+ m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
+ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+
+ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+ m->sdmax_rlcx_doorbell_offset =
+ q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
+
+ m->sdma_engine_id = q->sdma_engine_id;
+ m->sdma_queue_id = q->sdma_queue_id;
+ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+
+ q->is_active = (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+ q->queue_percent > 0 &&
+ !q->is_evicted);
+
+ return 0;
+}
+
+/*
+ * * preempt type here is ignored because there is only one way
+ * * to preempt sdma queue
+ */
+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type,
+ unsigned int timeout, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+}
+
+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+ uint64_t queue_address, uint32_t pipe_id,
+ uint32_t queue_id)
+{
+ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v9_mqd), false);
+ return 0;
+}
+
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ data, sizeof(struct v9_sdma_mqd), false);
+ return 0;
+}
+
+#endif
+
+struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev)
+{
+ struct mqd_manager *mqd;
+
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+ mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
+ if (!mqd)
+ return NULL;
+
+ mqd->dev = dev;
+
+ switch (type) {
+ case KFD_MQD_TYPE_CP:
+ case KFD_MQD_TYPE_COMPUTE:
+ mqd->init_mqd = init_mqd;
+ mqd->uninit_mqd = uninit_mqd;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_HIQ:
+ mqd->init_mqd = init_mqd_hiq;
+ mqd->uninit_mqd = uninit_mqd;
+ mqd->load_mqd = load_mqd;
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+ break;
+ case KFD_MQD_TYPE_SDMA:
+ mqd->init_mqd = init_mqd_sdma;
+ mqd->uninit_mqd = uninit_mqd_sdma;
+ mqd->load_mqd = load_mqd_sdma;
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+#if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
+ break;
+ default:
+ kfree(mqd);
+ return NULL;
+ }
+
+ return mqd;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 89e4242e43e7..481307b8b4db 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -394,7 +394,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
if (!mqd)
return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 89ba4c670ec5..c317feb43f69 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -26,8 +26,6 @@
#include "kfd_device_queue_manager.h"
#include "kfd_kernel_queue.h"
#include "kfd_priv.h"
-#include "kfd_pm4_headers_vi.h"
-#include "kfd_pm4_opcodes.h"
static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
unsigned int buffer_size_bytes)
@@ -39,18 +37,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
*wptr = temp;
}
-static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
-{
- union PM4_MES_TYPE_3_HEADER header;
-
- header.u32All = 0;
- header.opcode = opcode;
- header.count = packet_size / 4 - 2;
- header.type = PM4_TYPE_3;
-
- return header.u32All;
-}
-
static void pm_calc_rlib_size(struct packet_manager *pm,
unsigned int *rlib_size,
bool *over_subscription)
@@ -80,9 +66,9 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
pr_debug("Over subscribed runlist\n");
}
- map_queue_size = sizeof(struct pm4_mes_map_queues);
+ map_queue_size = pm->pmf->map_queues_size;
/* calculate run list ib allocation size */
- *rlib_size = process_count * sizeof(struct pm4_mes_map_process) +
+ *rlib_size = process_count * pm->pmf->map_process_size +
queue_count * map_queue_size;
/*
@@ -90,7 +76,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
* when over subscription
*/
if (*over_subscription)
- *rlib_size += sizeof(struct pm4_mes_runlist);
+ *rlib_size += pm->pmf->runlist_size;
pr_debug("runlist ib size %d\n", *rlib_size);
}
@@ -108,12 +94,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
+ mutex_lock(&pm->lock);
+
retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
&pm->ib_buffer_obj);
if (retval) {
pr_err("Failed to allocate runlist IB\n");
- return retval;
+ goto out;
}
*(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
@@ -121,138 +109,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
memset(*rl_buffer, 0, *rl_buffer_size);
pm->allocated = true;
- return retval;
-}
-
-static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
- uint64_t ib, size_t ib_size_in_dwords, bool chain)
-{
- struct pm4_mes_runlist *packet;
- int concurrent_proc_cnt = 0;
- struct kfd_dev *kfd = pm->dqm->dev;
-
- if (WARN_ON(!ib))
- return -EFAULT;
-
- /* Determine the number of processes to map together to HW:
- * it can not exceed the number of VMIDs available to the
- * scheduler, and it is determined by the smaller of the number
- * of processes in the runlist and kfd module parameter
- * hws_max_conc_proc.
- * Note: the arbitration between the number of VMIDs and
- * hws_max_conc_proc has been done in
- * kgd2kfd_device_init().
- */
- concurrent_proc_cnt = min(pm->dqm->processes_count,
- kfd->max_proc_per_quantum);
-
- packet = (struct pm4_mes_runlist *)buffer;
-
- memset(buffer, 0, sizeof(struct pm4_mes_runlist));
- packet->header.u32All = build_pm4_header(IT_RUN_LIST,
- sizeof(struct pm4_mes_runlist));
-
- packet->bitfields4.ib_size = ib_size_in_dwords;
- packet->bitfields4.chain = chain ? 1 : 0;
- packet->bitfields4.offload_polling = 0;
- packet->bitfields4.valid = 1;
- packet->bitfields4.process_cnt = concurrent_proc_cnt;
- packet->ordinal2 = lower_32_bits(ib);
- packet->bitfields3.ib_base_hi = upper_32_bits(ib);
-
- return 0;
-}
-
-static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
- struct qcm_process_device *qpd)
-{
- struct pm4_mes_map_process *packet;
-
- packet = (struct pm4_mes_map_process *)buffer;
- memset(buffer, 0, sizeof(struct pm4_mes_map_process));
-
- packet->header.u32All = build_pm4_header(IT_MAP_PROCESS,
- sizeof(struct pm4_mes_map_process));
- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
- packet->bitfields2.process_quantum = 1;
- packet->bitfields2.pasid = qpd->pqm->process->pasid;
- packet->bitfields3.page_table_base = qpd->page_table_base;
- packet->bitfields10.gds_size = qpd->gds_size;
- packet->bitfields10.num_gws = qpd->num_gws;
- packet->bitfields10.num_oac = qpd->num_oac;
- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
-
- packet->sh_mem_config = qpd->sh_mem_config;
- packet->sh_mem_bases = qpd->sh_mem_bases;
- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
-
- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
-
- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
-
- return 0;
-}
-
-static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
- struct queue *q, bool is_static)
-{
- struct pm4_mes_map_queues *packet;
- bool use_static = is_static;
-
- packet = (struct pm4_mes_map_queues *)buffer;
- memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
-
- packet->header.u32All = build_pm4_header(IT_MAP_QUEUES,
- sizeof(struct pm4_mes_map_queues));
- packet->bitfields2.alloc_format =
- alloc_format__mes_map_queues__one_per_pipe_vi;
- packet->bitfields2.num_queues = 1;
- packet->bitfields2.queue_sel =
- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
-
- packet->bitfields2.engine_sel =
- engine_sel__mes_map_queues__compute_vi;
- packet->bitfields2.queue_type =
- queue_type__mes_map_queues__normal_compute_vi;
-
- switch (q->properties.type) {
- case KFD_QUEUE_TYPE_COMPUTE:
- if (use_static)
- packet->bitfields2.queue_type =
- queue_type__mes_map_queues__normal_latency_static_queue_vi;
- break;
- case KFD_QUEUE_TYPE_DIQ:
- packet->bitfields2.queue_type =
- queue_type__mes_map_queues__debug_interface_queue_vi;
- break;
- case KFD_QUEUE_TYPE_SDMA:
- packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
- engine_sel__mes_map_queues__sdma0_vi;
- use_static = false; /* no static queues under SDMA */
- break;
- default:
- WARN(1, "queue type %d", q->properties.type);
- return -EINVAL;
- }
- packet->bitfields3.doorbell_offset =
- q->properties.doorbell_off;
-
- packet->mqd_addr_lo =
- lower_32_bits(q->gart_mqd_addr);
-
- packet->mqd_addr_hi =
- upper_32_bits(q->gart_mqd_addr);
-
- packet->wptr_addr_lo =
- lower_32_bits((uint64_t)q->properties.write_ptr);
-
- packet->wptr_addr_hi =
- upper_32_bits((uint64_t)q->properties.write_ptr);
-
- return 0;
+out:
+ mutex_unlock(&pm->lock);
+ return retval;
}
static int pm_create_runlist_ib(struct packet_manager *pm,
@@ -292,12 +152,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return -ENOMEM;
}
- retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
+ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
if (retval)
return retval;
proccesses_mapped++;
- inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process),
+ inc_wptr(&rl_wptr, pm->pmf->map_process_size,
alloc_size_bytes);
list_for_each_entry(kq, &qpd->priv_queue_list, list) {
@@ -307,7 +167,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("static_queue, mapping kernel q %d, is debug status %d\n",
kq->queue->queue, qpd->is_debug);
- retval = pm_create_map_queue(pm,
+ retval = pm->pmf->map_queues(pm,
&rl_buffer[rl_wptr],
kq->queue,
qpd->is_debug);
@@ -315,7 +175,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return retval;
inc_wptr(&rl_wptr,
- sizeof(struct pm4_mes_map_queues),
+ pm->pmf->map_queues_size,
alloc_size_bytes);
}
@@ -326,7 +186,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("static_queue, mapping user queue %d, is debug status %d\n",
q->queue, qpd->is_debug);
- retval = pm_create_map_queue(pm,
+ retval = pm->pmf->map_queues(pm,
&rl_buffer[rl_wptr],
q,
qpd->is_debug);
@@ -335,7 +195,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return retval;
inc_wptr(&rl_wptr,
- sizeof(struct pm4_mes_map_queues),
+ pm->pmf->map_queues_size,
alloc_size_bytes);
}
}
@@ -343,7 +203,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
pr_debug("Finished map process and queues to runlist\n");
if (is_over_subscription)
- retval = pm_create_runlist(pm, &rl_buffer[rl_wptr],
+ retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
*rl_gpu_addr,
alloc_size_bytes / sizeof(uint32_t),
true);
@@ -355,45 +215,29 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
return retval;
}
-/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size
- * of this packet
- * @gpu_addr - GPU address of the packet. It's a virtual address.
- * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer
- * Return - length of the packet
- */
-uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
-{
- struct pm4_mec_release_mem *packet;
-
- WARN_ON(!buffer);
-
- packet = (struct pm4_mec_release_mem *)buffer;
- memset(buffer, 0, sizeof(*packet));
-
- packet->header.u32All = build_pm4_header(IT_RELEASE_MEM,
- sizeof(*packet));
-
- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
- packet->bitfields2.tcl1_action_ena = 1;
- packet->bitfields2.tc_action_ena = 1;
- packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
- packet->bitfields2.atc = 0;
-
- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
- packet->bitfields3.int_sel =
- int_sel___release_mem__send_interrupt_after_write_confirm;
-
- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
- packet->address_hi = upper_32_bits(gpu_addr);
-
- packet->data_lo = 0;
-
- return sizeof(*packet) / sizeof(unsigned int);
-}
-
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
{
+ switch (dqm->dev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ case CHIP_HAWAII:
+ /* PM4 packet structures on CIK are the same as on VI */
+ case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
+ pm->pmf = &kfd_vi_pm_funcs;
+ break;
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ pm->pmf = &kfd_v9_pm_funcs;
+ break;
+ default:
+ WARN(1, "Unexpected ASIC family %u",
+ dqm->dev->device_info->asic_family);
+ return -EINVAL;
+ }
+
pm->dqm = dqm;
mutex_init(&pm->lock);
pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
@@ -415,38 +259,25 @@ void pm_uninit(struct packet_manager *pm)
int pm_send_set_resources(struct packet_manager *pm,
struct scheduling_resources *res)
{
- struct pm4_mes_set_resources *packet;
+ uint32_t *buffer, size;
int retval = 0;
+ size = pm->pmf->set_resources_size;
mutex_lock(&pm->lock);
pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
- sizeof(*packet) / sizeof(uint32_t),
- (unsigned int **)&packet);
- if (!packet) {
+ size / sizeof(uint32_t),
+ (unsigned int **)&buffer);
+ if (!buffer) {
pr_err("Failed to allocate buffer on kernel queue\n");
retval = -ENOMEM;
goto out;
}
- memset(packet, 0, sizeof(struct pm4_mes_set_resources));
- packet->header.u32All = build_pm4_header(IT_SET_RESOURCES,
- sizeof(struct pm4_mes_set_resources));
-
- packet->bitfields2.queue_type =
- queue_type__mes_set_resources__hsa_interface_queue_hiq;
- packet->bitfields2.vmid_mask = res->vmid_mask;
- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
- packet->bitfields7.oac_mask = res->oac_mask;
- packet->bitfields8.gds_heap_base = res->gds_heap_base;
- packet->bitfields8.gds_heap_size = res->gds_heap_size;
-
- packet->gws_mask_lo = lower_32_bits(res->gws_mask);
- packet->gws_mask_hi = upper_32_bits(res->gws_mask);
-
- packet->queue_mask_lo = lower_32_bits(res->queue_mask);
- packet->queue_mask_hi = upper_32_bits(res->queue_mask);
-
- pm->priv_queue->ops.submit_packet(pm->priv_queue);
+ retval = pm->pmf->set_resources(pm, buffer, res);
+ if (!retval)
+ pm->priv_queue->ops.submit_packet(pm->priv_queue);
+ else
+ pm->priv_queue->ops.rollback_packet(pm->priv_queue);
out:
mutex_unlock(&pm->lock);
@@ -468,7 +299,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
- packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t);
+ packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
mutex_lock(&pm->lock);
retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
@@ -476,7 +307,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
if (retval)
goto fail_acquire_packet_buffer;
- retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
+ retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
rl_ib_size / sizeof(uint32_t), false);
if (retval)
goto fail_create_runlist;
@@ -499,37 +330,29 @@ fail_create_runlist_ib:
int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
uint32_t fence_value)
{
- int retval;
- struct pm4_mes_query_status *packet;
+ uint32_t *buffer, size;
+ int retval = 0;
if (WARN_ON(!fence_address))
return -EFAULT;
+ size = pm->pmf->query_status_size;
mutex_lock(&pm->lock);
- retval = pm->priv_queue->ops.acquire_packet_buffer(
- pm->priv_queue,
- sizeof(struct pm4_mes_query_status) / sizeof(uint32_t),
- (unsigned int **)&packet);
- if (retval)
- goto fail_acquire_packet_buffer;
-
- packet->header.u32All = build_pm4_header(IT_QUERY_STATUS,
- sizeof(struct pm4_mes_query_status));
-
- packet->bitfields2.context_id = 0;
- packet->bitfields2.interrupt_sel =
- interrupt_sel__mes_query_status__completion_status;
- packet->bitfields2.command =
- command__mes_query_status__fence_only_after_write_ack;
-
- packet->addr_hi = upper_32_bits((uint64_t)fence_address);
- packet->addr_lo = lower_32_bits((uint64_t)fence_address);
- packet->data_hi = upper_32_bits((uint64_t)fence_value);
- packet->data_lo = lower_32_bits((uint64_t)fence_value);
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+ size / sizeof(uint32_t), (unsigned int **)&buffer);
+ if (!buffer) {
+ pr_err("Failed to allocate buffer on kernel queue\n");
+ retval = -ENOMEM;
+ goto out;
+ }
- pm->priv_queue->ops.submit_packet(pm->priv_queue);
+ retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
+ if (!retval)
+ pm->priv_queue->ops.submit_packet(pm->priv_queue);
+ else
+ pm->priv_queue->ops.rollback_packet(pm->priv_queue);
-fail_acquire_packet_buffer:
+out:
mutex_unlock(&pm->lock);
return retval;
}
@@ -539,82 +362,27 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
uint32_t filter_param, bool reset,
unsigned int sdma_engine)
{
- int retval;
- uint32_t *buffer;
- struct pm4_mes_unmap_queues *packet;
+ uint32_t *buffer, size;
+ int retval = 0;
+ size = pm->pmf->unmap_queues_size;
mutex_lock(&pm->lock);
- retval = pm->priv_queue->ops.acquire_packet_buffer(
- pm->priv_queue,
- sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t),
- &buffer);
- if (retval)
- goto err_acquire_packet_buffer;
-
- packet = (struct pm4_mes_unmap_queues *)buffer;
- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
- pr_debug("static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n",
- filter, reset, type);
- packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES,
- sizeof(struct pm4_mes_unmap_queues));
- switch (type) {
- case KFD_QUEUE_TYPE_COMPUTE:
- case KFD_QUEUE_TYPE_DIQ:
- packet->bitfields2.engine_sel =
- engine_sel__mes_unmap_queues__compute;
- break;
- case KFD_QUEUE_TYPE_SDMA:
- packet->bitfields2.engine_sel =
- engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
- break;
- default:
- WARN(1, "queue type %d", type);
- retval = -EINVAL;
- goto err_invalid;
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+ size / sizeof(uint32_t), (unsigned int **)&buffer);
+ if (!buffer) {
+ pr_err("Failed to allocate buffer on kernel queue\n");
+ retval = -ENOMEM;
+ goto out;
}
- if (reset)
- packet->bitfields2.action =
- action__mes_unmap_queues__reset_queues;
+ retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
+ reset, sdma_engine);
+ if (!retval)
+ pm->priv_queue->ops.submit_packet(pm->priv_queue);
else
- packet->bitfields2.action =
- action__mes_unmap_queues__preempt_queues;
-
- switch (filter) {
- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
- packet->bitfields2.queue_sel =
- queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
- packet->bitfields2.num_queues = 1;
- packet->bitfields3b.doorbell_offset0 = filter_param;
- break;
- case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
- packet->bitfields2.queue_sel =
- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
- packet->bitfields3a.pasid = filter_param;
- break;
- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
- packet->bitfields2.queue_sel =
- queue_sel__mes_unmap_queues__unmap_all_queues;
- break;
- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
- /* in this case, we do not preempt static queues */
- packet->bitfields2.queue_sel =
- queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
- break;
- default:
- WARN(1, "filter %d", filter);
- retval = -EINVAL;
- goto err_invalid;
- }
+ pm->priv_queue->ops.rollback_packet(pm->priv_queue);
- pm->priv_queue->ops.submit_packet(pm->priv_queue);
-
- mutex_unlock(&pm->lock);
- return 0;
-
-err_invalid:
- pm->priv_queue->ops.rollback_packet(pm->priv_queue);
-err_acquire_packet_buffer:
+out:
mutex_unlock(&pm->lock);
return retval;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
new file mode 100644
index 000000000000..f2bcf5c092ea
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -0,0 +1,583 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef F32_MES_PM4_PACKETS_H
+#define F32_MES_PM4_PACKETS_H
+
+#ifndef PM4_MES_HEADER_DEFINED
+#define PM4_MES_HEADER_DEFINED
+union PM4_MES_TYPE_3_HEADER {
+ struct {
+ uint32_t reserved1 : 8; /* < reserved */
+ uint32_t opcode : 8; /* < IT opcode */
+ uint32_t count : 14;/* < number of DWORDs - 1 in the
+ * information body.
+ */
+ uint32_t type : 2; /* < packet identifier.
+ * It should be 3 for type 3 packets
+ */
+ };
+ uint32_t u32All;
+};
+#endif /* PM4_MES_HEADER_DEFINED */
+
+/*--------------------MES_SET_RESOURCES--------------------*/
+
+#ifndef PM4_MES_SET_RESOURCES_DEFINED
+#define PM4_MES_SET_RESOURCES_DEFINED
+enum mes_set_resources_queue_type_enum {
+ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0,
+ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1,
+ queue_type__mes_set_resources__hsa_debug_interface_queue = 4
+};
+
+
+struct pm4_mes_set_resources {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t vmid_mask:16;
+ uint32_t unmap_latency:8;
+ uint32_t reserved1:5;
+ enum mes_set_resources_queue_type_enum queue_type:3;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ uint32_t queue_mask_lo;
+ uint32_t queue_mask_hi;
+ uint32_t gws_mask_lo;
+ uint32_t gws_mask_hi;
+
+ union {
+ struct {
+ uint32_t oac_mask:16;
+ uint32_t reserved2:16;
+ } bitfields7;
+ uint32_t ordinal7;
+ };
+
+ union {
+ struct {
+ uint32_t gds_heap_base:6;
+ uint32_t reserved3:5;
+ uint32_t gds_heap_size:6;
+ uint32_t reserved4:15;
+ } bitfields8;
+ uint32_t ordinal8;
+ };
+
+};
+#endif
+
+/*--------------------MES_RUN_LIST--------------------*/
+
+#ifndef PM4_MES_RUN_LIST_DEFINED
+#define PM4_MES_RUN_LIST_DEFINED
+
+struct pm4_mes_runlist {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t reserved1:2;
+ uint32_t ib_base_lo:30;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ uint32_t ib_base_hi;
+
+ union {
+ struct {
+ uint32_t ib_size:20;
+ uint32_t chain:1;
+ uint32_t offload_polling:1;
+ uint32_t reserved2:1;
+ uint32_t valid:1;
+ uint32_t process_cnt:4;
+ uint32_t reserved3:4;
+ } bitfields4;
+ uint32_t ordinal4;
+ };
+
+};
+#endif
+
+/*--------------------MES_MAP_PROCESS--------------------*/
+
+#ifndef PM4_MES_MAP_PROCESS_DEFINED
+#define PM4_MES_MAP_PROCESS_DEFINED
+
+struct pm4_mes_map_process {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t pasid:16;
+ uint32_t reserved1:8;
+ uint32_t diq_enable:1;
+ uint32_t process_quantum:7;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ uint32_t vm_context_page_table_base_addr_lo32;
+
+ uint32_t vm_context_page_table_base_addr_hi32;
+
+ uint32_t sh_mem_bases;
+
+ uint32_t sh_mem_config;
+
+ uint32_t sq_shader_tba_lo;
+
+ uint32_t sq_shader_tba_hi;
+
+ uint32_t sq_shader_tma_lo;
+
+ uint32_t sq_shader_tma_hi;
+
+ uint32_t reserved6;
+
+ uint32_t gds_addr_lo;
+
+ uint32_t gds_addr_hi;
+
+ union {
+ struct {
+ uint32_t num_gws:6;
+ uint32_t reserved7:1;
+ uint32_t sdma_enable:1;
+ uint32_t num_oac:4;
+ uint32_t reserved8:4;
+ uint32_t gds_size:6;
+ uint32_t num_queues:10;
+ } bitfields14;
+ uint32_t ordinal14;
+ };
+
+ uint32_t completion_signal_lo;
+
+ uint32_t completion_signal_hi;
+
+};
+
+#endif
+
+/*--------------------MES_MAP_PROCESS_VM--------------------*/
+
+#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED
+#define PM4_MES_MAP_PROCESS_VM_DEFINED
+
+struct PM4_MES_MAP_PROCESS_VM {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ uint32_t reserved1;
+
+ uint32_t vm_context_cntl;
+
+ uint32_t reserved2;
+
+ uint32_t vm_context_page_table_end_addr_lo32;
+
+ uint32_t vm_context_page_table_end_addr_hi32;
+
+ uint32_t vm_context_page_table_start_addr_lo32;
+
+ uint32_t vm_context_page_table_start_addr_hi32;
+
+ uint32_t reserved3;
+
+ uint32_t reserved4;
+
+ uint32_t reserved5;
+
+ uint32_t reserved6;
+
+ uint32_t reserved7;
+
+ uint32_t reserved8;
+
+ uint32_t completion_signal_lo32;
+
+ uint32_t completion_signal_hi32;
+
+};
+#endif
+
+/*--------------------MES_MAP_QUEUES--------------------*/
+
+#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED
+#define PM4_MES_MAP_QUEUES_VI_DEFINED
+enum mes_map_queues_queue_sel_enum {
+ queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0,
+queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1
+};
+
+enum mes_map_queues_queue_type_enum {
+ queue_type__mes_map_queues__normal_compute_vi = 0,
+ queue_type__mes_map_queues__debug_interface_queue_vi = 1,
+ queue_type__mes_map_queues__normal_latency_static_queue_vi = 2,
+queue_type__mes_map_queues__low_latency_static_queue_vi = 3
+};
+
+enum mes_map_queues_alloc_format_enum {
+ alloc_format__mes_map_queues__one_per_pipe_vi = 0,
+alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
+};
+
+enum mes_map_queues_engine_sel_enum {
+ engine_sel__mes_map_queues__compute_vi = 0,
+ engine_sel__mes_map_queues__sdma0_vi = 2,
+ engine_sel__mes_map_queues__sdma1_vi = 3
+};
+
+
+struct pm4_mes_map_queues {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t reserved1:4;
+ enum mes_map_queues_queue_sel_enum queue_sel:2;
+ uint32_t reserved2:15;
+ enum mes_map_queues_queue_type_enum queue_type:3;
+ enum mes_map_queues_alloc_format_enum alloc_format:2;
+ enum mes_map_queues_engine_sel_enum engine_sel:3;
+ uint32_t num_queues:3;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t reserved3:1;
+ uint32_t check_disable:1;
+ uint32_t doorbell_offset:26;
+ uint32_t reserved4:4;
+ } bitfields3;
+ uint32_t ordinal3;
+ };
+
+ uint32_t mqd_addr_lo;
+ uint32_t mqd_addr_hi;
+ uint32_t wptr_addr_lo;
+ uint32_t wptr_addr_hi;
+};
+#endif
+
+/*--------------------MES_QUERY_STATUS--------------------*/
+
+#ifndef PM4_MES_QUERY_STATUS_DEFINED
+#define PM4_MES_QUERY_STATUS_DEFINED
+enum mes_query_status_interrupt_sel_enum {
+ interrupt_sel__mes_query_status__completion_status = 0,
+ interrupt_sel__mes_query_status__process_status = 1,
+ interrupt_sel__mes_query_status__queue_status = 2
+};
+
+enum mes_query_status_command_enum {
+ command__mes_query_status__interrupt_only = 0,
+ command__mes_query_status__fence_only_immediate = 1,
+ command__mes_query_status__fence_only_after_write_ack = 2,
+ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3
+};
+
+enum mes_query_status_engine_sel_enum {
+ engine_sel__mes_query_status__compute = 0,
+ engine_sel__mes_query_status__sdma0_queue = 2,
+ engine_sel__mes_query_status__sdma1_queue = 3
+};
+
+struct pm4_mes_query_status {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ uint32_t context_id:28;
+ enum mes_query_status_interrupt_sel_enum interrupt_sel:2;
+ enum mes_query_status_command_enum command:2;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t pasid:16;
+ uint32_t reserved1:16;
+ } bitfields3a;
+ struct {
+ uint32_t reserved2:2;
+ uint32_t doorbell_offset:26;
+ enum mes_query_status_engine_sel_enum engine_sel:3;
+ uint32_t reserved3:1;
+ } bitfields3b;
+ uint32_t ordinal3;
+ };
+
+ uint32_t addr_lo;
+ uint32_t addr_hi;
+ uint32_t data_lo;
+ uint32_t data_hi;
+};
+#endif
+
+/*--------------------MES_UNMAP_QUEUES--------------------*/
+
+#ifndef PM4_MES_UNMAP_QUEUES_DEFINED
+#define PM4_MES_UNMAP_QUEUES_DEFINED
+enum mes_unmap_queues_action_enum {
+ action__mes_unmap_queues__preempt_queues = 0,
+ action__mes_unmap_queues__reset_queues = 1,
+ action__mes_unmap_queues__disable_process_queues = 2,
+ action__mes_unmap_queues__reserved = 3
+};
+
+enum mes_unmap_queues_queue_sel_enum {
+ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0,
+ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1,
+ queue_sel__mes_unmap_queues__unmap_all_queues = 2,
+ queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3
+};
+
+enum mes_unmap_queues_engine_sel_enum {
+ engine_sel__mes_unmap_queues__compute = 0,
+ engine_sel__mes_unmap_queues__sdma0 = 2,
+ engine_sel__mes_unmap_queues__sdmal = 3
+};
+
+struct pm4_mes_unmap_queues {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /* header */
+ uint32_t ordinal1;
+ };
+
+ union {
+ struct {
+ enum mes_unmap_queues_action_enum action:2;
+ uint32_t reserved1:2;
+ enum mes_unmap_queues_queue_sel_enum queue_sel:2;
+ uint32_t reserved2:20;
+ enum mes_unmap_queues_engine_sel_enum engine_sel:3;
+ uint32_t num_queues:3;
+ } bitfields2;
+ uint32_t ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t pasid:16;
+ uint32_t reserved3:16;
+ } bitfields3a;
+ struct {
+ uint32_t reserved4:2;
+ uint32_t doorbell_offset0:26;
+ int32_t reserved5:4;
+ } bitfields3b;
+ uint32_t ordinal3;
+ };
+
+ union {
+ struct {
+ uint32_t reserved6:2;
+ uint32_t doorbell_offset1:26;
+ uint32_t reserved7:4;
+ } bitfields4;
+ uint32_t ordinal4;
+ };
+
+ union {
+ struct {
+ uint32_t reserved8:2;
+ uint32_t doorbell_offset2:26;
+ uint32_t reserved9:4;
+ } bitfields5;
+ uint32_t ordinal5;
+ };
+
+ union {
+ struct {
+ uint32_t reserved10:2;
+ uint32_t doorbell_offset3:26;
+ uint32_t reserved11:4;
+ } bitfields6;
+ uint32_t ordinal6;
+ };
+};
+#endif
+
+#ifndef PM4_MEC_RELEASE_MEM_DEFINED
+#define PM4_MEC_RELEASE_MEM_DEFINED
+
+enum mec_release_mem_event_index_enum {
+ event_index__mec_release_mem__end_of_pipe = 5,
+ event_index__mec_release_mem__shader_done = 6
+};
+
+enum mec_release_mem_cache_policy_enum {
+ cache_policy__mec_release_mem__lru = 0,
+ cache_policy__mec_release_mem__stream = 1
+};
+
+enum mec_release_mem_pq_exe_status_enum {
+ pq_exe_status__mec_release_mem__default = 0,
+ pq_exe_status__mec_release_mem__phase_update = 1
+};
+
+enum mec_release_mem_dst_sel_enum {
+ dst_sel__mec_release_mem__memory_controller = 0,
+ dst_sel__mec_release_mem__tc_l2 = 1,
+ dst_sel__mec_release_mem__queue_write_pointer_register = 2,
+ dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
+};
+
+enum mec_release_mem_int_sel_enum {
+ int_sel__mec_release_mem__none = 0,
+ int_sel__mec_release_mem__send_interrupt_only = 1,
+ int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2,
+ int_sel__mec_release_mem__send_data_after_write_confirm = 3,
+ int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4,
+ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5,
+ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
+};
+
+enum mec_release_mem_data_sel_enum {
+ data_sel__mec_release_mem__none = 0,
+ data_sel__mec_release_mem__send_32_bit_low = 1,
+ data_sel__mec_release_mem__send_64_bit_data = 2,
+ data_sel__mec_release_mem__send_gpu_clock_counter = 3,
+ data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4,
+ data_sel__mec_release_mem__store_gds_data_to_memory = 5
+};
+
+struct pm4_mec_release_mem {
+ union {
+ union PM4_MES_TYPE_3_HEADER header; /*header */
+ unsigned int ordinal1;
+ };
+
+ union {
+ struct {
+ unsigned int event_type:6;
+ unsigned int reserved1:2;
+ enum mec_release_mem_event_index_enum event_index:4;
+ unsigned int tcl1_vol_action_ena:1;
+ unsigned int tc_vol_action_ena:1;
+ unsigned int reserved2:1;
+ unsigned int tc_wb_action_ena:1;
+ unsigned int tcl1_action_ena:1;
+ unsigned int tc_action_ena:1;
+ uint32_t reserved3:1;
+ uint32_t tc_nc_action_ena:1;
+ uint32_t tc_wc_action_ena:1;
+ uint32_t tc_md_action_ena:1;
+ uint32_t reserved4:3;
+ enum mec_release_mem_cache_policy_enum cache_policy:2;
+ uint32_t reserved5:2;
+ enum mec_release_mem_pq_exe_status_enum pq_exe_status:1;
+ uint32_t reserved6:2;
+ } bitfields2;
+ unsigned int ordinal2;
+ };
+
+ union {
+ struct {
+ uint32_t reserved7:16;
+ enum mec_release_mem_dst_sel_enum dst_sel:2;
+ uint32_t reserved8:6;
+ enum mec_release_mem_int_sel_enum int_sel:3;
+ uint32_t reserved9:2;
+ enum mec_release_mem_data_sel_enum data_sel:3;
+ } bitfields3;
+ unsigned int ordinal3;
+ };
+
+ union {
+ struct {
+ uint32_t reserved10:2;
+ unsigned int address_lo_32b:30;
+ } bitfields4;
+ struct {
+ uint32_t reserved11:3;
+ uint32_t address_lo_64b:29;
+ } bitfields4b;
+ uint32_t reserved12;
+ unsigned int ordinal4;
+ };
+
+ union {
+ uint32_t address_hi;
+ uint32_t reserved13;
+ uint32_t ordinal5;
+ };
+
+ union {
+ uint32_t data_lo;
+ uint32_t cmp_data_lo;
+ struct {
+ uint32_t dw_offset:16;
+ uint32_t num_dwords:16;
+ } bitfields6c;
+ uint32_t reserved14;
+ uint32_t ordinal6;
+ };
+
+ union {
+ uint32_t data_hi;
+ uint32_t cmp_data_hi;
+ uint32_t reserved15;
+ uint32_t reserved16;
+ uint32_t ordinal7;
+ };
+
+ uint32_t int_ctxid;
+
+};
+
+#endif
+
+enum {
+ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
+};
+#endif
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 96a9cc0f02c9..5e3990bb4c4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -39,11 +39,37 @@
#include "amd_shared.h"
+#define KFD_MAX_RING_ENTRY_SIZE 8
+
#define KFD_SYSFS_FILE_MODE 0444
-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000ull
-#define KFD_MMAP_EVENTS_MASK 0x4000000000000ull
-#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000ull
+/* GPU ID hash width in bits */
+#define KFD_GPU_ID_HASH_WIDTH 16
+
+/* Use upper bits of mmap offset to store KFD driver specific information.
+ * BITS[63:62] - Encode MMAP type
+ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to
+ * BITS[45:0] - MMAP offset value
+ *
+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+ * defines are w.r.t to PAGE_SIZE
+ */
+#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)
+#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT)
+
+#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
+#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
+ << KFD_MMAP_GPU_ID_SHIFT)
+#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\
+ & KFD_MMAP_GPU_ID_MASK)
+#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \
+ >> KFD_MMAP_GPU_ID_SHIFT)
+
+#define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT)
+#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK)
/*
* When working with cp scheduler we should assign the HIQ manually or via
@@ -55,9 +81,6 @@
#define KFD_CIK_HIQ_PIPE 4
#define KFD_CIK_HIQ_QUEUE 0
-/* GPU ID hash width in bits */
-#define KFD_GPU_ID_HASH_WIDTH 16
-
/* Macro for allocating structures */
#define kfd_alloc_struct(ptr_to_struct) \
((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
@@ -116,6 +139,11 @@ extern int debug_largebar;
*/
extern int ignore_crat;
+/*
+ * Set sh_mem_config.retry_disable on Vega10
+ */
+extern int vega10_noretry;
+
/**
* enum kfd_sched_policy
*
@@ -148,6 +176,8 @@ enum cache_policy {
cache_policy_noncoherent
};
+#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10)
+
struct kfd_event_interrupt_class {
bool (*interrupt_isr)(struct kfd_dev *dev,
const uint32_t *ih_ring_entry);
@@ -160,6 +190,7 @@ struct kfd_device_info {
const struct kfd_event_interrupt_class *event_interrupt_class;
unsigned int max_pasid_bits;
unsigned int max_no_of_hqd;
+ unsigned int doorbell_size;
size_t ih_ring_entry_size;
uint8_t num_of_watch_points;
uint16_t mqd_size_aligned;
@@ -173,6 +204,7 @@ struct kfd_mem_obj {
uint32_t range_end;
uint64_t gpu_addr;
uint32_t *cpu_ptr;
+ void *gtt_mem;
};
struct kfd_vmid_info {
@@ -364,7 +396,7 @@ struct queue_properties {
uint32_t queue_percent;
uint32_t *read_ptr;
uint32_t *write_ptr;
- uint32_t __iomem *doorbell_ptr;
+ void __iomem *doorbell_ptr;
uint32_t doorbell_off;
bool is_interop;
bool is_evicted;
@@ -427,6 +459,7 @@ struct queue {
uint32_t queue;
unsigned int sdma_id;
+ unsigned int doorbell_id;
struct kfd_process *process;
struct kfd_dev *device;
@@ -501,6 +534,9 @@ struct qcm_process_device {
/* IB memory */
uint64_t ib_base;
void *ib_kaddr;
+
+ /* doorbell resources per process per device */
+ unsigned long *doorbell_bitmap;
};
/* KFD Memory Eviction */
@@ -512,6 +548,8 @@ struct qcm_process_device {
/* Approx. time before evicting the process again */
#define PROCESS_ACTIVE_TIME_MS 10
+int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence);
@@ -681,6 +719,8 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
void kfd_unref_process(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(void);
int kfd_resume_all_processes(void);
@@ -693,7 +733,7 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
struct kfd_process *p);
-int kfd_reserved_mem_mmap(struct kfd_process *process,
+int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
struct vm_area_struct *vma);
/* KFD process API for creating and translating handles */
@@ -721,17 +761,20 @@ unsigned int kfd_pasid_alloc(void);
void kfd_pasid_free(unsigned int pasid);
/* Doorbells */
+size_t kfd_doorbell_process_slice(struct kfd_dev *kfd);
int kfd_doorbell_init(struct kfd_dev *kfd);
void kfd_doorbell_fini(struct kfd_dev *kfd);
-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma);
-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
+ struct vm_area_struct *vma);
+void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
unsigned int *doorbell_off);
void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr);
u32 read_kernel_doorbell(u32 __iomem *db);
-void write_kernel_doorbell(u32 __iomem *db, u32 value);
-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
+void write_kernel_doorbell(void __iomem *db, u32 value);
+void write_kernel_doorbell64(void __iomem *db, u64 value);
+unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
struct kfd_process *process,
- unsigned int queue_id);
+ unsigned int doorbell_id);
phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
struct kfd_process *process);
int kfd_alloc_process_doorbells(struct kfd_process *process);
@@ -788,6 +831,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
+struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -832,8 +877,42 @@ struct packet_manager {
bool allocated;
struct kfd_mem_obj *ib_buffer_obj;
unsigned int ib_size_bytes;
+
+ const struct packet_manager_funcs *pmf;
+};
+
+struct packet_manager_funcs {
+ /* Support ASIC-specific packet formats for PM4 packets */
+ int (*map_process)(struct packet_manager *pm, uint32_t *buffer,
+ struct qcm_process_device *qpd);
+ int (*runlist)(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t ib, size_t ib_size_in_dwords, bool chain);
+ int (*set_resources)(struct packet_manager *pm, uint32_t *buffer,
+ struct scheduling_resources *res);
+ int (*map_queues)(struct packet_manager *pm, uint32_t *buffer,
+ struct queue *q, bool is_static);
+ int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_queue_type type,
+ enum kfd_unmap_queues_filter mode,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine);
+ int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value);
+ int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
+
+ /* Packet sizes */
+ int map_process_size;
+ int runlist_size;
+ int set_resources_size;
+ int map_queues_size;
+ int unmap_queues_size;
+ int query_status_size;
+ int release_mem_size;
};
+extern const struct packet_manager_funcs kfd_vi_pm_funcs;
+extern const struct packet_manager_funcs kfd_v9_pm_funcs;
+
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
void pm_uninit(struct packet_manager *pm);
int pm_send_set_resources(struct packet_manager *pm,
@@ -849,12 +928,17 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
void pm_release_ib(struct packet_manager *pm);
-uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer);
+/* Following PM funcs can be shared among VI and AI */
+unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
+int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct scheduling_resources *res);
uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
/* Events */
extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
+extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
+
extern const struct kfd_device_global_init_class device_global_init_class_cik;
void kfd_event_init_process(struct kfd_process *p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 1711ad0642f7..1d80b4f7c681 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -332,6 +332,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
get_order(KFD_CWSR_TBA_TMA_SIZE));
+ kfree(pdd->qpd.doorbell_bitmap);
idr_destroy(&pdd->alloc_idr);
kfree(pdd);
@@ -451,7 +452,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base)
continue;
- offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT;
+ offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id))
+ << PAGE_SHIFT;
qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
MAP_SHARED, offset);
@@ -585,6 +587,31 @@ err_alloc_process:
return ERR_PTR(err);
}
+static int init_doorbell_bitmap(struct qcm_process_device *qpd,
+ struct kfd_dev *dev)
+{
+ unsigned int i;
+
+ if (!KFD_IS_SOC15(dev->device_info->asic_family))
+ return 0;
+
+ qpd->doorbell_bitmap =
+ kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
+ BITS_PER_BYTE), GFP_KERNEL);
+ if (!qpd->doorbell_bitmap)
+ return -ENOMEM;
+
+ /* Mask out any reserved doorbells */
+ for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++)
+ if ((dev->shared_resources.reserved_doorbell_mask & i) ==
+ dev->shared_resources.reserved_doorbell_val) {
+ set_bit(i, qpd->doorbell_bitmap);
+ pr_debug("reserved doorbell 0x%03x\n", i);
+ }
+
+ return 0;
+}
+
struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process *p)
{
@@ -606,6 +633,12 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
if (!pdd)
return NULL;
+ if (init_doorbell_bitmap(&pdd->qpd, dev)) {
+ pr_err("Failed to init doorbell for process\n");
+ kfree(pdd);
+ return NULL;
+ }
+
pdd->dev = dev;
INIT_LIST_HEAD(&pdd->qpd.queues_list);
INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
@@ -808,7 +841,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
* Eviction is reference-counted per process-device. This means multiple
* evictions from different sources can be nested safely.
*/
-static int process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p)
{
struct kfd_process_device *pdd;
int r = 0;
@@ -844,7 +877,7 @@ fail:
}
/* process_restore_queues - Restore all user queues of a process */
-static int process_restore_queues(struct kfd_process *p)
+int kfd_process_restore_queues(struct kfd_process *p)
{
struct kfd_process_device *pdd;
int r, ret = 0;
@@ -886,7 +919,7 @@ static void evict_process_worker(struct work_struct *work)
flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid %d\n", p->pasid);
- ret = process_evict_queues(p);
+ ret = kfd_process_evict_queues(p);
if (!ret) {
dma_fence_signal(p->ef);
dma_fence_put(p->ef);
@@ -946,7 +979,7 @@ static void restore_process_worker(struct work_struct *work)
return;
}
- ret = process_restore_queues(p);
+ ret = kfd_process_restore_queues(p);
if (!ret)
pr_debug("Finished restoring pasid %d\n", p->pasid);
else
@@ -963,7 +996,7 @@ void kfd_suspend_all_processes(void)
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
- if (process_evict_queues(p))
+ if (kfd_process_evict_queues(p))
pr_err("Failed to suspend process %d\n", p->pasid);
dma_fence_signal(p->ef);
dma_fence_put(p->ef);
@@ -989,15 +1022,12 @@ int kfd_resume_all_processes(void)
return ret;
}
-int kfd_reserved_mem_mmap(struct kfd_process *process,
+int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
struct vm_area_struct *vma)
{
- struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
struct kfd_process_device *pdd;
struct qcm_process_device *qpd;
- if (!dev)
- return -EINVAL;
if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
pr_err("Incorrect CWSR mapping size.\n");
return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 7817e327ea6d..d65ce0436b31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -119,9 +119,6 @@ static int create_cp_queue(struct process_queue_manager *pqm,
/* Doorbell initialized in user space*/
q_properties->doorbell_ptr = NULL;
- q_properties->doorbell_off =
- kfd_queue_id_to_doorbell(dev, pqm->process, qid);
-
/* let DQM handle it*/
q_properties->vmid = 0;
q_properties->queue_id = qid;
@@ -244,10 +241,20 @@ int pqm_create_queue(struct process_queue_manager *pqm,
}
if (retval != 0) {
- pr_err("DQM create queue failed\n");
+ pr_err("Pasid %d DQM create queue %d failed. ret %d\n",
+ pqm->process->pasid, type, retval);
goto err_create_queue;
}
+ if (q)
+ /* Return the doorbell offset within the doorbell page
+ * to the caller so it can be passed up to user mode
+ * (in bytes).
+ */
+ properties->doorbell_off =
+ (q->properties.doorbell_off * sizeof(uint32_t)) &
+ (kfd_doorbell_process_slice(dev) - 1);
+
pr_debug("PQM After DQM create queue\n");
list_add(&pqn->process_queue_list, &pqm->queues);
@@ -313,8 +320,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
dqm = pqn->q->device->dqm;
retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
if (retval) {
- pr_debug("Destroy queue failed, returned %d\n", retval);
- goto err_destroy_queue;
+ pr_err("Pasid %d destroy queue %d failed, ret %d\n",
+ pqm->process->pasid,
+ pqn->q->properties.queue_id, retval);
+ if (retval != -ETIME)
+ goto err_destroy_queue;
}
uninit_queue(pqn->q);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index a5315d4f1c95..6dcd621e5b71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -36,8 +36,8 @@ void print_queue_properties(struct queue_properties *q)
pr_debug("Queue Address: 0x%llX\n", q->queue_address);
pr_debug("Queue Id: %u\n", q->queue_id);
pr_debug("Queue Process Vmid: %u\n", q->vmid);
- pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr);
- pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr);
+ pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
+ pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
}
@@ -53,8 +53,8 @@ void print_queue(struct queue *q)
pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
pr_debug("Queue Id: %u\n", q->properties.queue_id);
pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
- pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr);
- pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr);
+ pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
+ pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index ac28abc94e57..bc95d4dfee2e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1239,6 +1239,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
break;
+ case CHIP_VEGA10:
+ case CHIP_RAVEN:
+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+ break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->gpu->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index eb54cfcaf039..7d9c3f948dff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -45,6 +45,7 @@
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
+#define HSA_CAP_DOORBELL_TYPE_2_0 0x2
#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
struct kfd_node_properties {
diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
new file mode 100644
index 000000000000..0bc0b25cb410
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef HSA_SOC15_INT_H_INCLUDED
+#define HSA_SOC15_INT_H_INCLUDED
+
+#include "soc15_ih_clientid.h"
+
+#define SOC15_INTSRC_CP_END_OF_PIPE 181
+#define SOC15_INTSRC_CP_BAD_OPCODE 183
+#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239
+#define SOC15_INTSRC_VMC_FAULT 0
+#define SOC15_INTSRC_SDMA_TRAP 224
+
+
+#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
+#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
+#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
+#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
+#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
+#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
+#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
+#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
+#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
+#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
+
+#endif
+
OpenPOWER on IntegriCloud