From d1853f42b63da94fa0147091d22bf5675b0ff89b Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 23 Mar 2018 15:32:31 -0400 Subject: drm/amdkfd: GFP_NOIO while holding locks taken in MMU notifier When an MMU notifier runs in memory reclaim context, it can deadlock trying to take locks that are already held in the thread causing the memory reclaim. The solution is to avoid memory reclaim while holding locks that are taken in MMU notifiers by using GFP_NOIO. This commit fixes memory allocations done while holding the dqm->lock which is needed in the MMU notifier (dqm->ops.evict_process_queues). Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 3346699960dd..0434f659eeaf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -652,7 +652,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) return -ENOMEM; - *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); if ((*mem_obj) == NULL) return -ENOMEM; -- cgit v1.2.3 From 6b95e7973a136181e37446bd29b0b2e2f0d2d653 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 23 Mar 2018 15:32:32 -0400 Subject: drm/amdkfd: Add quiesce_mm and resume_mm to kgd2kfd_calls These interfaces allow KGD to stop and resume all GPU user mode queue access to a process address space. This is needed for handling MMU notifiers of userptrs mapped for GPU access in KFD VMs. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 38 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 +++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 +++---- drivers/gpu/drm/amd/include/kgd_kfd_interface.h | 6 ++++ 5 files changed, 55 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0434f659eeaf..7b5799530c0f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -541,6 +541,44 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) spin_unlock(&kfd->interrupt_lock); } +int kgd2kfd_quiesce_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_evict_queues(p); + + kfd_unref_process(p); + return r; +} + +int kgd2kfd_resume_mm(struct mm_struct *mm) +{ + struct kfd_process *p; + int r; + + /* Because we are called from arbitrary context (workqueue) as opposed + * to process context, kfd_process could attempt to exit while we are + * running so the lookup function increments the process ref count. + */ + p = kfd_lookup_process_by_mm(mm); + if (!p) + return -ESRCH; + + r = kfd_process_restore_queues(p); + + kfd_unref_process(p); + return r; +} + /** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will * prepare for safe eviction of KFD BOs that belong to the specified * process. diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index e0c07d24d251..45bc458f7348 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -43,6 +43,8 @@ static const struct kgd2kfd_calls kgd2kfd = { .interrupt = kgd2kfd_interrupt, .suspend = kgd2kfd_suspend, .resume = kgd2kfd_resume, + .quiesce_mm = kgd2kfd_quiesce_mm, + .resume_mm = kgd2kfd_resume_mm, .schedule_evict_and_restore_process = kgd2kfd_schedule_evict_and_restore_process, }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 96a9cc0f02c9..4d5c49ef2dc5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -512,6 +512,8 @@ struct qcm_process_device { /* Approx. time before evicting the process again */ #define PROCESS_ACTIVE_TIME_MS 10 +int kgd2kfd_quiesce_mm(struct mm_struct *mm); +int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); @@ -681,6 +683,8 @@ struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); void kfd_unref_process(struct kfd_process *p); +int kfd_process_evict_queues(struct kfd_process *p); +int kfd_process_restore_queues(struct kfd_process *p); void kfd_suspend_all_processes(void); int kfd_resume_all_processes(void); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 1711ad0642f7..2791e72c2058 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -808,7 +808,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) * Eviction is reference-counted per process-device. This means multiple * evictions from different sources can be nested safely. */ -static int process_evict_queues(struct kfd_process *p) +int kfd_process_evict_queues(struct kfd_process *p) { struct kfd_process_device *pdd; int r = 0; @@ -844,7 +844,7 @@ fail: } /* process_restore_queues - Restore all user queues of a process */ -static int process_restore_queues(struct kfd_process *p) +int kfd_process_restore_queues(struct kfd_process *p) { struct kfd_process_device *pdd; int r, ret = 0; @@ -886,7 +886,7 @@ static void evict_process_worker(struct work_struct *work) flush_delayed_work(&p->restore_work); pr_debug("Started evicting pasid %d\n", p->pasid); - ret = process_evict_queues(p); + ret = kfd_process_evict_queues(p); if (!ret) { dma_fence_signal(p->ef); dma_fence_put(p->ef); @@ -946,7 +946,7 @@ static void restore_process_worker(struct work_struct *work) return; } - ret = process_restore_queues(p); + ret = kfd_process_restore_queues(p); if (!ret) pr_debug("Finished restoring pasid %d\n", p->pasid); else @@ -963,7 +963,7 @@ void kfd_suspend_all_processes(void) cancel_delayed_work_sync(&p->eviction_work); cancel_delayed_work_sync(&p->restore_work); - if (process_evict_queues(p)) + if (kfd_process_evict_queues(p)) pr_err("Failed to suspend process %d\n", p->pasid); dma_fence_signal(p->ef); dma_fence_put(p->ef); diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index 237289a72bb7..286cfe7068c1 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -382,6 +382,10 @@ struct kfd2kgd_calls { * * @resume: Notifies amdkfd about a resume action done to a kgd device * + * @quiesce_mm: Quiesce all user queue access to specified MM address space + * + * @resume_mm: Resume user queue access to specified MM address space + * * @schedule_evict_and_restore_process: Schedules work queue that will prepare * for safe eviction of KFD BOs that belong to the specified process. * @@ -399,6 +403,8 @@ struct kgd2kfd_calls { void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); void (*suspend)(struct kfd_dev *kfd); int (*resume)(struct kfd_dev *kfd); + int (*quiesce_mm)(struct mm_struct *mm); + int (*resume_mm)(struct mm_struct *mm); int (*schedule_evict_and_restore_process)(struct mm_struct *mm, struct dma_fence *fence); }; -- cgit v1.2.3 From ada2b29c4a79efbdc5bf5eed876bad6b00f43536 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 10 Apr 2018 17:33:03 -0400 Subject: drm/amdkfd: Make doorbell size ASIC-dependent This prepares for GFXv9 (Vega10), which has 64-bit doorbells. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 10 +++++++ drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 48 ++++++++++++++++--------------- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 +++-- 3 files changed, 39 insertions(+), 26 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 7b5799530c0f..f563acbc1ad7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -41,6 +41,7 @@ static const struct kfd_device_info kaveri_device_info = { .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -55,6 +56,7 @@ static const struct kfd_device_info carrizo_device_info = { .max_pasid_bits = 16, /* max num of queues for CZ.TODO should be a dynamic value */ .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -70,6 +72,7 @@ static const struct kfd_device_info hawaii_device_info = { .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -83,6 +86,7 @@ static const struct kfd_device_info tonga_device_info = { .asic_family = CHIP_TONGA, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -96,6 +100,7 @@ static const struct kfd_device_info tonga_vf_device_info = { .asic_family = CHIP_TONGA, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -109,6 +114,7 @@ static const struct kfd_device_info fiji_device_info = { .asic_family = CHIP_FIJI, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -122,6 +128,7 @@ static const struct kfd_device_info fiji_vf_device_info = { .asic_family = CHIP_FIJI, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -136,6 +143,7 @@ static const struct kfd_device_info polaris10_device_info = { .asic_family = CHIP_POLARIS10, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -149,6 +157,7 @@ static const struct kfd_device_info polaris10_vf_device_info = { .asic_family = CHIP_POLARIS10, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, @@ -162,6 +171,7 @@ static const struct kfd_device_info polaris11_device_info = { .asic_family = CHIP_POLARIS11, .max_pasid_bits = 16, .max_no_of_hqd = 24, + .doorbell_size = 4, .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index ebb4da14e3df..484031423d1f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -33,7 +33,6 @@ static DEFINE_IDA(doorbell_ida); static unsigned int max_doorbell_slices; -#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 /* * Each device exposes a doorbell aperture, a PCI MMIO aperture that @@ -50,9 +49,9 @@ static unsigned int max_doorbell_slices; */ /* # of doorbell bytes allocated for each process. */ -static inline size_t doorbell_process_allocation(void) +static size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) { - return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * + return roundup(kfd->device_info->doorbell_size * KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, PAGE_SIZE); } @@ -72,16 +71,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) doorbell_start_offset = roundup(kfd->shared_resources.doorbell_start_offset, - doorbell_process_allocation()); + kfd_doorbell_process_slice(kfd)); doorbell_aperture_size = rounddown(kfd->shared_resources.doorbell_aperture_size, - doorbell_process_allocation()); + kfd_doorbell_process_slice(kfd)); if (doorbell_aperture_size > doorbell_start_offset) doorbell_process_limit = (doorbell_aperture_size - doorbell_start_offset) / - doorbell_process_allocation(); + kfd_doorbell_process_slice(kfd); else return -ENOSPC; @@ -95,7 +94,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, - doorbell_process_allocation()); + kfd_doorbell_process_slice(kfd)); if (!kfd->doorbell_kernel_ptr) return -ENOMEM; @@ -132,16 +131,16 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) phys_addr_t address; struct kfd_dev *dev; + /* Find kfd device according to gpu id */ + dev = kfd_device_by_id(vma->vm_pgoff); + if (!dev) + return -EINVAL; + /* * For simplicitly we only allow mapping of the entire doorbell * allocation of a single device & process. */ - if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) - return -EINVAL; - - /* Find kfd device according to gpu id */ - dev = kfd_device_by_id(vma->vm_pgoff); - if (!dev) + if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) return -EINVAL; /* Calculate physical address of doorbell */ @@ -158,19 +157,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) " vm_flags == 0x%04lX\n" " size == 0x%04lX\n", (unsigned long long) vma->vm_start, address, vma->vm_flags, - doorbell_process_allocation()); + kfd_doorbell_process_slice(dev)); return io_remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, - doorbell_process_allocation(), + kfd_doorbell_process_slice(dev), vma->vm_page_prot); } /* get kernel iomem pointer for a doorbell */ -u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off) { u32 inx; @@ -185,6 +184,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) return NULL; + inx *= kfd->device_info->doorbell_size / sizeof(u32); + /* * Calculating the kernel doorbell offset using the first * doorbell page. @@ -210,7 +211,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) mutex_unlock(&kfd->doorbell_mutex); } -inline void write_kernel_doorbell(u32 __iomem *db, u32 value) +void write_kernel_doorbell(void __iomem *db, u32 value) { if (db) { writel(value, db); @@ -228,20 +229,21 @@ unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, { /* * doorbell_id_offset accounts for doorbells taken by KGD. - * index * doorbell_process_allocation/sizeof(u32) adjusts to - * the process's doorbells. + * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to + * the process's doorbells. The offset returned is in dword + * units regardless of the ASIC-dependent doorbell size. */ return kfd->doorbell_id_offset + process->doorbell_index - * doorbell_process_allocation() / sizeof(u32) + - queue_id; + * kfd_doorbell_process_slice(kfd) / sizeof(u32) + + queue_id * kfd->device_info->doorbell_size / sizeof(u32); } uint64_t kfd_get_number_elems(struct kfd_dev *kfd) { uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - kfd->shared_resources.doorbell_start_offset) / - doorbell_process_allocation() + 1; + kfd_doorbell_process_slice(kfd) + 1; return num_of_elems; @@ -251,7 +253,7 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, struct kfd_process *process) { return dev->doorbell_base + - process->doorbell_index * doorbell_process_allocation(); + process->doorbell_index * kfd_doorbell_process_slice(dev); } int kfd_alloc_process_doorbells(struct kfd_process *process) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 4d5c49ef2dc5..d9c0fe126429 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -160,6 +160,7 @@ struct kfd_device_info { const struct kfd_event_interrupt_class *event_interrupt_class; unsigned int max_pasid_bits; unsigned int max_no_of_hqd; + unsigned int doorbell_size; size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; @@ -364,7 +365,7 @@ struct queue_properties { uint32_t queue_percent; uint32_t *read_ptr; uint32_t *write_ptr; - uint32_t __iomem *doorbell_ptr; + void __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; bool is_evicted; @@ -728,11 +729,11 @@ void kfd_pasid_free(unsigned int pasid); int kfd_doorbell_init(struct kfd_dev *kfd); void kfd_doorbell_fini(struct kfd_dev *kfd); int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); -u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, +void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off); void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); u32 read_kernel_doorbell(u32 __iomem *db); -void write_kernel_doorbell(u32 __iomem *db, u32 value); +void write_kernel_doorbell(void __iomem *db, u32 value); unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, unsigned int queue_id); -- cgit v1.2.3 From b91d43dd01aadd43b1002160b78d77f8175876a4 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 10 Apr 2018 17:33:08 -0400 Subject: drm/amdkfd: Add GFXv9 MQD manager Signed-off-by: John Bridgman Signed-off-by: Jay Cornwall Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/Makefile | 1 + drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 3 + drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 443 ++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + 5 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 52b3c1b419f1..094b591ed8c2 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -30,6 +30,7 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ kfd_process.o kfd_queue.o kfd_mqd_manager.o \ kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ + kfd_mqd_manager_v9.o \ kfd_kernel_queue.o kfd_kernel_queue_cik.o \ kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ kfd_packet_manager.o kfd_process_queue_manager.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index f563acbc1ad7..c368ce3e96ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -700,7 +700,7 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) return -ENOMEM; - *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); if ((*mem_obj) == NULL) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index ee7061e1c466..4b8eb506642b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -38,6 +38,9 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, case CHIP_POLARIS10: case CHIP_POLARIS11: return mqd_manager_init_vi_tonga(type, dev); + case CHIP_VEGA10: + case CHIP_RAVEN: + return mqd_manager_init_v9(type, dev); default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c new file mode 100644 index 000000000000..684054ff02cd --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -0,0 +1,443 @@ +/* + * Copyright 2016-2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v9_structs.h" +#include "gc/gc_9_0_offset.h" +#include "gc/gc_9_0_sh_mask.h" +#include "sdma0/sdma0_4_0_sh_mask.h" + +static inline struct v9_mqd *get_mqd(void *mqd) +{ + return (struct v9_mqd *)mqd; +} + +static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v9_sdma_mqd *)mqd; +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + uint64_t addr; + struct v9_mqd *m; + struct kfd_dev *kfd = mm->dev; + + /* From V9, for CWSR, the control stack is located on the next page + * boundary after the mqd, we will use the gtt allocation function + * instead of sub-allocation function. + */ + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { + *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + if (!*mqd_mem_obj) + return -ENOMEM; + retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, + ALIGN(q->ctl_stack_size, PAGE_SIZE) + + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), + &((*mqd_mem_obj)->gtt_mem), + &((*mqd_mem_obj)->gpu_addr), + (void *)&((*mqd_mem_obj)->cpu_ptr)); + } else + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), + mqd_mem_obj); + if (retval != 0) + return -ENOMEM; + + m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, sizeof(struct v9_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (q->tba_addr) { + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + order_base_2(q->eop_ring_buffer_size / 4) - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | + 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; + m->cp_hqd_pq_doorbell_control |= 1 << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = 0; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + struct kfd_dev *kfd = mm->dev; + + if (mqd_mem_obj->gtt_mem) { + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); + } else { + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + if (retval != 0) + return retval; + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_mqd *m; + int retval = update_mqd(mm, mqd, q); + + if (retval != 0) + return retval; + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct v9_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct v9_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct v9_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v9_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v9_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b68299a3e18a..fac28827b000 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -197,6 +197,7 @@ struct kfd_mem_obj { uint32_t range_end; uint64_t gpu_addr; uint32_t *cpu_ptr; + void *gtt_mem; }; struct kfd_vmid_info { @@ -822,6 +823,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, -- cgit v1.2.3 From 3e76c2399b55483b1a28499b090f9d6600ab9eff Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 10 Apr 2018 17:33:16 -0400 Subject: drm/amdkfd: Add GFXv9 CWSR trap handler Signed-off-by: Shaoyun Liu Signed-off-by: Jay Cornwall Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1495 ++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 13 +- 2 files changed, 1505 insertions(+), 3 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm new file mode 100644 index 000000000000..033580c997ea --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -0,0 +1,1495 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if 0 +HW (GFX9) source code for CWSR trap handler +#Version 18 + multiple trap handler + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_HALT_MASK = 0x2000 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + +var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data +var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_tmp = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_status = ttmp12 +var s_save_mem_offset = ttmp14 +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_m0 = ttmp15 +var s_save_ttmps_lo = s_save_tmp //no conflict +var s_save_ttmps_hi = s_save_trapsts //no conflict + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp2 +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_ttmps_lo = s_restore_tmp //no conflict +var s_restore_ttmps_hi = s_restore_alloc_size //no conflict + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(GFX9) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + // Illegal instruction is a non-maskable exception which blocks context save. + // Halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_cbranch_scc1 L_HALT_WAVE + + // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. + // Instead, halt the wavefront and return from the trap. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_HALT_WAVE: + // If STATUS.HALT is set then this fault must come from SQC instruction fetch. + // We cannot prevent further faults so just terminate the wavefront. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_cbranch_scc0 L_NOT_ALREADY_HALTED + s_endpgm +L_NOT_ALREADY_HALTED: + s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. + // Rewind the PC to prevent this from occurring. The debugger compensates for this. + s_sub_u32 ttmp0, ttmp0, 0x8 + s_subb_u32 ttmp1, ttmp1, 0x0 + +L_FETCH_2ND_TRAP: + // Preserve and clear scalar XNACK state before issuing scalar reads. + // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. + s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK + s_or_b32 ttmp11, ttmp11, ttmp3 + + s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_waitcnt lgkmcnt(0) + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_IB_STS. + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status + + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +end + + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_save_ttmps_lo) + get_sgpr_size_bytes(s_save_ttmps_hi) + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 + s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF + s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 + ack_sqc_store_workaround() + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 + ack_sqc_store_workaround() + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 + ack_sqc_store_workaround() + s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 + ack_sqc_store_workaround() + + /* setup Resource Contants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + + + /* the first wave in the threadgroup */ + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + s_mov_b32 xnack_mask_lo, 0x0 + s_mov_b32 xnack_mask_hi, 0x0 + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? + +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) +// v_add_u32 v2, vcc[0:1], v2, v3 + v_add_u32 v2, v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? + s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? + s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + + // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo) + get_sgpr_size_bytes(s_restore_ttmps_hi) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 + s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 + s_waitcnt lgkmcnt(0) + + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + ack_sqc_store_workaround() + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + ack_sqc_store_workaround() + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + +function ack_sqc_store_workaround + if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) + end +end + + +#endif + +static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbf820001, 0xbf820158, + 0xb8f8f802, 0x89788678, + 0xb8f1f803, 0x866eff71, + 0x00000400, 0xbf850034, + 0x866eff71, 0x00000800, + 0xbf850003, 0x866eff71, + 0x00000100, 0xbf840008, + 0x866eff78, 0x00002000, + 0xbf840001, 0xbf810000, + 0x8778ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xb8eef807, 0x866fff6e, + 0x001f8000, 0x8e6f8b6f, + 0x8977ff77, 0xfc000000, + 0x87776f77, 0x896eff6e, + 0x001f8000, 0xb96ef807, + 0xb8f0f812, 0xb8f1f813, + 0x8ef08870, 0xc0071bb8, + 0x00000000, 0xbf8cc07f, + 0xc0071c38, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0xb8f1f803, 0x8671ff71, + 0x000001ff, 0xbf850002, + 0x806c846c, 0x826d806d, + 0x866dff6d, 0x0000ffff, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb978f802, 0xbe801f6c, + 0x866dff6d, 0x0000ffff, + 0xbef00080, 0xb9700283, + 0xb8f02407, 0x8e709c70, + 0x876d706d, 0xb8f003c7, + 0x8e709b70, 0x876d706d, + 0xb8f0f807, 0x8670ff70, + 0x00007fff, 0xb970f807, + 0xbeee007e, 0xbeef007f, + 0xbefe0180, 0xbf900004, + 0xbf8e0002, 0xbf88fffe, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8f11605, + 0x80718171, 0x8e718671, + 0x80707170, 0x80707e70, + 0x8271807f, 0x8671ff71, + 0x0000ffff, 0xc0471cb8, + 0x00000040, 0xbf8cc07f, + 0xc04b1d38, 0x00000048, + 0xbf8cc07f, 0xc0431e78, + 0x00000058, 0xbf8cc07f, + 0xc0471eb8, 0x0000005c, + 0xbf8cc07f, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x8670ff7f, + 0x08000000, 0x8f708370, + 0x87777077, 0x8670ff7f, + 0x70000000, 0x8f708170, + 0x87777077, 0xbefb007c, + 0xbefa0080, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f01605, 0x80708170, + 0x8e708670, 0x807a707a, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc007a, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611b7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611bfa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611e3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8f1f803, + 0xbefe007c, 0xbefc007a, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611a3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611a7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8fbf801, + 0xbefe007c, 0xbefc007a, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0x8670ff7f, + 0x04000000, 0xbeef0080, + 0x876f6f70, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f11605, 0x80718171, + 0x8e718471, 0x8e768271, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747a74, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a717c, 0xbf85ffe7, + 0xbef40172, 0xbefa0080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0xe0724000, 0x7a1d0000, + 0xe0724100, 0x7a1d0100, + 0xe0724200, 0x7a1d0200, + 0xe0724300, 0x7a1d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8f14306, 0x8671c171, + 0xbf84002c, 0xbf8a0000, + 0x8670ff6f, 0x04000000, + 0xbf840028, 0x8e718671, + 0x8e718271, 0xbef60071, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f01605, + 0x80708170, 0x8e708670, + 0x807a707a, 0x807aff7a, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xd28c0002, 0x000100c1, + 0xd28d0003, 0x000204c1, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x7a1d0002, + 0x68040702, 0xd0c9006a, + 0x0000e302, 0xbf87fff7, + 0xbef70000, 0xbefa00ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8f12a05, + 0x80718171, 0x8e718271, + 0x8e768871, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a717c, 0xbf840015, + 0xbf11017c, 0x8071ff71, + 0x00001000, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0xe0724000, + 0x7a1d0000, 0xe0724100, + 0x7a1d0100, 0xe0724200, + 0x7a1d0200, 0xe0724300, + 0x7a1d0300, 0x807c847c, + 0x807aff7a, 0x00000400, + 0xbf0a717c, 0xbf85ffef, + 0xbf9c0000, 0xbf8200d9, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x08000000, + 0x8f6e836e, 0x87776e77, + 0x866eff7f, 0x70000000, + 0x8f6e816e, 0x87776e77, + 0x866eff7f, 0x04000000, + 0xbf84001e, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf840019, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbef80080, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x8e76886f, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0x806fff6f, + 0x00008000, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 0xe0524300, 0x6e1d0300, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, + 0xbef600ff, 0x01000000, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0x866dff6d, 0x0000ffff, + 0xbefc006f, 0xbefe007a, + 0xbeff007b, 0x866f71ff, + 0x000003ff, 0xb96f4803, + 0x866f71ff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8ef1605, 0x806f816f, + 0x8e6f866f, 0x806e6f6e, + 0x806e746e, 0x826f8075, + 0x866fff6f, 0x0000ffff, + 0xc0071cb7, 0x00000040, + 0xc00b1d37, 0x00000048, + 0xc0031e77, 0x00000058, + 0xc0071eb7, 0x0000005c, + 0xbf8cc07f, 0x866fff6d, + 0xf0000000, 0x8f6f9c6f, + 0x8e6f906f, 0xbeee0080, + 0x876e6f6e, 0x866fff6d, + 0x08000000, 0x8f6f9b6f, + 0x8e6f8f6f, 0x876e6f6e, + 0x866fff70, 0x00800000, + 0x8f6f976f, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb970f802, 0xbf8a0000, + 0x95806f6c, 0xbf810000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c368ce3e96ff..053f1d0f80b8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -30,6 +30,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" #include "cwsr_trap_handler_gfx8.asm" +#include "cwsr_trap_handler_gfx9.asm" #include "kfd_iommu.h" #define MQD_SIZE_ALIGNED 768 @@ -333,10 +334,16 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, static void kfd_cwsr_init(struct kfd_dev *kfd) { if (cwsr_enable && kfd->device_info->supports_cwsr) { - BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + if (kfd->device_info->asic_family < CHIP_VEGA10) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx8_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx9_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } - kfd->cwsr_isa = cwsr_trap_gfx8_hex; - kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); kfd->cwsr_enabled = true; } } -- cgit v1.2.3 From 6106dce9559ec5a4b1a97302f2fcc508e40d2747 Mon Sep 17 00:00:00 2001 From: welu Date: Tue, 10 Apr 2018 17:33:17 -0400 Subject: drm/amdkfd: Try to enable atomics for all GPUs Report failure to enable atomics only on GPUs that require them. This allows GPUs that don't require atomics to function, but can benefit if they are available. This is the case for Vega10, which doesn't use atomics for basic functioning of the MEC, AQL and HWS microcode. So it can work without atomics. But shader programs can still use atomic instructions on systems that support PCIe atomics. Signed-off-by: welu Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 053f1d0f80b8..0e64fb2c95e5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -290,7 +290,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) { struct kfd_dev *kfd; - + int ret; const struct kfd_device_info *device_info = lookup_device_info(pdev->device); @@ -299,19 +299,18 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, return NULL; } - if (device_info->needs_pci_atomics) { - /* Allow BIF to recode atomics to PCIe 3.0 - * AtomicOps. 32 and 64-bit requests are possible and - * must be supported. - */ - if (pci_enable_atomic_ops_to_root(pdev, - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) { - dev_info(kfd_device, - "skipped device %x:%x, PCI rejects atomics", - pdev->vendor, pdev->device); - return NULL; - } + /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. + * 32 and 64-bit requests are possible and must be + * supported. + */ + ret = pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); + if (device_info->needs_pci_atomics && ret < 0) { + dev_info(kfd_device, + "skipped device %x:%x, PCI rejects atomics\n", + pdev->vendor, pdev->device); + return NULL; } kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); -- cgit v1.2.3 From 389056e5fef477c838dc20a08d6f1de960cf027b Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 10 Apr 2018 17:33:18 -0400 Subject: drm/amdkfd: Add Vega10 topology and device info * Report 64-bit doorbells as HSA_CAP_DOORBELL_TYPE_2_0 in topology * Report cache information in topology (duplicates GFXv8 info for now) * Add device info for Vega10 support in KFD Raven is not enabled at this time as it needs additional changes in DQM to work with a single SDMA engine. Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 11 +++++++++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 37 +++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 6 +++++ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 1 + 4 files changed, 55 insertions(+) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 4f126ef6139b..296b3f230280 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -132,6 +132,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { #define fiji_cache_info carrizo_cache_info #define polaris10_cache_info carrizo_cache_info #define polaris11_cache_info carrizo_cache_info +/* TODO - check & update Vega10 cache details */ +#define vega10_cache_info carrizo_cache_info +#define raven_cache_info carrizo_cache_info static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, struct crat_subtype_computeunit *cu) @@ -603,6 +606,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, pcache_info = polaris11_cache_info; num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); break; + case CHIP_VEGA10: + pcache_info = vega10_cache_info; + num_of_cache_types = ARRAY_SIZE(vega10_cache_info); + break; + case CHIP_RAVEN: + pcache_info = raven_cache_info; + num_of_cache_types = ARRAY_SIZE(raven_cache_info); + break; default: return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0e64fb2c95e5..dd6c7535b6b4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -182,6 +182,34 @@ static const struct kfd_device_info polaris11_device_info = { .needs_pci_atomics = true, }; +static const struct kfd_device_info vega10_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, +}; + +static const struct kfd_device_info vega10_vf_device_info = { + .asic_family = CHIP_VEGA10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, +}; + struct kfd_deviceid { unsigned short did; @@ -261,6 +289,15 @@ static const struct kfd_deviceid supported_devices[] = { { 0x67EB, &polaris11_device_info }, /* Polaris11 */ { 0x67EF, &polaris11_device_info }, /* Polaris11 */ { 0x67FF, &polaris11_device_info }, /* Polaris11 */ + { 0x6860, &vega10_device_info }, /* Vega10 */ + { 0x6861, &vega10_device_info }, /* Vega10 */ + { 0x6862, &vega10_device_info }, /* Vega10 */ + { 0x6863, &vega10_device_info }, /* Vega10 */ + { 0x6864, &vega10_device_info }, /* Vega10 */ + { 0x6867, &vega10_device_info }, /* Vega10 */ + { 0x6868, &vega10_device_info }, /* Vega10 */ + { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ + { 0x687F, &vega10_device_info }, /* Vega10 */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index ac28abc94e57..bc95d4dfee2e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1239,6 +1239,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu) HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); break; + case CHIP_VEGA10: + case CHIP_RAVEN: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; default: WARN(1, "Unexpected ASIC family %u", dev->gpu->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index eb54cfcaf039..7d9c3f948dff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -45,6 +45,7 @@ #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { -- cgit v1.2.3 From a2e94158b83185c9dac430cb53bff26737a786ef Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 1 May 2018 17:56:05 -0400 Subject: drm/amdkfd: Remove redundant include of amd-iommu.h Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index dd6c7535b6b4..c1d9e2772cbc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -20,9 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) -#include -#endif #include #include #include -- cgit v1.2.3 From 0db54b24ad676c3f2d0cf5291c9d170e3e15f213 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 1 May 2018 17:56:06 -0400 Subject: drm/amdkfd: Separate trap handler assembly code and its hex values Since the assembly code is inside "#if 0", it is ineffective. Despite that, during debugging, we need to change the assembly code, extract it into a separate file and compile the new file into hex values using sp3. That process also requires us to remove "#if 0" and modify lines starting with "#", so that sp3 can successfully compile the new file. With this change, all the above chore is no longer needed, and cwsr_trap_handler_gfx*.asm can be directly used by sp3 to generate its hex values. Signed-off-by: Yong Zhao Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 560 +++++++++++++++++++++ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 267 +--------- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 300 +---------- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +- 4 files changed, 575 insertions(+), 555 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device.c') diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h new file mode 100644 index 000000000000..a546a219d025 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -0,0 +1,560 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf820125, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850011, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840002, 0xb974f802, + 0xbe801d78, 0xb8f5f803, + 0x8675ff75, 0x000001ff, + 0xbf850002, 0x80708470, + 0x82718071, 0x8671ff71, + 0x0000ffff, 0xb974f802, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0x877a8474, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200ca, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x8671ff71, 0x0000ffff, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb974f802, 0xbf8a0000, + 0x95807370, 0xbf810000, +}; + + +static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbf820001, 0xbf82015a, + 0xb8f8f802, 0x89788678, + 0xb8f1f803, 0x866eff71, + 0x00000400, 0xbf850034, + 0x866eff71, 0x00000800, + 0xbf850003, 0x866eff71, + 0x00000100, 0xbf840008, + 0x866eff78, 0x00002000, + 0xbf840001, 0xbf810000, + 0x8778ff78, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xb8eef807, 0x866fff6e, + 0x001f8000, 0x8e6f8b6f, + 0x8977ff77, 0xfc000000, + 0x87776f77, 0x896eff6e, + 0x001f8000, 0xb96ef807, + 0xb8f0f812, 0xb8f1f813, + 0x8ef08870, 0xc0071bb8, + 0x00000000, 0xbf8cc07f, + 0xc0071c38, 0x00000008, + 0xbf8cc07f, 0x86ee6e6e, + 0xbf840001, 0xbe801d6e, + 0xb8f1f803, 0x8671ff71, + 0x000001ff, 0xbf850002, + 0x806c846c, 0x826d806d, + 0x866dff6d, 0x0000ffff, + 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb978f802, 0xbe801f6c, + 0x866dff6d, 0x0000ffff, + 0xbef00080, 0xb9700283, + 0xb8f02407, 0x8e709c70, + 0x876d706d, 0xb8f003c7, + 0x8e709b70, 0x876d706d, + 0xb8f0f807, 0x8670ff70, + 0x00007fff, 0xb970f807, + 0xbeee007e, 0xbeef007f, + 0xbefe0180, 0xbf900004, + 0x87708478, 0xb970f802, + 0xbf8e0002, 0xbf88fffe, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8f11605, + 0x80718171, 0x8e718671, + 0x80707170, 0x80707e70, + 0x8271807f, 0x8671ff71, + 0x0000ffff, 0xc0471cb8, + 0x00000040, 0xbf8cc07f, + 0xc04b1d38, 0x00000048, + 0xbf8cc07f, 0xc0431e78, + 0x00000058, 0xbf8cc07f, + 0xc0471eb8, 0x0000005c, + 0xbf8cc07f, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x8670ff7f, + 0x08000000, 0x8f708370, + 0x87777077, 0x8670ff7f, + 0x70000000, 0x8f708170, + 0x87777077, 0xbefb007c, + 0xbefa0080, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f01605, 0x80708170, + 0x8e708670, 0x807a707a, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc007a, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611b7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611bfa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611e3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8f1f803, + 0xbefe007c, 0xbefc007a, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611a3a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611a7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8fbf801, + 0xbefe007c, 0xbefc007a, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0x8670ff7f, + 0x04000000, 0xbeef0080, + 0x876f6f70, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f11605, 0x80718171, + 0x8e718471, 0x8e768271, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747a74, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a717c, 0xbf85ffe7, + 0xbef40172, 0xbefa0080, + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0xe0724000, 0x7a1d0000, + 0xe0724100, 0x7a1d0100, + 0xe0724200, 0x7a1d0200, + 0xe0724300, 0x7a1d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8f14306, 0x8671c171, + 0xbf84002c, 0xbf8a0000, + 0x8670ff6f, 0x04000000, + 0xbf840028, 0x8e718671, + 0x8e718271, 0xbef60071, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f01605, + 0x80708170, 0x8e708670, + 0x807a707a, 0x807aff7a, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xd28c0002, 0x000100c1, + 0xd28d0003, 0x000204c1, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x7a1d0002, + 0x68040702, 0xd0c9006a, + 0x0000e302, 0xbf87fff7, + 0xbef70000, 0xbefa00ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8f12a05, + 0x80718171, 0x8e718271, + 0x8e768871, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a717c, 0xbf840015, + 0xbf11017c, 0x8071ff71, + 0x00001000, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0xe0724000, + 0x7a1d0000, 0xe0724100, + 0x7a1d0100, 0xe0724200, + 0x7a1d0200, 0xe0724300, + 0x7a1d0300, 0x807c847c, + 0x807aff7a, 0x00000400, + 0xbf0a717c, 0xbf85ffef, + 0xbf9c0000, 0xbf8200d9, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x08000000, + 0x8f6e836e, 0x87776e77, + 0x866eff7f, 0x70000000, + 0x8f6e816e, 0x87776e77, + 0x866eff7f, 0x04000000, + 0xbf84001e, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf840019, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbef80080, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x8e76886f, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0x806fff6f, + 0x00008000, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 0xe0524300, 0x6e1d0300, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, + 0xbef600ff, 0x01000000, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0x866dff6d, 0x0000ffff, + 0xbefc006f, 0xbefe007a, + 0xbeff007b, 0x866f71ff, + 0x000003ff, 0xb96f4803, + 0x866f71ff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8ef1605, 0x806f816f, + 0x8e6f866f, 0x806e6f6e, + 0x806e746e, 0x826f8075, + 0x866fff6f, 0x0000ffff, + 0xc0071cb7, 0x00000040, + 0xc00b1d37, 0x00000048, + 0xc0031e77, 0x00000058, + 0xc0071eb7, 0x0000005c, + 0xbf8cc07f, 0x866fff6d, + 0xf0000000, 0x8f6f9c6f, + 0x8e6f906f, 0xbeee0080, + 0x876e6f6e, 0x866fff6d, + 0x08000000, 0x8f6f9b6f, + 0x8e6f8f6f, 0x876e6f6e, + 0x866fff70, 0x00800000, + 0x8f6f976f, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb970f802, 0xbf8a0000, + 0x95806f6c, 0xbf810000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index 34eabcdd27a0..658a4c6be8e4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -20,9 +20,12 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#if 0 -HW (VI) source code for CWSR trap handler -#Version 18 + multiple trap handler +/* To compile this assembly code: + * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex + */ + +/* HW (VI) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ // this performance-optimal version was originally from Seven Xu at SRDC @@ -150,7 +153,7 @@ var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = exec_hi //tba_lo and tba_hi need to be saved/restored -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 @@ -1132,259 +1135,3 @@ end function get_hwreg_size_bytes return 128 //HWREG size 128 bytes end - - -#endif - -static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf820125, - 0xb8f4f802, 0x89748674, - 0xb8f5f803, 0x8675ff75, - 0x00000400, 0xbf850011, - 0xc00a1e37, 0x00000000, - 0xbf8c007f, 0x87777978, - 0xbf840002, 0xb974f802, - 0xbe801d78, 0xb8f5f803, - 0x8675ff75, 0x000001ff, - 0xbf850002, 0x80708470, - 0x82718071, 0x8671ff71, - 0x0000ffff, 0xb974f802, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, - 0xb97a0283, 0xbef60068, - 0xbef70069, 0xb8fa1c07, - 0x8e7a9c7a, 0x87717a71, - 0xb8fa03c7, 0x8e7a9b7a, - 0x87717a71, 0xb8faf807, - 0x867aff7a, 0x00007fff, - 0xb97af807, 0xbef2007e, - 0xbef3007f, 0xbefe0180, - 0xbf900004, 0x877a8474, - 0xb97af802, 0xbf8e0002, - 0xbf88fffe, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x867aff7f, - 0x08000000, 0x8f7a837a, - 0x877b7a7b, 0x867aff7f, - 0x70000000, 0x8f7a817a, - 0x877b7a7b, 0xbeef007c, - 0xbeee0080, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cbc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611d3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xb8f5f803, - 0xbefe007c, 0xbefc006e, - 0xc0611d7c, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dbc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dfc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xb8eff801, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbef30080, - 0x8773737a, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8f51605, 0x80758175, - 0x8e758475, 0x8e7a8275, - 0xbefa00ff, 0x01000000, - 0xbef60178, 0x80786e78, - 0x82798079, 0xbefc0080, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003c, 0x00000000, - 0xc06b013c, 0x00000010, - 0xc06b023c, 0x00000020, - 0xc06b033c, 0x00000030, - 0x8078c078, 0x82798079, - 0x807c907c, 0xbf0a757c, - 0xbf85ffeb, 0xbef80176, - 0xbeee0080, 0xbefe00c1, - 0xbeff00c1, 0xbefa00ff, - 0x01000000, 0xe0724000, - 0x6e1e0000, 0xe0724100, - 0x6e1e0100, 0xe0724200, - 0x6e1e0200, 0xe0724300, - 0x6e1e0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f54306, - 0x8675c175, 0xbf84002c, - 0xbf8a0000, 0x867aff73, - 0x04000000, 0xbf840028, - 0x8e758675, 0x8e758275, - 0xbefa0075, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0x806eff6e, 0x00000080, - 0xbefa00ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe80007b, - 0x867bff7b, 0xff7fffff, - 0x877bff7b, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8c007f, 0xe0765000, - 0x6e1e0002, 0x32040702, - 0xd0c9006a, 0x0000eb02, - 0xbf87fff7, 0xbefb0000, - 0xbeee00ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8f52a05, 0x80758175, - 0x8e758275, 0x8e7a8875, - 0xbefa00ff, 0x01000000, - 0xbefc0084, 0xbf0a757c, - 0xbf840015, 0xbf11017c, - 0x8075ff75, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x6e1e0000, - 0xe0724100, 0x6e1e0100, - 0xe0724200, 0x6e1e0200, - 0xe0724300, 0x6e1e0300, - 0x807c847c, 0x806eff6e, - 0x00000400, 0xbf0a757c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200ca, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x877b767b, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x877b767b, 0x8676ff7f, - 0x04000000, 0xbf84001e, - 0xbefe00c1, 0xbeff00c1, - 0xb8f34306, 0x8673c173, - 0xbf840019, 0x8e738673, - 0x8e738273, 0xbefa0073, - 0xb8f22a05, 0x80728172, - 0x8e728a72, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x80727672, 0x8072ff72, - 0x00000080, 0xbefa00ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x721e0000, - 0xe0510100, 0x721e0000, - 0x807cff7c, 0x00000200, - 0x8072ff72, 0x00000200, - 0xbf0a737c, 0xbf85fff6, - 0xbef20080, 0xbefe00c1, - 0xbeff00c1, 0xb8f32a05, - 0x80738173, 0x8e738273, - 0x8e7a8873, 0xbefa00ff, - 0x01000000, 0xbef60072, - 0x8072ff72, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x8073ff73, 0x00008000, - 0xe0524000, 0x721e0000, - 0xe0524100, 0x721e0100, - 0xe0524200, 0x721e0200, - 0xe0524300, 0x721e0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8072ff72, 0x00000400, - 0xbf0a737c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x761e0000, 0xe0524100, - 0x761e0100, 0xe0524200, - 0x761e0200, 0xe0524300, - 0x761e0300, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0x80f2c072, 0xb8f31605, - 0x80738173, 0x8e738473, - 0x8e7a8273, 0xbefa00ff, - 0x01000000, 0xbefc0073, - 0xc031003c, 0x00000072, - 0x80f2c072, 0xbf8c007f, - 0x80fc907c, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff1, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xc0211cfc, - 0x00000072, 0x80728472, - 0xc0211c3c, 0x00000072, - 0x80728472, 0xc0211c7c, - 0x00000072, 0x80728472, - 0xc0211bbc, 0x00000072, - 0x80728472, 0xc0211bfc, - 0x00000072, 0x80728472, - 0xc0211d3c, 0x00000072, - 0x80728472, 0xc0211d7c, - 0x00000072, 0x80728472, - 0xc0211a3c, 0x00000072, - 0x80728472, 0xc0211a7c, - 0x00000072, 0x80728472, - 0xc0211dfc, 0x00000072, - 0x80728472, 0xc0211b3c, - 0x00000072, 0x80728472, - 0xc0211b7c, 0x00000072, - 0x80728472, 0xbf8c007f, - 0x8671ff71, 0x0000ffff, - 0xbefc0073, 0xbefe006e, - 0xbeff006f, 0x867375ff, - 0x000003ff, 0xb9734803, - 0x867375ff, 0xfffff800, - 0x8f738b73, 0xb973a2c3, - 0xb977f801, 0x8673ff71, - 0xf0000000, 0x8f739c73, - 0x8e739073, 0xbef60080, - 0x87767376, 0x8673ff71, - 0x08000000, 0x8f739b73, - 0x8e738f73, 0x87767376, - 0x8673ff74, 0x00800000, - 0x8f739773, 0xb976f807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb974f802, 0xbf8a0000, - 0x95807370, 0xbf810000, -}; - diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index cac8d4992e04..065f55ae9e41 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -20,9 +20,12 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#if 0 -HW (GFX9) source code for CWSR trap handler -#Version 18 + multiple trap handler +/* To compile this assembly code: + * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex + */ + +/* HW (GFX9) source code for CWSR trap handler */ +/* Version 18 + multiple trap handler */ // this performance-optimal version was originally from Seven Xu at SRDC @@ -151,7 +154,7 @@ var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = exec_hi -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 @@ -1210,292 +1213,3 @@ function ack_sqc_store_workaround s_waitcnt lgkmcnt(0) end end - - -#endif - -static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf82015a, - 0xb8f8f802, 0x89788678, - 0xb8f1f803, 0x866eff71, - 0x00000400, 0xbf850034, - 0x866eff71, 0x00000800, - 0xbf850003, 0x866eff71, - 0x00000100, 0xbf840008, - 0x866eff78, 0x00002000, - 0xbf840001, 0xbf810000, - 0x8778ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xb8eef807, 0x866fff6e, - 0x001f8000, 0x8e6f8b6f, - 0x8977ff77, 0xfc000000, - 0x87776f77, 0x896eff6e, - 0x001f8000, 0xb96ef807, - 0xb8f0f812, 0xb8f1f813, - 0x8ef08870, 0xc0071bb8, - 0x00000000, 0xbf8cc07f, - 0xc0071c38, 0x00000008, - 0xbf8cc07f, 0x86ee6e6e, - 0xbf840001, 0xbe801d6e, - 0xb8f1f803, 0x8671ff71, - 0x000001ff, 0xbf850002, - 0x806c846c, 0x826d806d, - 0x866dff6d, 0x0000ffff, - 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb978f802, 0xbe801f6c, - 0x866dff6d, 0x0000ffff, - 0xbef00080, 0xb9700283, - 0xb8f02407, 0x8e709c70, - 0x876d706d, 0xb8f003c7, - 0x8e709b70, 0x876d706d, - 0xb8f0f807, 0x8670ff70, - 0x00007fff, 0xb970f807, - 0xbeee007e, 0xbeef007f, - 0xbefe0180, 0xbf900004, - 0x87708478, 0xb970f802, - 0xbf8e0002, 0xbf88fffe, - 0xb8f02a05, 0x80708170, - 0x8e708a70, 0xb8f11605, - 0x80718171, 0x8e718671, - 0x80707170, 0x80707e70, - 0x8271807f, 0x8671ff71, - 0x0000ffff, 0xc0471cb8, - 0x00000040, 0xbf8cc07f, - 0xc04b1d38, 0x00000048, - 0xbf8cc07f, 0xc0431e78, - 0x00000058, 0xbf8cc07f, - 0xc0471eb8, 0x0000005c, - 0xbf8cc07f, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x8670ff7f, - 0x08000000, 0x8f708370, - 0x87777077, 0x8670ff7f, - 0x70000000, 0x8f708170, - 0x87777077, 0xbefb007c, - 0xbefa0080, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f01605, 0x80708170, - 0x8e708670, 0x807a707a, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611b3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b7a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611bba, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bfa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611e3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xb8f1f803, - 0xbefe007c, 0xbefc007a, - 0xc0611c7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611a3a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611a7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xb8fbf801, - 0xbefe007c, 0xbefc007a, - 0xc0611efa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0x8670ff7f, - 0x04000000, 0xbeef0080, - 0x876f6f70, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f11605, 0x80718171, - 0x8e718471, 0x8e768271, - 0xbef600ff, 0x01000000, - 0xbef20174, 0x80747a74, - 0x82758075, 0xbefc0080, - 0xbf800000, 0xbe802b00, - 0xbe822b02, 0xbe842b04, - 0xbe862b06, 0xbe882b08, - 0xbe8a2b0a, 0xbe8c2b0c, - 0xbe8e2b0e, 0xc06b003a, - 0x00000000, 0xbf8cc07f, - 0xc06b013a, 0x00000010, - 0xbf8cc07f, 0xc06b023a, - 0x00000020, 0xbf8cc07f, - 0xc06b033a, 0x00000030, - 0xbf8cc07f, 0x8074c074, - 0x82758075, 0x807c907c, - 0xbf0a717c, 0xbf85ffe7, - 0xbef40172, 0xbefa0080, - 0xbefe00c1, 0xbeff00c1, - 0xbee80080, 0xbee90080, - 0xbef600ff, 0x01000000, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, - 0xbefe00c1, 0xbeff00c1, - 0xb8f14306, 0x8671c171, - 0xbf84002c, 0xbf8a0000, - 0x8670ff6f, 0x04000000, - 0xbf840028, 0x8e718671, - 0x8e718271, 0xbef60071, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f01605, - 0x80708170, 0x8e708670, - 0x807a707a, 0x807aff7a, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xd28c0002, 0x000100c1, - 0xd28d0003, 0x000204c1, - 0xd1060002, 0x00011103, - 0x7e0602ff, 0x00000200, - 0xbefc00ff, 0x00010000, - 0xbe800077, 0x8677ff77, - 0xff7fffff, 0x8777ff77, - 0x00058000, 0xd8ec0000, - 0x00000002, 0xbf8cc07f, - 0xe0765000, 0x7a1d0002, - 0x68040702, 0xd0c9006a, - 0x0000e302, 0xbf87fff7, - 0xbef70000, 0xbefa00ff, - 0x00000400, 0xbefe00c1, - 0xbeff00c1, 0xb8f12a05, - 0x80718171, 0x8e718271, - 0x8e768871, 0xbef600ff, - 0x01000000, 0xbefc0084, - 0xbf0a717c, 0xbf840015, - 0xbf11017c, 0x8071ff71, - 0x00001000, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0x807c847c, - 0x807aff7a, 0x00000400, - 0xbf0a717c, 0xbf85ffef, - 0xbf9c0000, 0xbf8200d9, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x866eff7f, 0x08000000, - 0x8f6e836e, 0x87776e77, - 0x866eff7f, 0x70000000, - 0x8f6e816e, 0x87776e77, - 0x866eff7f, 0x04000000, - 0xbf84001e, 0xbefe00c1, - 0xbeff00c1, 0xb8ef4306, - 0x866fc16f, 0xbf840019, - 0x8e6f866f, 0x8e6f826f, - 0xbef6006f, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xe0510000, - 0x781d0000, 0xe0510100, - 0x781d0000, 0x807cff7c, - 0x00000200, 0x8078ff78, - 0x00000200, 0xbf0a6f7c, - 0xbf85fff6, 0xbef80080, - 0xbefe00c1, 0xbeff00c1, - 0xb8ef2a05, 0x806f816f, - 0x8e6f826f, 0x8e76886f, - 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefc0084, - 0xbf11087c, 0x806fff6f, - 0x00008000, 0xe0524000, - 0x781d0000, 0xe0524100, - 0x781d0100, 0xe0524200, - 0x781d0200, 0xe0524300, - 0x781d0300, 0xbf8c0f70, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0x807c847c, 0x8078ff78, - 0x00000400, 0xbf0a6f7c, - 0xbf85ffee, 0xbf9c0000, - 0xe0524000, 0x6e1d0000, - 0xe0524100, 0x6e1d0100, - 0xe0524200, 0x6e1d0200, - 0xe0524300, 0x6e1d0300, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x80f8c078, - 0xb8ef1605, 0x806f816f, - 0x8e6f846f, 0x8e76826f, - 0xbef600ff, 0x01000000, - 0xbefc006f, 0xc031003a, - 0x00000078, 0x80f8c078, - 0xbf8cc07f, 0x80fc907c, - 0xbf800000, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff0, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0xbef60084, 0xbef600ff, - 0x01000000, 0xc0211bfa, - 0x00000078, 0x80788478, - 0xc0211b3a, 0x00000078, - 0x80788478, 0xc0211b7a, - 0x00000078, 0x80788478, - 0xc0211eba, 0x00000078, - 0x80788478, 0xc0211efa, - 0x00000078, 0x80788478, - 0xc0211c3a, 0x00000078, - 0x80788478, 0xc0211c7a, - 0x00000078, 0x80788478, - 0xc0211a3a, 0x00000078, - 0x80788478, 0xc0211a7a, - 0x00000078, 0x80788478, - 0xc0211cfa, 0x00000078, - 0x80788478, 0xbf8cc07f, - 0x866dff6d, 0x0000ffff, - 0xbefc006f, 0xbefe007a, - 0xbeff007b, 0x866f71ff, - 0x000003ff, 0xb96f4803, - 0x866f71ff, 0xfffff800, - 0x8f6f8b6f, 0xb96fa2c3, - 0xb973f801, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8ef1605, 0x806f816f, - 0x8e6f866f, 0x806e6f6e, - 0x806e746e, 0x826f8075, - 0x866fff6f, 0x0000ffff, - 0xc0071cb7, 0x00000040, - 0xc00b1d37, 0x00000048, - 0xc0031e77, 0x00000058, - 0xc0071eb7, 0x0000005c, - 0xbf8cc07f, 0x866fff6d, - 0xf0000000, 0x8f6f9c6f, - 0x8e6f906f, 0xbeee0080, - 0x876e6f6e, 0x866fff6d, - 0x08000000, 0x8f6f9b6f, - 0x8e6f8f6f, 0x876e6f6e, - 0x866fff70, 0x00800000, - 0x8f6f976f, 0xb96ef807, - 0x86fe7e7e, 0x86ea6a6a, - 0xb970f802, 0xbf8a0000, - 0x95806f6c, 0xbf810000, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c1d9e2772cbc..7ee6cec2c060 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -26,8 +26,7 @@ #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" -#include "cwsr_trap_handler_gfx8.asm" -#include "cwsr_trap_handler_gfx9.asm" +#include "cwsr_trap_handler.h" #include "kfd_iommu.h" #define MQD_SIZE_ALIGNED 768 -- cgit v1.2.3