diff options
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/hsa.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll | 43 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll | 37 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 184 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/local-memory.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll | 293 |
12 files changed, 406 insertions, 195 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index ab87fdbc00d..d9bb586163d 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -38,8 +38,10 @@ ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0 +; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 5e8cf5bb3d2..c348a2e7980 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,31 +1,46 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s -; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment ; ALL-LABEL: {{^}}large_alloca_compute_shader: -; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s14, -1 -; CI: s_mov_b32 s15, 0x80f000 -; VI: s_mov_b32 s15, 0x800000 +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN: s_mov_b32 s10, -1 +; CI: s_mov_b32 s11, 0x80f000 +; VI: s_mov_b32 s11, 0x800000 ; GCNHSA: .amd_kernel_code_t + +; GCNHSA: compute_pgm_rsrc2_scratch_en = 1 +; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6 +; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1 +; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0 +; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0 +; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0 +; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 + +; GCNHSA: enable_sgpr_private_segment_buffer = 1 +; GCNHSA: enable_sgpr_dispatch_ptr = 0 +; GCNHSA: enable_sgpr_queue_ptr = 0 +; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1 +; GCNHSA: enable_sgpr_dispatch_id = 0 +; GCNHSA: enable_sgpr_flat_scratch_init = 0 +; GCNHSA: enable_sgpr_private_segment_size = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0 +; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCNHSA: workitem_private_segment_byte_size = 0 ; GCNHSA: private_segment_alignment = 4 ; GCNHSA: .end_amd_kernel_code_t -; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCNHSA: s_mov_b32 s10, -1 -; CIHSA: s_mov_b32 s11, 0x180f000 -; VIHSA: s_mov_b32 s11, 0x11800000 -; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen -; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen ; Scratch size = alloca size + emergency stack slot ; ALL: ; ScratchSize: 32772 diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll index 208b9a10050..141ee256015 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -8,8 +8,8 @@ ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { @@ -29,8 +29,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { ; CI: s_mov_b32 s11, 0x80f000 ; VI: s_mov_b32 s11, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll new file mode 100644 index 00000000000..6dc9d050eee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}read_workdim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +define void @read_workdim(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}read_workdim_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN-NOT: 0xff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @read_workdim_known_bits(i32 addrspace(1)* %out) { +entry: + %dim = call i32 @llvm.AMDGPU.read.workdim() #0 + %shl = shl i32 %dim, 24 + %shr = lshr i32 %shl, 24 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 719f7ffe0f1..dc95cd1ee01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 -; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index c5aba2b76b8..cc109327d92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_debug_value: -; CHECK: s_load_dwordx2 -; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1 +; CHECK: s_load_dwordx2 s[4:5] +; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 ; CHECK: buffer_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll new file mode 100644 index 00000000000..f2a7256e812 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -0,0 +1,184 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1 +; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4 + +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xy: +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xy(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %val = mul i32 %x, %y + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xz: + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %x, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_yz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_yz(i32 addrspace(1)* %out) { +entry: + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %val = mul i32 %y, %z + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_xyz: +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 1 + +; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 +; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c +; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 +; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] +; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] +; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] +; GCN: buffer_store_dword [[VAL]] +define void @local_size_xyz(i32 addrspace(1)* %out) { +entry: + %x = call i32 @llvm.r600.read.local.size.x() #0 + %y = call i32 @llvm.r600.read.local.size.y() #0 + %z = call i32 @llvm.r600.read.local.size.z() #0 + %xy = mul i32 %x, %y + %xyz = add i32 %xy, %z + store i32 %xyz, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.x() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.y() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z_known_bits: +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN-NOT: 0xffff +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NEXT: buffer_store_dword [[VVAL]] +define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +entry: + %size = call i32 @llvm.r600.read.local.size.z() #0 + %shl = shl i32 %size, 16 + %shr = lshr i32 %shl, 16 + store i32 %shr, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll index 7f31ef45b62..6b52b80ba08 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -10,7 +10,7 @@ ; EG: .long 166120 ; EG-NEXT: .long 8 ; GCN: .long 47180 -; GCN-NEXT: .long 38792 +; GCN-NEXT: .long 32900 ; EG: {{^}}local_memory_two_objects: diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll index 9494ed75bd0..9ffb59e7092 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll @@ -9,9 +9,9 @@ ; EG: .long 166120 ; EG-NEXT: .long 128 ; SI: .long 47180 -; SI-NEXT: .long 71560 +; SI-NEXT: .long 65668 ; CI: .long 47180 -; CI-NEXT: .long 38792 +; CI-NEXT: .long 32900 ; FUNC-LABEL: {{^}}local_memory: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 2cbb67d85fb..cd7c78f408d 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -17,16 +17,18 @@ declare i32 @llvm.r600.read.tgid.z() #1 ; GCN-LABEL: {{^}}spill_vgpr_compute: +; GCN: s_mov_b32 s16, s3 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s15, 0x80f000 ; VI-NEXT: s_mov_b32 s15, 0x800000 -; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index ef492cef576..16abb89bb0b 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -11,14 +11,14 @@ ; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s11, 0x80f000 -; VI-NEXT: s_mov_b32 s11, 0x800000 +; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s15, 0x80f000 +; VI-NEXT: s_mov_b32 s15, 0x800000 ; s12 is offset user SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 diff --git a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll index ebe73429da9..a704a23b0f9 100644 --- a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -1,5 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -7,9 +9,26 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].X -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; HSA: .amd_kernel_code_t + +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; HSA: .end_amd_kernel_code_t + + +; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] + define void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 @@ -21,10 +40,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 @@ -36,10 +55,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -51,10 +70,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -66,10 +85,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -81,10 +100,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,33 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. +; The tgid values are stored in sgprs offset by the number of user +; sgprs. ; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; HSA: .amd_kernel_code_t +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; HSA: .end_amd_kernel_code_t + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -167,9 +145,25 @@ entry: } ; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 1 +; HSA: compute_pgm_rsrc2_tgid_z_en = 0 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 +; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -177,102 +171,83 @@ entry: } ; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; HSA: compute_pgm_rsrc2_user_sgpr = 6 +; HSA: compute_pgm_rsrc2_tgid_x_en = 1 +; HSA: compute_pgm_rsrc2_tgid_y_en = 0 +; HSA: compute_pgm_rsrc2_tgid_z_en = 1 +; HSA: compute_pgm_rsrc2_tg_size_en = 0 +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 +; HSA: enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}local_size_x_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.x() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.y() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.z() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN-NOT: 0xff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim_known_bits(i32 addrspace(1)* %out) { -entry: - %dim = call i32 @llvm.AMDGPU.read.workdim() #0 - %shl = shl i32 %dim, 24 - %shr = lshr i32 %shl, 24 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - declare i32 @llvm.r600.read.ngroups.x() #0 declare i32 @llvm.r600.read.ngroups.y() #0 declare i32 @llvm.r600.read.ngroups.z() #0 @@ -281,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0 |