diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-04-12 20:48:56 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-04-12 20:48:56 +0000 |
commit | c90347d7604f0a036e74c55db8cf117886ae9df6 (patch) | |
tree | 642c8823531a186b7e195152fbe508f82e9514b6 /llvm/test | |
parent | 04aee46779f062ed45245b9b675d1b3bd4224ae3 (diff) | |
download | bcm5719-llvm-c90347d7604f0a036e74c55db8cf117886ae9df6.tar.gz bcm5719-llvm-c90347d7604f0a036e74c55db8cf117886ae9df6.zip |
[AMDGPU] Generate range metadata for workitem id
If the workgroup size is known, inform LLVM about the value range returned by the local
id and local size queries.
Differential Revision: https://reviews.llvm.org/D31804
llvm-svn: 300102
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/add.i16.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 20 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/private-memory-r600.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shl.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sub.i16.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/zext-lid.ll | 83 |
14 files changed, 141 insertions, 55 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll index b65e79f14de..3b274c9d202 100644 --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -84,10 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 97cb9067f29..1f4b1eaa209 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -27,8 +27,6 @@ ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 ; HSA-PROMOTE: .end_amd_kernel_code_t -; FIXME: These should be merged -; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1 ; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2 ; SI-PROMOTE: ds_write_b32 @@ -58,9 +56,9 @@ ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 -; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1 -; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1 -; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1 +; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range 
!2 +; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2 +; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]] ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]] @@ -77,9 +75,9 @@ ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 @@ -557,6 +555,8 @@ entry: attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } ; HSAOPT: !0 = !{} -; HSAOPT: !1 = !{i32 0, i32 2048} +; HSAOPT: !1 = !{i32 0, i32 257} +; HSAOPT: !2 = !{i32 0, i32 256} -; NOHSAOPT: !0 = !{i32 0, i32 2048} +; NOHSAOPT: !0 = !{i32 0, i32 257} +; NOHSAOPT: !1 = !{i32 0, i32 256} diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index 5e39a6c6774..c23cc1c88b5 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_ubfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) 
#1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_sbfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll index fc85ec06f58..3dfdaf3936a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -150,7 +150,7 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1 ; Do scalar loads into the super register we need. ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { @@ -173,7 +173,7 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> 
addrspace(1)* %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 405534ea4b5..56966a19cf7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -234,8 +234,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %p } ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -248,8 +248,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4 } ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -355,8 +355,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { @@ -370,8 +370,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 8334c0c357b..3d64f93db2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -206,8 +206,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* } ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { @@ -221,8 +221,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace } ; GCN-LABEL: 
{{^}}global_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { @@ -348,8 +348,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %p } ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -362,8 +362,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4 } ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index 4ce9208eadd..47b6558241b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -45,11 +45,7 
@@ entry: ; GCN-LABEL: {{^}}local_memory_two_objects: ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 - -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] - -; SI-DAG: ds_write_b32 [[ADDRW]], -; SI-DAG: ds_write_b32 [[ADDRW_OFF]], +; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; GCN: s_barrier diff --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll index 4e0ecc0565e..6f5f4ca13b5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -14,8 +14,8 @@ entry: } ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range: -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 @@ -26,8 +26,8 @@ entry: ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1: ; CHECK-NOT: v0 -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll index d07a0a02cba..866cd16ec3b 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -12,9 +12,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; OPT: call i32 @llvm.r600.read.local.size.y(), 
!range !0 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0 +; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: @@ -295,6 +295,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ret void } -; OPT: !0 = !{i32 0, i32 2048} +; OPT: !0 = !{i32 0, i32 257} +; OPT: !1 = !{i32 0, i32 256} attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index b3cb19ad05e..0a29db4a058 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -12,7 +12,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -56,7 +56,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call 
i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -113,5 +113,7 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 744c1c2b682..6f5fc6d0f38 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 add ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -58,7 +58,7 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addr ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 
v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 add ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -122,7 +122,7 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -138,7 +138,7 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 
@llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -156,7 +156,7 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -383,5 +383,7 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 add declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index f6520eeb4fd..ff666cc3653 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -4,6 +4,8 @@ declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tgid.x() #0 + ;EG: {{^}}shl_v2i32: ;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -288,7 +290,7 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tid = call i32 @llvm.r600.read.tgid.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep.in diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll index ada72140563..6642411f7a6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -85,10 +85,10 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll new file mode 100644 index 00000000000..8eeff53ff99 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s + +; CHECK-NOT: and_b32 + +; OPT-LABEL: @zext_grp_size_128 +; OPT: tail call i32 
@llvm.amdgcn.workitem.id.x() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0 +define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 127 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 127 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 127 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_32x4x1 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4 +define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 31 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 3 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 1 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_512 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5 +define amdgpu_kernel void 
@zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 65535 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 65535 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 65535 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 + +declare i32 @llvm.amdgcn.workitem.id.y() #2 + +declare i32 @llvm.amdgcn.workitem.id.z() #2 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" } +attributes #2 = { nounwind readnone } + +!0 = !{i32 32, i32 4, i32 1} + +; OPT: !0 = !{i32 0, i32 128} +; OPT: !1 = !{i32 32, i32 4, i32 1} +; OPT: !2 = !{i32 0, i32 32} +; OPT: !3 = !{i32 0, i32 4} +; OPT: !4 = !{i32 0, i32 1} +; OPT: !5 = !{i32 0, i32 512} |