diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-04-12 20:48:56 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-04-12 20:48:56 +0000 |
commit | c90347d7604f0a036e74c55db8cf117886ae9df6 (patch) | |
tree | 642c8823531a186b7e195152fbe508f82e9514b6 /llvm/test | |
parent | 04aee46779f062ed45245b9b675d1b3bd4224ae3 (diff) | |
download | bcm5719-llvm-c90347d7604f0a036e74c55db8cf117886ae9df6.tar.gz bcm5719-llvm-c90347d7604f0a036e74c55db8cf117886ae9df6.zip |
[AMDGPU] Generate range metadata for workitem id
If the workgroup size is known, inform LLVM about the value range returned by the local
id and local size queries.
Differential Revision: https://reviews.llvm.org/D31804
llvm-svn: 300102
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/add.i16.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 20 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/private-memory-r600.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/shl.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sub.i16.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/zext-lid.ll | 83 |
14 files changed, 141 insertions, 55 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll index b65e79f14de..3b274c9d202 100644 --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -84,10 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 97cb9067f29..1f4b1eaa209 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -27,8 +27,6 @@ ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 ; HSA-PROMOTE: .end_amd_kernel_code_t -; FIXME: These should be merged -; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1 ; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2 ; SI-PROMOTE: ds_write_b32 @@ -58,9 +56,9 @@ ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 -; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1 -; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1 -; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1 +; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range 
!2 +; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2 +; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]] ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]] @@ -77,9 +75,9 @@ ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0 -; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1 +; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 @@ -557,6 +555,8 @@ entry: attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } ; HSAOPT: !0 = !{} -; HSAOPT: !1 = !{i32 0, i32 2048} +; HSAOPT: !1 = !{i32 0, i32 257} +; HSAOPT: !2 = !{i32 0, i32 256} -; NOHSAOPT: !0 = !{i32 0, i32 2048} +; NOHSAOPT: !0 = !{i32 0, i32 257} +; NOHSAOPT: !1 = !{i32 0, i32 256} diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index 5e39a6c6774..c23cc1c88b5 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_ubfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) 
#1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_sbfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll index fc85ec06f58..3dfdaf3936a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -150,7 +150,7 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1 ; Do scalar loads into the super register we need. ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { @@ -173,7 +173,7 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-NOT: v_mov +; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> 
addrspace(1)* %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 405534ea4b5..56966a19cf7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -234,8 +234,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %p } ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -248,8 +248,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4 } ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -355,8 +355,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { @@ -370,8 +370,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 8334c0c357b..3d64f93db2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -206,8 +206,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* } ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { @@ -221,8 +221,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace } ; GCN-LABEL: 
{{^}}global_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { @@ -348,8 +348,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %p } ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -362,8 +362,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4 } ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64: -; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 -; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index 4ce9208eadd..47b6558241b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -45,11 +45,7 
@@ entry: ; GCN-LABEL: {{^}}local_memory_two_objects: ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 - -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] - -; SI-DAG: ds_write_b32 [[ADDRW]], -; SI-DAG: ds_write_b32 [[ADDRW_OFF]], +; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; GCN: s_barrier diff --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll index 4e0ecc0565e..6f5f4ca13b5 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -14,8 +14,8 @@ entry: } ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range: -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 @@ -26,8 +26,8 @@ entry: ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1: ; CHECK-NOT: v0 -; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0 -; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] +; CHECK-NOT: v_and_b32 +; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll index d07a0a02cba..866cd16ec3b 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -12,9 +12,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; OPT: call i32 @llvm.r600.read.local.size.y(), 
!range !0 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0 -; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0 +; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1 +; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: @@ -295,6 +295,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ret void } -; OPT: !0 = !{i32 0, i32 2048} +; OPT: !0 = !{i32 0, i32 257} +; OPT: !1 = !{i32 0, i32 256} attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index b3cb19ad05e..0a29db4a058 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -12,7 +12,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -56,7 +56,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call 
i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x %ld.64 = load i128, i128 addrspace(1)* %in.gep @@ -113,5 +113,7 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 744c1c2b682..6f5fc6d0f38 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 add ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -58,7 +58,7 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addr ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 
v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 add ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -122,7 +122,7 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -138,7 +138,7 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 
@llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -156,7 +156,7 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x %ld.64 = load i64, i64 addrspace(1)* %in.gep @@ -383,5 +383,7 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 add declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index f6520eeb4fd..ff666cc3653 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -4,6 +4,8 @@ declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tgid.x() #0 + ;EG: {{^}}shl_v2i32: ;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -288,7 +290,7 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tid = call i32 @llvm.r600.read.tgid.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep.in diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll index ada72140563..6642411f7a6 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -85,10 +85,10 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll new file mode 100644 index 00000000000..8eeff53ff99 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s + +; CHECK-NOT: and_b32 + +; OPT-LABEL: @zext_grp_size_128 +; OPT: tail call i32 
@llvm.amdgcn.workitem.id.x() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0 +define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 127 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 127 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 127 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_32x4x1 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4 +define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 31 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 3 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 1 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +; OPT-LABEL: @zext_grp_size_512 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5 +define amdgpu_kernel void 
@zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp1 = and i32 %tmp, 65535 + store i32 %tmp1, i32 addrspace(1)* %arg, align 4 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp3 = and i32 %tmp2, 65535 + %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp6 = and i32 %tmp5, 65535 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #2 + +declare i32 @llvm.amdgcn.workitem.id.y() #2 + +declare i32 @llvm.amdgcn.workitem.id.z() #2 + +attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" } +attributes #2 = { nounwind readnone } + +!0 = !{i32 32, i32 4, i32 1} + +; OPT: !0 = !{i32 0, i32 128} +; OPT: !1 = !{i32 32, i32 4, i32 1} +; OPT: !2 = !{i32 0, i32 32} +; OPT: !3 = !{i32 0, i32 4} +; OPT: !4 = !{i32 0, i32 1} +; OPT: !5 = !{i32 0, i32 512} |