AMDGPU: Split MUBUF offset into aligned components

Summary: Atomic buffer operations do not work (and trap on gfx9) when the components are unaligned, even if their sum is aligned. Previously, we generated an offset of 4156 without an SGPR by splitting it as 4095 + 61 (immediate + inline constant). The highest offset for which we can do this correctly is 4156 = 4092 + 64. Fixes dEQP-GLES31.functional.ssbo.atomic.* Reviewers: arsenm Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D37850 llvm-svn: 315302
author: Nicolai Haehnle <nhaehnle@gmail.com> 2017-10-10 12:22:23 +0000
committer: Nicolai Haehnle <nhaehnle@gmail.com> 2017-10-10 12:22:23 +0000
commit: 312b64f4d703947779ce56abdcc0d59741ff99df (patch)
tree: 9eb7bccd2f72bdf3326848767801ab20070931af /llvm/test
parent: 0f22a06b4de75d4d02741b9f39d39df1cd4e8066 (diff)
download: bcm5719-llvm-312b64f4d703947779ce56abdcc0d59741ff99df.tar.gz
bcm5719-llvm-312b64f4d703947779ce56abdcc0d59741ff99df.zip
3 files changed, 25 insertions, 25 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
index 98f7058b5ef..b6f72a114d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -3,7 +3,7 @@
 
 ;CHECK-LABEL: {{^}}test1:
 ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
 ;CHECK: s_waitcnt vmcnt(0)
@@ -14,7 +14,7 @@
 ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
 ;CHECK-DAG: s_waitcnt vmcnt(0)
 ;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
+;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
 define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
@@ -71,24 +71,24 @@ main_body:
 ;CHECK-LABEL: {{^}}test3:
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
 ;CHECK: s_waitcnt vmcnt(0)
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
 ;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
 ;CHECK-DAG: s_waitcnt vmcnt(0)
 ;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
-;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
+;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
 define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
 main_body:
   %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
   %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
   %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
   %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
-  %ofs.5 = add i32 %voffset, 42
+  %ofs.5 = add i32 %voffset, 44
   %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
   %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
index 9cb9f25520b..d5159934d3f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -27,20 +27,20 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c
+;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038
 ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
 ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
-;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
+;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
 ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
+;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 main_body:
-  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
-  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
+  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0)
+  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0)
   %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
   %d.3 = fadd <4 x float> %d.0, %d.1
   %data = fadd <4 x float> %d.2, %d.3
@@ -48,10 +48,10 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
-;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
+;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68
 ;VI-NOT: s_mov
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84
 ;VI: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
 main_body:
@@ -80,11 +80,11 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 main_body:
-  %ofs = add i32 %1, 58
+  %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
   ret <4 x float> %data
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 5c93ae0e786..03caca8d29c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -18,18 +18,18 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
   ret <4 x float> %data
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
 ;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
-;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
-;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
+;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
+;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
 main_body:
@@ -56,11 +56,11 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
 ;CHECK: s_waitcnt
 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
 main_body:
-  %ofs = add i32 %1, 58
+  %ofs = add i32 %1, 60
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
   ret <4 x float> %data
 }
author	Nicolai Haehnle <nhaehnle@gmail.com>	2017-10-10 12:22:23 +0000
committer	Nicolai Haehnle <nhaehnle@gmail.com>	2017-10-10 12:22:23 +0000
commit	312b64f4d703947779ce56abdcc0d59741ff99df (patch)
tree	9eb7bccd2f72bdf3326848767801ab20070931af /llvm/test
parent	0f22a06b4de75d4d02741b9f39d39df1cd4e8066 (diff)
download	bcm5719-llvm-312b64f4d703947779ce56abdcc0d59741ff99df.tar.gz bcm5719-llvm-312b64f4d703947779ce56abdcc0d59741ff99df.zip