diff options
| author | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:55 +0000 |
|---|---|---|
| committer | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:55 +0000 |
| commit | 58410f37ff58c5778d349725458011a57ee21bf9 (patch) | |
| tree | 05f88334afc264f9fe1845fdcf63162f7ba20ac2 /llvm/test/CodeGen | |
| parent | 5cec64195ceea262f86ae3d305607eb4e7840d88 (diff) | |
| download | bcm5719-llvm-58410f37ff58c5778d349725458011a57ee21bf9.tar.gz bcm5719-llvm-58410f37ff58c5778d349725458011a57ee21bf9.zip | |
AMDGPU: Merge BUFFER_STORE_DWORD_OFFEN/OFFSET into x2, x4
Summary:
Only 56 shaders (out of 48486) are affected.
Totals from affected shaders (changed stats only):
SGPRS: 2420 -> 2460 (1.65 %)
Spilled VGPRs: 94 -> 112 (19.15 %)
Scratch size: 524 -> 528 (0.76 %) dwords per thread
Code Size: 187400 -> 184992 (-1.28 %) bytes
One DiRT Showdown shader spills 6 more VGPRs.
One Grid Autosport shader spills 12 more VGPRs.
The other 54 shaders only have a decrease in code size.
(I'm ignoring the SGPR noise)
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D39012
llvm-svn: 317755
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll | 75 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/merge-stores.ll | 8 |
2 files changed, 77 insertions, 6 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll index c6200cacbe8..c12c713cdc2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -95,6 +95,81 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) + ret void +} + declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 44497d93707..2b2f67cd1b3 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -237,8 +237,7 @@ define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base( ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: ; GCN: buffer_load_dwordx2 v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx2 v define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -379,10 +378,7 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 a ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: ; GCN: buffer_load_dwordx4 v ; GCN: s_barrier -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx4 v define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 |

