summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/AMDGPU/smrd.ll
diff options
context:
space:
mode:
authorNicolai Haehnle <nhaehnle@gmail.com>2018-11-30 22:55:38 +0000
committerNicolai Haehnle <nhaehnle@gmail.com>2018-11-30 22:55:38 +0000
commita7b00058e05f6862d4ef2c8f8bb287b09f7e41b1 (patch)
tree3f571b7d7ba5368d8ca4dc8010ef04ffe0ee6eef /llvm/test/CodeGen/AMDGPU/smrd.ll
parenta9cc92c247ce5d0ecc3399e7af6e40a3d59bbf6c (diff)
downloadbcm5719-llvm-a7b00058e05f6862d4ef2c8f8bb287b09f7e41b1.tar.gz
bcm5719-llvm-a7b00058e05f6862d4ef2c8f8bb287b09f7e41b1.zip
AMDGPU: Divergence-driven selection of scalar buffer load intrinsics
Summary: Moving SMRD to VMEM in SIFixSGPRCopies is rather bad for performance if the load is really uniform. So select the scalar load intrinsics directly to either VMEM or SMRD buffer loads based on divergence analysis. If an offset happens to end up in a VGPR -- either because a floating point calculation was involved, or due to other remaining deficiencies in SIFixSGPRCopies -- we use v_readfirstlane. There is some unrelated churn in tests since we now select MUBUF offsets in a unified way with non-scalar buffer loads. Change-Id: I170e6816323beb1348677b358c9d380865cd1a19 Reviewers: arsenm, alex-t, rampitec, tpr Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D53283 llvm-svn: 348050
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/smrd.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/smrd.ll45
1 files changed, 27 insertions, 18 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index a67d471ee19..d70801381ce 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -292,18 +292,19 @@ main_body:
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
- %off = add i32 %offset, 4095
+ %off = add i32 %offset, 4092
%r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
ret float %r
}
; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
; GCN-NEXT: %bb.
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
+; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
main_body:
%off = add i32 %offset, 4096
@@ -512,12 +513,15 @@ main_body:
}
; GCN-LABEL: {{^}}smrd_load_nonconst4:
-; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
-; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
@@ -528,12 +532,16 @@ main_body:
}
; GCN-LABEL: {{^}}smrd_load_nonconst5:
-; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
-; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; VIGFX9: s_movk_i32 s4, 0xfc0
+; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
+; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
; GCN: ; return to shader part epilog
define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
main_body:
@@ -561,9 +569,10 @@ main_body:
; GCN-LABEL: {{^}}smrd_uniform_loop:
;
-; TODO: this should use an s_buffer_load
+; TODO: we should keep the loop counter in an SGPR
;
-; GCN: buffer_load_dword
+; GCN: v_readfirstlane_b32
+; GCN: s_buffer_load_dword
define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
main_body:
br label %loop
OpenPOWER on IntegriCloud