| author | Mark Searles <m.c.searles@gmail.com> | 2018-01-22 21:46:43 +0000 |
|---|---|---|
| committer | Mark Searles <m.c.searles@gmail.com> | 2018-01-22 21:46:43 +0000 |
| commit | 7687d4205242a47ff49e6b241e80d871a5956ec2 | |
| tree | edeb460935b45a6d617fbf4b85d97fcd999523fe /llvm/test/CodeGen/AMDGPU | |
| parent | e8ea8296fccc6d9345817ece7d2f4ea3b35a55e1 | |
[AMDGPU] SI Load Store Optimizer: When merging with offset, use V_ADD_{I|U}32_e64
- Change the inserted add (V_ADD_{I|U}32_e32) to the _e64 version (V_ADD_{I|U}32_e64) so that the add uses a vreg for its carry-out; this prevents the inserted v_add from killing VCC. Because the _e64 encoding does not accept a literal, a mov instruction is also inserted to get the immediate into a register (see the sketch below).
- Change the pass name to "SI Load Store Optimizer"; this removes the '/', which complicates scripts.
Differential Revision: https://reviews.llvm.org/D42124
llvm-svn: 323153
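
For illustration, a minimal before/after sketch of the inserted MIR, using the VI opcode. The virtual-register numbers are only illustrative (they loosely echo the check lines in the new test below) and are not taken verbatim from the pass:

```
# Before: the inserted _e32 add implicitly writes its carry-out to VCC,
# killing any live VCC value; _e32 can encode the literal offset directly.
%7:vgpr_32 = V_ADD_I32_e32 1024, %0, implicit-def %vcc, implicit %exec

# After: the _e64 add writes its carry-out to an ordinary vreg, leaving VCC
# untouched. Since _e64 cannot encode a literal, the offset is first
# materialized into a register with a mov.
%6:sreg_32 = S_MOV_B32 1024
%7:vgpr_32, %8:sreg_64_xexec = V_ADD_I32_e64 %6, %0, implicit %exec
```

Note that the updated ds-combine-large-stride.ll checks below still match v_add_u32_e32 with vcc: when VCC turns out to be free, later shrinking can presumably fold the _e64 add back to _e32, so the visible ISA-level change is that the offset now arrives in an SGPR ({{s[0-9]+}}) rather than as an inline literal.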
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll | 44 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir | 60 |
2 files changed, 82 insertions, 22 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
index 7dd1f90e914..6ae36cc1fbb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -5,9 +5,9 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -50,9 +50,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -132,8 +132,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -170,7 +170,7 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

 ; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
@@ -211,8 +211,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
@@ -249,9 +249,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -285,9 +285,9 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x320, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x640, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
@@ -349,8 +349,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4004, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8004, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
@@ -380,7 +380,7 @@ bb:
 ; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

-; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 0x960, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]

 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
@@ -412,8 +412,8 @@ bb:
 ; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]

 ; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, 0x4008, [[BASE]]
-; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, 0x8008, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
+; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]

 ; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
 ; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir
new file mode 100644
index 00000000000..fbd5611b3fc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir
@@ -0,0 +1,60 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,VI %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+# If there's a base offset, check that SILoadStoreOptimizer creates
+# V_ADD_{I|U}32_e64 for that offset; _e64 uses a vreg for the carry (rather than
+# %vcc, which is used in _e32); this ensures that %vcc is not inadvertently
+# clobbered.
+
+# GCN-LABEL: name: kernel
+
+# VI: V_ADD_I32_e64 %6, %0,
+# VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8,
+# VI: V_ADD_I32_e64 %10, %3,
+# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
+
+# GFX9: V_ADD_U32_e64 %6, %0,
+# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8,
+# GFX9: V_ADD_U32_e64 %9, %3,
+# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
+
+--- |
+  @0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4
+
+  define amdgpu_kernel void @kernel() {
+  bb.0:
+    br label %bb2
+
+  bb1:
+    ret void
+
+  bb2:
+    %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
+    %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
+    %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
+    %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+    br label %bb1
+  }
+---
+name: kernel
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_ENDPGM
+
+  bb.2:
+    %1:sreg_64_xexec = V_CMP_NE_U32_e64 %0, 0, implicit %exec
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 1, %1, implicit %exec
+    V_CMP_NE_U32_e32 1, %2, implicit-def %vcc, implicit %exec
+    DS_WRITE_B32 %0, %0, 1024, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp)
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit %exec
+    DS_WRITE_B32 %0, %3, 1056, 0, implicit %m0, implicit %exec :: (store 4 into %ir.tmp1)
+    %4:vgpr_32 = DS_READ_B32 %3, 1088, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp2)
+    %5:vgpr_32 = DS_READ_B32 %3, 1120, 0, implicit %m0, implicit %exec :: (load 4 from %ir.tmp3)
+    %vcc = S_AND_B64 %exec, %vcc, implicit-def %scc
+    S_CBRANCH_VCCNZ %bb.1, implicit %vcc
+    S_BRANCH %bb.1
+...

