diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-12-05 20:23:10 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-12-05 20:23:10 +0000 |
| commit | 7bee6ac798f2c547753dd867e130ec587f201483 (patch) | |
| tree | da0fea7e1f415a0dbada331fc836c3e2ca547240 /llvm/test/CodeGen | |
| parent | df87d070c917029bd0209408fcfe833d149bcca7 (diff) | |
| download | bcm5719-llvm-7bee6ac798f2c547753dd867e130ec587f201483.tar.gz bcm5719-llvm-7bee6ac798f2c547753dd867e130ec587f201483.zip | |
AMDGPU: Refactor exp instructions
Structure the definitions a bit more like the other classes.
The main change here is to split EXP with the done bit set
to a separate opcode, so we can set mayLoad = 1 so that it won't
be reordered before the other exp stores, since this has the special
constraint that if the done bit is set then this should be the last
exp in the shader.
Previously all exp instructions were inferred to have unmodeled
side effects.
llvm-svn: 288695
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/ret.ll | 8 | ||||
| -rw-r--r-- | llvm/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir | 63 | ||||
| -rw-r--r-- | llvm/test/CodeGen/MIR/AMDGPU/movrels-bug.mir | 2 |
3 files changed, 68 insertions, 5 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll index 0408413f547..0bdecc96caf 100644 --- a/llvm/test/CodeGen/AMDGPU/ret.ll +++ b/llvm/test/CodeGen/AMDGPU/ret.ll @@ -6,7 +6,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float ; GCN-LABEL: {{^}}vgpr: ; GCN: v_mov_b32_e32 v1, v0 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 -; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1 +; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { @@ -19,7 +19,7 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i ; GCN-LABEL: {{^}}vgpr_literal: ; GCN: v_mov_b32_e32 v4, v0 -; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4 +; GCN: exp 15, 0, -1, 1, -1, v4, v4, v4, v4 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 @@ -209,7 +209,7 @@ define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2) ; GCN-LABEL: {{^}}both: ; GCN: v_mov_b32_e32 v1, v0 -; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1 +; GCN-DAG: exp 15, 0, -1, 1, -1, v1, v1, v1, v1 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1 ; GCN-DAG: s_add_i32 s0, s3, 2 ; GCN-DAG: s_mov_b32 s1, s2 @@ -231,7 +231,7 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2 ; GCN-LABEL: {{^}}structure_literal: ; GCN: v_mov_b32_e32 v3, v0 -; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3 +; GCN: exp 15, 0, -1, 1, -1, v3, v3, v3, v3 ; GCN-DAG: v_mov_b32_e32 v0, 1.0 ; GCN-DAG: s_mov_b32 s0, 2 ; GCN-DAG: s_mov_b32 s1, 3 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir b/llvm/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir new file mode 100644 index 00000000000..9aaa374ed28 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/insert-waits-exp.mir @@ -0,0 +1,63 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | 
FileCheck %s +--- | + define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + %a = load volatile float, float addrspace(1)* undef + %b = load volatile float, float addrspace(1)* undef + %c = load volatile float, float addrspace(1)* undef + %d = load volatile float, float addrspace(1)* undef + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d) + ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00> + } + + declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + + attributes #0 = { readnone } + attributes #1 = { nounwind } + +... +--- + +# CHECK-LABEL: name: exp_done_waitcnt{{$}} +# CHECK: EXP_DONE +# CHECK-NEXT: S_WAITCNT 3855 +# CHECK: %vgpr0 = V_MOV_B32 +# CHECK: %vgpr1 = V_MOV_B32 +# CHECK: %vgpr2 = V_MOV_B32 +# CHECK: %vgpr3 = V_MOV_B32 +name: exp_done_waitcnt +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.2): + %sgpr3 = S_MOV_B32 61440 + %sgpr2 = S_MOV_B32 -1 + %vgpr0 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %vgpr1 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %vgpr2 = BUFFER_LOAD_DWORD_OFFSET %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 
implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + EXP_DONE 0, killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3, -1, -1, 15, implicit %exec + %vgpr0 = V_MOV_B32_e32 1056964608, implicit %exec + %vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec + %vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec + %vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec + SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3 + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/movrels-bug.mir b/llvm/test/CodeGen/MIR/AMDGPU/movrels-bug.mir index 6493cc8703e..9c330bc8a6b 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/movrels-bug.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/movrels-bug.mir @@ -25,7 +25,7 @@ body: | %m0 = S_MOV_B32 undef %sgpr0 %vgpr1 = V_MOVRELS_B32_e32 undef %vgpr1, implicit %m0, implicit %exec, implicit killed %vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 %vgpr4 = V_MAC_F32_e32 undef %vgpr0, undef %vgpr0, undef %vgpr4, implicit %exec - EXP 15, 12, 0, 1, 0, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, implicit %exec + EXP_DONE 15, undef %vgpr0, killed %vgpr1, killed %vgpr4, undef %vgpr0, 0, 0, 12, implicit %exec S_ENDPGM ... |

