diff options
author | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2019-05-26 20:33:26 +0000 |
---|---|---|
committer | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2019-05-26 20:33:26 +0000 |
commit | ba447bae7448435c9986eece0811da1423972fdd (patch) | |
tree | 688b9be43648fbc0f5c07b422b417ef8b69353b3 /llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll | |
parent | c2493ce4a40be025054087fde59dd0f339baf6c0 (diff) | |
download | bcm5719-llvm-ba447bae7448435c9986eece0811da1423972fdd.tar.gz bcm5719-llvm-ba447bae7448435c9986eece0811da1423972fdd.zip |
[AMDGPU] Divergence driven ISel. Assign register class for cross block values according to the divergence.
Details: To make instruction selection really divergence driven it is necessary to assign
the correct register classes to the cross block values beforehand. For the divergent targets
same value type requires different register classes dependent on the value divergence.
Reviewers: rampitec, nhaehnle
Differential Revision: https://reviews.llvm.org/D59990
This commit was reverted because of the build failure.
The reason was mlformed patch.
Build failure fixed.
llvm-svn: 361741
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll | 53 |
1 files changed, 34 insertions, 19 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 80071e3407e..e7555a67033 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -1,28 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-dce-in-ra=0 -o - %s | FileCheck %s ; Don't crash when the use of an undefined value is only detected by the ; register coalescer because it is hidden with subregister insert/extract. target triple="amdgcn--" -; CHECK-LABEL: foobar: -; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc - -; CHECK: BB0_1: -; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 - -; CHECK: BB0_2: -; CHECK: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { +; CHECK-LABEL: foobar: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) + +; FIXME: The change related to the fact that +; DetectDeadLanes pass hit "Copy across incompatible class" SGPR -> VGPR in analysis +; and hence it cannot derive the fact that the vector element is unused. +; Such a copies appear because the float4 vectors and their elements in the test are uniform +; but the PHI node in "ife" block is divergent because of the CF dependency (divergent branch in bb0) + +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 + +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: ; mask branch BB0_2 +; CHECK-NEXT: BB0_1: ; %ift +; CHECK-NEXT: s_mov_b32 s4, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: BB0_2: ; %ife +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 |