summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
authorNicolai Haehnle <nhaehnle@gmail.com>2018-10-31 13:27:08 +0000
committerNicolai Haehnle <nhaehnle@gmail.com>2018-10-31 13:27:08 +0000
commit814abb59dfdb354ca246a66217b3b3a9f7ac4aa5 (patch)
tree1ee4b8ef083603e15db53dfe6e7e6ec21e54ac14 /llvm/test/CodeGen
parent28212cc6891559855d41066d68e64a84097bb749 (diff)
downloadbcm5719-llvm-814abb59dfdb354ca246a66217b3b3a9f7ac4aa5.tar.gz
bcm5719-llvm-814abb59dfdb354ca246a66217b3b3a9f7ac4aa5.zip
AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
Summary: Instead of writing boolean values temporarily into 32-bit VGPRs if they are involved in PHIs or are observed from outside a loop, we use bitwise masking operations to combine lane masks in a way that is consistent with wave control flow. Move SIFixSGPRCopies to before this pass, since that pass incorrectly attempts to move SGPR phis to VGPRs. This should recover most of the code quality that was lost with the bug fix in "AMDGPU: Remove PHI loop condition optimization". There are still some relevant cases where code quality could be improved, in particular: - We often introduce redundant masks with EXEC. Ideally, we'd have a generic computeKnownBits-like analysis to determine whether masks are already masked by EXEC, so we can avoid this masking both here and when lowering uniform control flow. - The criterion we use to determine whether a def is observed from outside a loop is conservative: it doesn't check whether (loop) branch conditions are uniform. Change-Id: Ibabdb373a7510e426b90deef00f5e16c5d56e64b Reviewers: arsenm, rampitec, tpr Subscribers: kzhuravl, jvesely, wdng, mgorny, yaxunl, dstuttard, t-tye, eraman, llvm-commits Differential Revision: https://reviews.llvm.org/D53496 llvm-svn: 345719
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AMDGPU/add_i1.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop_break.ll33
-rw-r--r--llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/multilevel-break.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-opt.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll49
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub_i1.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/valu-i1.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll2
15 files changed, 264 insertions, 141 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index fb3b69ca3bd..c5f7e3af5e3 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
}
; GCN-LABEL: {{^}}add_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 84a2d3d3a7b..ae78a1ecf32 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}i1_copy_from_loop:
;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI: [[ENDIF]]:
-; SI-NOT: [[VREG]]
-; SI: ; %for.end
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
entry:
br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
new file mode 100644
index 00000000000..0aacbbfda18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_dont_clobber_scc:
+
+; GCN: ; %entry
+; GCN: s_cmp_eq_u32 s0, 0
+; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]]
+
+; GCN: ; %blocka
+; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1
+; GCN: s_cmp_eq_u32 s1, 0
+; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]]
+
+; GCN: [[PREEXIT]]:
+; GCN: [[EXIT]]:
+
+define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
+entry:
+ %cc.uni = icmp eq i32 %uni, 0
+ br i1 %cc.uni, label %exit, label %blocka
+
+blocka:
+ call void asm sideeffect "; dummy a", ""()
+ %cc.uni2 = icmp eq i32 %uni2, 0
+ br i1 %cc.uni2, label %exit, label %blockb
+
+blockb:
+ call void asm sideeffect "; dummy b", ""()
+ br label %exit
+
+exit:
+ %cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
+ call void asm sideeffect "; dummy exit", ""()
+ %r = select i1 %cc.phi, float 1.0, float 2.0
+ ret float %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 63a9f1feb6d..5b25271ce17 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI: s_mov_b64 [[TMP]], exec
+
+; SI: ; %bb3
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
%tidig = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index a0563cdd319..9615efaaa93 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,7 +198,8 @@ entry:
}
; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
; CHECK: ; use v0
define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
entry:
@@ -212,10 +213,14 @@ entry:
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
define amdgpu_kernel void @i1_input_phys_vgpr() {
entry:
%val = load i1, i1 addrspace(1)* undef
- call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+ %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+ store i1 %cc, i1 addrspace(1)* undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 34b842d8436..63c1556212d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; FIXME: Enable for VI.
@@ -144,20 +144,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
}
; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: ; %entry
+; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI: s_mov_b64 vcc, 0
+; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
+; SI: ; %bb
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
+; SI: s_and_b64 vcc, vcc, exec
+
+; SI: ; %exit
+; SI: s_or_b64 exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: buffer_store_dword
+; SI: s_endpgm
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 576950188d3..f37b3a3637a 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -22,23 +22,28 @@
; OPT: bb9:
; OPT: call void @llvm.amdgcn.end.cf(i64
-; TODO: Can remove exec fixes in return block
; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
-
-; GCN: [[FLOW]]:
-; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN: v_cmp_lt_i32_e32 vcc, -1
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %bb4
+; GCN: buffer_load_dword
+; GCN: v_cmp_ge_i32_e32 vcc,
+; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
; GCN: ; %bb.4: ; %bb9
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index fbdf9832b29..679fd7c9870 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
+; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: v_cmp_lt_i32_e32 vcc, 1,
+; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
+; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: ; %LeafBlock
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
+; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]
; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN: buffer_store_dword
; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index c4e2f1e3487..4c1a769d599 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -21,34 +21,46 @@
; GCN-LABEL: {{^}}multi_else_break:
+; GCN: ; %main_body
+; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
-
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
-
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
-
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG: v_cmp_eq_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
+
+; GCN: ; %Flow
+; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
+; GCN: s_andn2_b64 exec, exec, [[TMP0]]
+; GCN: s_cbranch_execnz [[INNER_LOOP]]
+
+; GCN: ; %Flow2
+; GCN: s_or_b64 exec, exec, [[TMP0]]
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN: s_cbranch_execnz [[OUTER_LOOP]]
+
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
main_body:
br label %LOOP.outer
@@ -78,12 +90,38 @@ ENDIF: ; preds = %LOOP
; OPT: llvm.amdgcn.end.cf
; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+
+; GCN: ; %LeafBlock1
+; GCN: s_mov_b64
+; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
+; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN: s_orn2_b64 [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: s_mov_b64 [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
+; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
+; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]
-; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
-; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN: ; %Flow4
+; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
+; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT]]
; GCN-NEXT: s_cbranch_execnz
define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 33028f17531..f773357976c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 04df33b8dd4..3db6fd2d898 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@ endif:
ret void
}
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
-
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+
+; SI: ; %else
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+
+; SI: ; %if
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
+; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 73e56593ce8..6215a486a36 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
@@ -27,18 +27,23 @@ ENDIF:
; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI: s_and_b64 [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
+; SI: s_andn2_b64 exec, exec, [[LEFT]]
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
entry:
@@ -90,19 +95,21 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; This broke the old AMDIL cfg structurizer
; FUNC-LABEL: {{^}}loop_land_info_assert:
; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI: s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI: s_mov_b64 vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI: buffer_store_dword
; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:
-; SI: [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI: [[CONVEX_EXIT]]:
+; SI: s_mov_b64 vcc,
+; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
; SI: s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
; SI: s_cbranch_vccz [[ENDPGM]]
; SI: [[ENDPGM]]:
diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
index 70562a59f0a..6861d32dccf 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
}
; GCN-LABEL: {{^}}sub_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 3d980b749a9..ca85f0bee4c 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: s_and_saveexec_b64
; SI-NEXT: ; mask branch
; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
@@ -220,9 +219,10 @@ exit:
; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index a941e5fb1f7..08267b76aef 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: {{^}}testKernel
; GCN: BB0_1:
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
OpenPOWER on IntegriCloud