author     Tom Stellard <thomas.stellard@amd.com>     2016-03-21 18:56:58 +0000
committer  Tom Stellard <thomas.stellard@amd.com>     2016-03-21 18:56:58 +0000
commit     92339e888f160d837baa2b2feac132e9536c2c48 (patch)
tree       e72c6da38626de7a21035841beac30f8169106d6
parent     2bf5ed567027ae874dc5858e5dadeb3f32d637c5 (diff)
AMDGPU/SI: Fix threshold calculation for branching when exec is zero
Summary:
When control flow is implemented using the exec mask, the compiler will insert
branch instructions to skip over the masked section when exec is zero if the
section contains more than a certain number of instructions. The previous code
would only count instructions in successor blocks; this patch changes it to
count instructions in all blocks between the start and end of the branch.

Reviewers: nhaehnle, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18282

llvm-svn: 263969
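As a rough, self-contained illustration of the counting change described above
(this is not the in-tree LLVM code; the block layout, instruction counts,
threshold value, and the names needsSkipBranch, Fn, and SkipThreshold are all
made up for the sketch), the new logic walks every block laid out between the
start and end of the skipped region and compares a running instruction count
against a threshold, rather than following only the first-successor chain:

// Simplified model of the patched counting logic: visit every block between
// From (inclusive) and To (exclusive) in layout order and count instructions.
#include <cstddef>
#include <iostream>
#include <vector>

struct Block {
  std::size_t NumInstrs; // stand-in for a MachineBasicBlock's instruction count
};

// Returns true when the masked region is large enough that inserting a branch
// to skip it (when exec is zero) is worthwhile. Illustrative API only.
bool needsSkipBranch(const std::vector<Block> &Fn, std::size_t From,
                     std::size_t To, std::size_t SkipThreshold) {
  std::size_t NumInstr = 0;
  for (std::size_t I = From; I != To; ++I) { // all blocks between From and To
    NumInstr += Fn[I].NumInstrs;
    if (NumInstr >= SkipThreshold)
      return true;
  }
  return false;
}

int main() {
  // Toy layout: the masked region spans blocks 1..3, e.g. a loop whose body
  // a walk along only the first successors could have under-counted.
  std::vector<Block> Fn = {{2}, {4}, {6}, {3}, {1}};
  std::cout << std::boolalpha
            << needsSkipBranch(Fn, 1, 4, /*SkipThreshold=*/12) << '\n'; // true
}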
-rw-r--r--   llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp                 8
-rw-r--r--   llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll   34
2 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index a2dfc641d75..a804a5e6d32 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -130,10 +130,12 @@ bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
 
   unsigned NumInstr = 0;
 
-  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
-       MBB = *MBB->succ_begin()) {
+  for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
+                                 ToI = MachineFunction::iterator(To); MBBI != ToI; ++MBBI) {
 
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
 
       if (I->isBundle() || !I->isBundled()) {
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 26927e46024..4ad6dce26ea 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -24,5 +24,39 @@ out:
ret void
}
+;CHECK-LABEL: {{^}}test2:
+;CHECK: s_and_saveexec_b64
+;CHECK: s_xor_b64
+;CHECK-NEXT: s_cbranch_execz
+define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %cc = icmp eq i32 %tid, 0
+ br i1 %cc, label %done1, label %if
+
+if:
+ %cmp = icmp eq i32 %a, 0
+ br i1 %cmp, label %done0, label %loop_body
+
+loop_body:
+ %counter = phi i32 [ 0, %if ], [0, %done0], [ %incr, %loop_body ]
+
+ ; Prevent the loop from being optimized out
+ call void asm sideeffect "", "" ()
+
+ %incr = add i32 %counter, 1
+ %lc = icmp sge i32 %incr, 1000
+ br i1 %lc, label %done1, label %loop_body
+
+done0:
+ %cmp0 = icmp eq i32 %b, 0
+ br i1 %cmp0, label %done1, label %loop_body
+
+done1:
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind readonly }