AMDGPU/SI: Handle infinite loops so that the structurizer can work with CFGs that contain them.

Summary: The current StructurizeCFG pass only works for CFGs with a single exit. AMDGPUUnifyDivergentExitNodes combines multiple "return" blocks and/or "unreachable" blocks into one exit block so that the structurizer can work. However, an infinite loop is another special kind of "exit", and if we don't handle it, the resulting multiple exits will prevent the structurizer from working. In this work, for each infinite loop, we add a dummy edge to the "return" block, so that the AMDGPUUnifyDivergentExitNodes pass also works with infinite loops. This allows CFGs with infinite loops to be structurized. Reviewer: nhaehnle Differential Revision: https://reviews.llvm.org/D46340 llvm-svn: 332625
author: Changpeng Fang <changpeng.fang@gmail.com> 2018-05-17 16:45:01 +0000
committer: Changpeng Fang <changpeng.fang@gmail.com> 2018-05-17 16:45:01 +0000
commit: 391bcf889391f9128763674087f2ddfd82671e90 (patch)
tree: f1883bd89f3a27d0c1d2cd4dffc3d146a2219b0d /llvm/test
parent: daf5169398e7605c68d61c21618e887ce4980931 (diff)
download: bcm5719-llvm-391bcf889391f9128763674087f2ddfd82671e90.tar.gz
bcm5719-llvm-391bcf889391f9128763674087f2ddfd82671e90.zip
5 files changed, 170 insertions, 15 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index c1ea13421dc..1fb386cd4cd 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -431,11 +431,17 @@ endif:
 ; si_mask_branch
 
 ; GCN-LABEL: {{^}}analyze_mask_branch:
-; GCN: v_cmp_lt_f32_e32 vcc
-; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN: v_cmp_nlt_f32_e32 vcc
+; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64  [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
+; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
-; GCN-NEXT: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop_body
+; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop_body
 ; GCN: ;;#ASMSTART
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
@@ -444,6 +450,7 @@ endif:
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: ;;#ASMEND
+; GCN: s_cbranch_vccz [[RET]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
@@ -452,9 +459,7 @@ endif:
 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
 ; GCN-NEXT: s_setpc_b64 vcc
 
-; GCN-NEXT: [[RET]]: ; %ret
-; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
-; GCN: buffer_store_dword
+; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @analyze_mask_branch() #0 {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 1e04544d2cb..4872fbfadab 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,10 +2,11 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
 
 ; GCN-LABEL: {{^}}test_loop:
+; GCN: s_and_b64 vcc, exec, -1
 ; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
-; GCN: s_branch [[LABEL]]
+; GCN: s_cbranch_vccnz [[LABEL]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 3caffc342c7..567b5a9f450 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,18 +1,167 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify %s | FileCheck -check-prefix=IR %s
 
 ; SI-LABEL: {{^}}infinite_loop:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: BB0_1:
+; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
-; SI: s_branch BB0_1
+; SI: s_branch [[LOOP]]
 define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
-  br label %for.body
+  br label %loop
 
-for.body:                                         ; preds = %entry, %for.body
+loop:
   store i32 999, i32 addrspace(1)* %out, align 4
-  br label %for.body
+  br label %loop
 }
 
+
+; IR-LABEL: @infinite_loop_ret(
+; IR:  br i1 %cond, label %loop, label %UnifiedReturnBlock
+
+; IR: loop:
+; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: br i1 true, label %loop, label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR:  ret void
+
+
+; SI-LABEL: {{^}}infinite_loop_ret:
+; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]
+
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
+; SI: [[LOOP:BB[0-9]+_[0-9]+]]:  ; %loop
+; SI: s_and_b64 vcc, exec, -1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: buffer_store_dword [[REG]]
+; SI: s_cbranch_vccnz [[LOOP]]
+
+; SI: [[RET]]:  ; %UnifiedReturnBlock
+; SI: s_endpgm
+define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %cond = icmp eq i32 %tmp, 1
+  br i1 %cond, label %loop, label %return
+
+loop:
+  store i32 999, i32 addrspace(1)* %out, align 4
+  br label %loop
+
+return:
+  ret void
+}
+
+
+; IR-LABEL: @infinite_loops(
+; IR: br i1 undef, label %loop1, label %loop2
+
+; IR: loop1:
+; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: br i1 true, label %loop1, label %DummyReturnBlock
+
+; IR: loop2:
+; IR: store i32 888, i32 addrspace(1)* %out, align 4
+; IR: br i1 true, label %loop2, label %DummyReturnBlock
+
+; IR: DummyReturnBlock:
+; IR: ret void
+
+
+; SI-LABEL: {{^}}infinite_loops:
+
+; SI: v_mov_b32_e32 [[REG1:v[0-9]+]], 0x3e7
+; SI: s_and_b64 vcc, exec, -1
+
+; SI: [[LOOP1:BB[0-9]+_[0-9]+]]:  ; %loop1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: buffer_store_dword [[REG1]]
+; SI: s_cbranch_vccnz [[LOOP1]]
+; SI: s_branch [[RET:BB[0-9]+_[0-9]+]]
+
+; SI: v_mov_b32_e32 [[REG2:v[0-9]+]], 0x378
+; SI: s_and_b64 vcc, exec, -1
+
+; SI: [[LOOP2:BB[0-9]+_[0-9]+]]:  ; %loop2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: buffer_store_dword [[REG2]]
+; SI: s_cbranch_vccnz [[LOOP2]]
+
+; SI: [[RET]]:  ; %DummyReturnBlock
+; SI: s_endpgm
+define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
+entry:
+  br i1 undef, label %loop1, label %loop2
+
+loop1:
+  store i32 999, i32 addrspace(1)* %out, align 4
+  br label %loop1
+
+loop2:
+  store i32 888, i32 addrspace(1)* %out, align 4
+  br label %loop2
+}
+
+
+
+; IR-LABEL: @infinite_loop_nest_ret(
+; IR: br i1 %cond1, label %outer_loop, label %UnifiedReturnBlock
+
+; IR: outer_loop:
+; IR: br label %inner_loop
+
+; IR: inner_loop:
+; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: %cond3 = icmp eq i32 %tmp, 3
+; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock
+
+; IR: TransitionBlock:
+; IR: br i1 %cond3, label %inner_loop, label %outer_loop
+
+; IR: UnifiedReturnBlock:
+; IR: ret void
+
+; SI-LABEL: {{^}}infinite_loop_nest_ret:
+; SI: s_cbranch_execz [[RET:BB[0-9]+_[0-9]+]]
+
+; SI: s_mov_b32
+; SI: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]:  ; %outer_loop
+
+; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]:  ; %inner_loop
+; SI: s_waitcnt expcnt(0)
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
+; SI: v_cmp_ne_u32_e32
+; SI: s_waitcnt lgkmcnt(0)
+; SI: buffer_store_dword [[REG]]
+
+; SI: s_andn2_b64 exec
+; SI: s_cbranch_execnz [[INNER_LOOP]]
+
+; SI: s_andn2_b64 exec
+; SI: s_cbranch_execnz [[OUTER_LOOP]]
+
+; SI: [[RET]]:  ; %UnifiedReturnBlock
+; SI: s_endpgm
+define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %cond1 = icmp eq i32 %tmp, 1
+  br i1 %cond1, label %outer_loop, label %return
+
+outer_loop:
+ ; %cond2 = icmp eq i32 %tmp, 2
+ ; br i1 %cond2, label %outer_loop, label %inner_loop
+ br label %inner_loop
+
+inner_loop:                                     ; preds = %outer_loop, %inner_loop
+  store i32 999, i32 addrspace(1)* %out, align 4
+  %cond3 = icmp eq i32 %tmp, 3
+  br i1 %cond3, label %inner_loop, label %outer_loop
+
+return:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 51eb24cd658..b02ce51da1c 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -70,7 +70,7 @@
 
 ; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_branch [[BB9]]
+; GCN-NEXT: s_cbranch_vccnz [[BB9]]
 define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index a4b6d1fd069..1f915662ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -96,7 +96,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; SI-NEXT: s_cbranch_scc1 [[ENDPGM]]
 
 ; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
-; SI: s_branch [[INFLOOP]]
+; SI: s_cbranch_vccnz [[INFLOOP]]
 
 ; SI: [[ENDPGM]]:
 ; SI: s_endpgm
author	Changpeng Fang <changpeng.fang@gmail.com>	2018-05-17 16:45:01 +0000
committer	Changpeng Fang <changpeng.fang@gmail.com>	2018-05-17 16:45:01 +0000
commit	391bcf889391f9128763674087f2ddfd82671e90 (patch)
tree	f1883bd89f3a27d0c1d2cd4dffc3d146a2219b0d /llvm/test
parent	daf5169398e7605c68d61c21618e887ce4980931 (diff)
download	bcm5719-llvm-391bcf889391f9128763674087f2ddfd82671e90.tar.gz bcm5719-llvm-391bcf889391f9128763674087f2ddfd82671e90.zip