Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/branch-condition-and.ll                       17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll               710
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ret_jump.ll                                    87
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll                       45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll     54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/valu-i1.ll                                     83
6 files changed, 952 insertions(+), 44 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index 94616a4be8f..68b77ea3490 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -15,12 +15,16 @@
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
-;
-; TODO: The following sequence is a bug (missing s_endpgm)!
-;
-; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
-; GCN: [[BB]]:
-; GCN-NEXT: .Lfunc_end0:
+; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+
+; GCN-NEXT: [[BB5]]
+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
@@ -29,6 +33,7 @@ bb:
br i1 %tmp3, label %bb4, label %bb5
bb4: ; preds = %bb
+ store volatile i32 4, i32 addrspace(3)* undef
unreachable
bb5: ; preds = %bb
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
new file mode 100644
index 00000000000..9d0b6b39599
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -0,0 +1,710 @@
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add an extra verifier run. There were some cases where invalid IR
+; was produced but happened to be fixed by the later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: LeafBlock:
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: br label %Flow{{$}}
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+
+
+; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
+; GCN: v_cmp_lt_i32_e32 vcc, 1
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+
+; GCN: ; %Flow1
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+
+; GCN: ; %exit1
+; GCN: ds_write_b32
+
+; GCN: %Flow2
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64
+; GCN-NEXT: s_xor_b64
+
+; GCN: ; %exit0
+; GCN: buffer_store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+
+
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: unreachable
+
+
+; FIXME: Probably should insert an s_endpgm anyway.
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
+; GCN: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .Lfunc_end
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: llvm.amdgcn.if
+; IR: br i1
+
+; IR: {{^}}Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: {{^}}LeafBlock:
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
+; IR: br label %Flow
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: {{^}}Flow1:
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %divergent.cond0 = icmp slt i32 %tmp16, 2
+ br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %tmp16, 1
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %uniform.cond0 = icmp eq i32 %arg3, 2
+ br i1 %uniform.cond0, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %arg3, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
+; IR: Flow2:
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+
+; IR: UnifiedReturnBlock:
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: ret float %UnifiedRetVal
+define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
+entry:
+ %Pivot = icmp slt i32 %vgpr, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %vgpr, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %vgpr, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
+
+; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
+; GCN: s_cmp_gt_i32 s0, 1
+; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
+
+; GCN: {{^}}[[FLOW]]:
+; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+
+; GCN: v_mov_b32_e32 v0, 2.0
+; GCN: s_or_b64 exec, exec
+; GCN: s_and_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 1.0
+
+; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+
+define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
+entry:
+ %uniform.cond = icmp slt i32 %sgpr, 2
+ br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond0 = icmp eq i32 %vgpr, 3
+ br i1 %divergent.cond0, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %vgpr, 7
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; The non-uniformity of the branch to the exiting blocks requires
+; looking at transitive predecessors.
+
+; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
+
+; IR: exit0: ; preds = %Flow2
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+
+; IR: indirect.exit1:
+; IR: %load = load volatile i32, i32 addrspace(1)* undef
+; IR: store volatile i32 %load, i32 addrspace(1)* undef
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+indirect.exit1:
+ %load = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 %load, i32 addrspace(1)* undef
+ br label %exit1
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_switch(
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ switch i32 %tmp16, label %exit1
+ [ i32 1, label %LeafBlock
+ i32 2, label %LeafBlock1
+ i32 3, label %exit0 ]
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
+
+divergent.if:
+ %vgpr0 = load volatile float, float addrspace(1)* undef
+ %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
+ br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
+
+divergent.then:
+ %vgpr1 = load volatile float, float addrspace(1)* undef
+ %divergent.cond2 = fcmp olt float %vgpr1, 4.0
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
+
+divergent.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %divergent.ret0
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
+; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
+
+; IR: Flow: ; preds = %uniform.then, %uniform.if
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+
+; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: ret void
+define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
+entry:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
+
+uniform.multi.exit.region:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
+
+uniform.if:
+ %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond1 = icmp slt i32 %sgpr0, 1
+ br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
+
+uniform.then:
+ %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond2 = icmp sge i32 %sgpr1, 4
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
+
+uniform.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %uniform.ret0
+
+uniform.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_unreachable_exit(
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ switch i32 %tmp, label %bb3 [
+ i32 2, label %bb1
+ i32 0, label %bb2
+ ]
+
+bb1: ; preds = %bb
+ unreachable
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ switch i32 undef, label %bb5 [
+ i32 2, label %bb4
+ ]
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
index 2aad3ce6418..f2fbacbab82 100644
--- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll
@@ -4,20 +4,78 @@
; This should end with a no-op sequence of exec mask manipulations
; Mask should be in its original state after the unreachable block is executed
-; GCN-LABEL: {{^}}main:
+
+; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; %else
+
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
-; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: [[RET_BB]]:
-; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: [[UNREACHABLE_BB]]:
-; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec
+
+; GCN-NEXT: [[RET_BB]]:
+; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end0
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+entry:
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p87 = fmul float undef, %p2.i
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ %uniform.cond = icmp slt i32 %arg17, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
+
+else: ; preds = %main_body
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
+
+unreachable.bb: ; preds = %else
+ unreachable
+
+ret.bb: ; preds = %else, %main_body
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %else
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: ; %unreachable.bb
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+; GCN: ; divergent unreachable
+
+; GCN: ; %ret.bb
+; GCN: store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+; GCN-NEXT: .Lfunc_end
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
main_body:
%i.i = extractelement <2 x i32> %arg7, i32 0
%j.i = extractelement <2 x i32> %arg7, i32 1
@@ -33,18 +91,21 @@ main_body:
%p104 = fmul float %p102, undef
%p106 = fadd float 0.000000e+00, %p104
%p108 = fadd float undef, %p106
- br i1 undef, label %ENDIF69, label %ELSE
+ %uniform.cond = icmp slt i32 %arg18, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
-ELSE: ; preds = %main_body
+else: ; preds = %main_body
%p124 = fmul float %p108, %p108
%p125 = fsub float %p124, undef
- %p126 = fcmp olt float %p125, 0.000000e+00
- br i1 %p126, label %ENDIF69, label %ELSE41
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
-ELSE41: ; preds = %ELSE
+unreachable.bb: ; preds = %else
+ store volatile i32 8, i32 addrspace(3)* undef
unreachable
-ENDIF69: ; preds = %ELSE, %main_body
+ret.bb: ; preds = %else, %main_body
+ store volatile i32 11, i32 addrspace(1)* undef
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
}
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index 2bc734cc522..5c6663dbbda 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -6,7 +6,7 @@
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_unreachable_noloop:
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
; GCN-NOT: s_endpgm
; GCN: .Lfunc_end0
define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
@@ -37,9 +37,14 @@ bb5: ; preds = %bb3, %bb1
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_ret_noloop:
-; GCN: s_cbranch_scc1
-; GCN: s_endpgm
-; GCN: .Lfunc_end1
+; GCN: load_dwordx4
+; GCN: v_cmp_nlt_f32
+; GCN: s_and_saveexec_b64
+; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: [[UNIFIED_RET]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN: .Lfunc_end
define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,6 +54,38 @@ bb1: ; preds = %bb
%tmp2 = sext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
%tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+ %tmp5 = extractelement <4 x float> %tmp4, i32 1
+ store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
+ %cmp = fcmp ogt float %tmp5, 1.0
+ br i1 %cmp, label %bb5, label %bb3
+
+bb3: ; preds = %bb1
+ %tmp6 = extractelement <4 x float> %tmp4, i32 2
+ %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3, %bb1
+ ret void
+}
+
+; OPT-LABEL: @uniform_annotate_ret_noloop(
+; OPT-NOT: call i1 @llvm.amdgcn.loop
+
+; GCN-LABEL: {{^}}uniform_annotate_ret_noloop:
+; GCN: s_cbranch_scc1
+; GCN: s_endpgm
+; GCN: .Lfunc_end
+define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb
+ %tmp2 = sext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+ %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
br i1 undef, label %bb5, label %bb3
bb3: ; preds = %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 89c1eeb8381..cb010cf1530 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -4,16 +4,17 @@
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[RET:BB[0-9]+]]
-; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
-; GCN: [[RET]]
-; GCN: s_or_b64 exec, exec
-; GCN: s_endpgm
-
-; GCN: [[UNREACHABLE]]:
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
+
+; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: s_endpgm
+
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -29,18 +30,19 @@ ret:
}
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
-; GCN: v_cmp_eq_u32
+; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: ; %ret
-; GCN-NEXT: s_endpgm
-
-; GCN-NEXT: [[UNREACHABLE]]:
-; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
+
+; GCN: [[RETURN]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,7 +57,29 @@ unreachable:
unreachable
}
-; Function Attrs: nounwind readnone
+; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB#{{[0-9]+}}: ; %ret
+; GCN-NEXT: s_endpgm
+
+; GCN: [[UNREACHABLE]]:
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
+bb:
+ %tmp63 = icmp eq i32 %arg0, 32
+ br i1 %tmp63, label %unreachable, label %ret
+
+unreachable:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+
+ret:
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 41220ff8f87..aad260c3e36 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -64,29 +64,100 @@ end:
ret void
}
-; SI-LABEL: @simple_test_v_if
+; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; SI: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
-; SI: s_endpgm
+; SI-NEXT: s_waitcnt
-; SI: BB1_2:
+; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
- br i1 %is.0, label %store, label %exit
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ br label %exit
+
+exit:
+ ret void
+}
+
+; FIXME: It would be better to endpgm in the then block.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
+; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[EXIT]]:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ ret void
+
+exit:
+ ret void
+}
+
+; Final block has more than a ret to execute. This was miscompiled
+; before function exit blocks were unified since the endpgm would
+; terminate the then wavefront before reaching the store.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
+; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
+; SI: ds_write_b32
+; SI: s_waitcnt
+
+; SI-NEXT: {{^}}[[FLOW]]:
+; SI-NEXT: s_or_saveexec_b64
+; SI-NEXT: s_xor_b64 exec, exec
+; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
+; SI: s_or_b64 exec, exec
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
-store:
+then:
%gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
store i32 999, i32 addrspace(1)* %gep
ret void
exit:
+ store volatile i32 7, i32 addrspace(3)* undef
ret void
}