Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/branch-condition-and.ll | 17
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 710
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ret_jump.ll | 87
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll | 45
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll | 54
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/valu-i1.ll | 83
6 files changed, 952 insertions, 44 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 94616a4be8f..68b77ea3490 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -15,12 +15,16 @@ ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]] ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]] -; -; TODO: The following sequence is a bug (missing s_endpgm)! -; -; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]] -; GCN: [[BB]]: -; GCN-NEXT: .Lfunc_end0: +; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4 +; GCN: ds_write_b32 +; GCN: s_waitcnt + +; GCN-NEXT: [[BB5]] +; GCN: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN-NEXT: .Lfunc_end define amdgpu_ps void @ham(float %arg, float %arg1) #0 { bb: %tmp = fcmp ogt float %arg, 0.000000e+00 @@ -29,6 +33,7 @@ bb: br i1 %tmp3, label %bb4, label %bb5 bb4: ; preds = %bb + store volatile i32 4, i32 addrspace(3)* undef unreachable bb5: ; preds = %bb diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll new file mode 100644 index 00000000000..9d0b6b39599 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -0,0 +1,710 @@ +; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Add an extra verifier runs. There were some cases where invalid IR +; was produced but happened to be fixed by the later passes. + +; Make sure divergent control flow with multiple exits from a region +; is properly handled. UnifyFunctionExitNodes should be run before +; StructurizeCFG. 
+ +; IR-LABEL: @multi_divergent_region_exit_ret_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: LeafBlock: +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: br label %Flow{{$}} + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void + + +; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret: +; GCN: v_cmp_lt_i32_e32 vcc, 1 +; GCN: s_and_saveexec_b64 +; GCN: s_xor_b64 + + +; FIXME: Why is this compare essentially repeated? 
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 + +; GCN: ; %Flow1 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 + +; GCN: ; %exit1 +; GCN: ds_write_b32 + +; GCN: %Flow2 +; GCN-NEXT: s_or_b64 exec, exec +; GCN: v_cmp_ne_u32_e32 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 +; GCN-NEXT: s_xor_b64 + +; GCN: ; %exit0 +; GCN: buffer_store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock + + +; IR: UnifiedUnreachableBlock: +; IR-NEXT: unreachable + + +; FIXME: Probably should insert an s_endpgm anyway. 
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable: +; GCN: ; %UnifiedUnreachableBlock +; GCN-NEXT: .Lfunc_end +define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + unreachable +} + +; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: llvm.amdgcn.if +; IR: br i1 + +; IR: {{^}}Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 + +; IR: {{^}}LeafBlock: +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true +; IR: br label %Flow1 + +; IR: LeafBlock1: +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true +; IR: br label %Flow + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: br label %UnifiedReturnBlock + +; IR: {{^}}Flow1: +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR: store volatile i32 17, i32 addrspace(3)* undef +; IR: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: ret void +define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 
= load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %divergent.cond0 = icmp slt i32 %tmp16, 2 + br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond1 = icmp eq i32 %tmp16, 1 + br i1 %divergent.cond1, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %uniform.cond0 = icmp eq i32 %arg3, 2 + br i1 %uniform.cond0, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) + +define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %arg3, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void +} + +; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( +; IR: Flow2: +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) + +; IR: UnifiedReturnBlock: +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: ret float %UnifiedRetVal +define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { +entry: + %Pivot = icmp slt i32 %vgpr, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %vgpr, 1 + br i1 %SwitchLeaf, label %exit0, label 
%exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %vgpr, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value( + +; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value: +; GCN: s_cmp_gt_i32 s0, 1 +; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]] + +; GCN: v_cmp_ne_u32_e32 vcc, 7, v0 + +; GCN: {{^}}[[FLOW]]: +; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]] + +; GCN: v_mov_b32_e32 v0, 2.0 +; GCN: s_or_b64 exec, exec +; GCN: s_and_b64 exec, exec +; GCN: v_mov_b32_e32 v0, 1.0 + +; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return + +define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { +entry: + %uniform.cond = icmp slt i32 %sgpr, 2 + br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %divergent.cond0 = icmp eq i32 %vgpr, 3 + br i1 %divergent.cond0, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %divergent.cond1 = icmp eq i32 %vgpr, 7 + br i1 %divergent.cond1, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store i32 9, i32 addrspace(1)* undef + ret float 1.0 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store i32 17, i32 addrspace(3)* undef + ret float 2.0 +} + +; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) + +; IR: Flow: +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) + +; IR: Flow2: +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock + +; IR: exit0: +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: Flow1: +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 + +; IR: exit1: +; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = 
zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; The non-uniformity of the branch to the exiting blocks requires +; looking at transitive predecessors. + +; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable( + +; IR: exit0: ; preds = %Flow2 +; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef +; IR-NEXT: br label %UnifiedReturnBlock + + +; IR: indirect.exit1: +; IR: %load = load volatile i32, i32 addrspace(1)* undef +; IR: store volatile i32 %load, i32 addrspace(1)* undef +; IR: store volatile i32 9, i32 addrspace(1)* undef +; IR: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %Flow2 + +; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: ret void +define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + %Pivot = icmp slt i32 %tmp16, 2 + br i1 %Pivot, label %LeafBlock, label %LeafBlock1 + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %indirect.exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +indirect.exit1: + %load = load volatile i32, i32 addrspace(1)* undef + store volatile i32 %load, i32 addrspace(1)* undef + br label %exit1 + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @multi_divergent_region_exit_ret_switch( +define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %tmp1 = add i32 0, %tmp + %tmp2 = zext i32 %tmp1 to i64 + %tmp3 = add i64 0, %tmp2 + %tmp4 = shl i64 %tmp3, 32 + %tmp5 = ashr exact i64 %tmp4, 32 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = sext i32 %tmp7 to i64 + %tmp9 = 
getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8 + %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4 + %tmp13 = zext i32 %tmp10 to i64 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13 + %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 + switch i32 %tmp16, label %exit1 + [ i32 1, label %LeafBlock + i32 2, label %LeafBlock1 + i32 3, label %exit0 ] + +LeafBlock: ; preds = %entry + %SwitchLeaf = icmp eq i32 %tmp16, 1 + br i1 %SwitchLeaf, label %exit0, label %exit1 + +LeafBlock1: ; preds = %entry + %SwitchLeaf2 = icmp eq i32 %tmp16, 2 + br i1 %SwitchLeaf2, label %exit0, label %exit1 + +exit0: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 17, i32 addrspace(3)* undef + ret void + +exit1: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 9, i32 addrspace(1)* undef + unreachable +} + +; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle( +define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 { +entry: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret + +divergent.multi.exit.region: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1 + +divergent.if: + %vgpr0 = load volatile float, float addrspace(1)* undef + %divergent.cond1 = fcmp ogt float %vgpr0, 1.0 + br i1 %divergent.cond1, label %divergent.then, label %divergent.endif + +divergent.then: + %vgpr1 = load volatile float, float addrspace(1)* undef + %divergent.cond2 = fcmp olt float %vgpr1, 4.0 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif + +divergent.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %divergent.ret0 + +divergent.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +divergent.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +uniform.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( +; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 + +; IR: Flow: ; preds = %uniform.then, %uniform.if +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 + +; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: ret void +define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { +entry: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%divergent.cond0 = icmp eq i32 %id.x, 0 + br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret + +uniform.multi.exit.region: + %uniform.cond0 = icmp eq i32 %arg0, 4 + br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1 + +uniform.if: + %sgpr0 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond1 = icmp slt i32 %sgpr0, 1 + br i1 %uniform.cond1, label %uniform.then, label %uniform.endif + +uniform.then: + %sgpr1 = load volatile i32, i32 addrspace(2)* undef + %uniform.cond2 = icmp sge i32 %sgpr1, 4 + store volatile i32 33, i32 addrspace(1)* undef + br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif + +uniform.endif: + store volatile i32 38, i32 addrspace(1)* undef + br label %uniform.ret0 + +uniform.ret0: + store volatile i32 11, i32 addrspace(3)* undef + ret void + +uniform.ret1: + store volatile i32 42, i32 addrspace(3)* undef + ret void + +divergent.ret: + store volatile i32 9, i32 addrspace(1)* undef + ret void +} + +; IR-LABEL: @multi_divergent_unreachable_exit( +; IR: UnifiedUnreachableBlock: +; IR-NEXT: call void @llvm.amdgcn.unreachable() +; IR-NEXT: br label %UnifiedReturnBlock + +; IR: UnifiedReturnBlock: +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 +; IR-NEXT: ret void +define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + switch i32 %tmp, label %bb3 [ + i32 2, label %bb1 + i32 0, label %bb2 + ] + +bb1: ; preds = %bb + unreachable + +bb2: ; preds = %bb + unreachable + +bb3: ; preds = %bb + switch i32 undef, label %bb5 [ + i32 2, label %bb4 + ] + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3 + unreachable +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll index 2aad3ce6418..f2fbacbab82 100644 --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -4,20 +4,78 @@ ; This should end with an no-op sequence of exec mask manipulations ; Mask should be in original state after executed unreachable block -; GCN-LABEL: {{^}}main: + +; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable: ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; %else + ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] -; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]] +; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] -; GCN: [[RET_BB]]: -; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]] +; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb +; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: [[UNREACHABLE_BB]]: -; GCN-NEXT: [[FINAL_BB]]: +; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow +; GCN-NEXT: s_or_b64 exec, exec + +; GCN-NEXT: [[RET_BB]]: +; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 
%arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +entry: + %i.i = extractelement <2 x i32> %arg7, i32 0 + %j.i = extractelement <2 x i32> %arg7, i32 1 + %i.f.i = bitcast i32 %i.i to float + %j.f.i = bitcast i32 %j.i to float + %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2 + %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2 + %p87 = fmul float undef, %p2.i + %p88 = fadd float %p87, undef + %p93 = fadd float %p88, undef + %p97 = fmul float %p93, undef + %p102 = fsub float %p97, undef + %p104 = fmul float %p102, undef + %p106 = fadd float 0.000000e+00, %p104 + %p108 = fadd float undef, %p106 + %uniform.cond = icmp slt i32 %arg17, 0 + br i1 %uniform.cond, label %ret.bb, label %else + +else: ; preds = %main_body + %p124 = fmul float %p108, %p108 + %p125 = fsub float %p124, undef + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb + +unreachable.bb: ; preds = %else + unreachable + +ret.bb: ; preds = %else, %main_body + ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef +} + +; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] + +; GCN: ; BB#{{[0-9]+}}: ; %else +; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc +; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]] +; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: ; %unreachable.bb +; GCN: ds_write_b32 +; GCN: s_waitcnt +; GCN: ; divergent unreachable + +; GCN: ; %ret.bb +; GCN: store_dword + +; GCN: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: ; return +; GCN-NEXT: .Lfunc_end +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -33,18 +91,21 @@ main_body: %p104 = fmul float %p102, undef %p106 = fadd float 0.000000e+00, %p104 %p108 = fadd float undef, %p106 - br i1 undef, label %ENDIF69, label %ELSE + %uniform.cond = icmp slt i32 %arg18, 0 + br i1 %uniform.cond, label %ret.bb, label %else -ELSE: ; preds 
= %main_body +else: ; preds = %main_body %p124 = fmul float %p108, %p108 %p125 = fsub float %p124, undef - %p126 = fcmp olt float %p125, 0.000000e+00 - br i1 %p126, label %ENDIF69, label %ELSE41 + %divergent.cond = fcmp olt float %p125, 0.000000e+00 + br i1 %divergent.cond, label %ret.bb, label %unreachable.bb -ELSE41: ; preds = %ELSE +unreachable.bb: ; preds = %else + store volatile i32 8, i32 addrspace(3)* undef unreachable -ENDIF69: ; preds = %ELSE, %main_body +ret.bb: ; preds = %else, %main_body + store volatile i32 11, i32 addrspace(1)* undef ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef } diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index 2bc734cc522..5c6663dbbda 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -6,7 +6,7 @@ ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_unreachable_noloop: -; GCN: s_cbranch_vccnz +; GCN: s_cbranch_scc1 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { @@ -37,9 +37,14 @@ bb5: ; preds = %bb3, %bb1 ; OPT-NOT: call i1 @llvm.amdgcn.loop ; GCN-LABEL: {{^}}annotate_ret_noloop: -; GCN: s_cbranch_scc1 -; GCN: s_endpgm -; GCN: .Lfunc_end1 +; GCN: load_dwordx4 +; GCN: v_cmp_nlt_f32 +; GCN: s_and_saveexec_b64 +; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]] +; GCN-NEXT: [[UNIFIED_RET]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm +; GCN: .Lfunc_end define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -49,6 +54,38 @@ bb1: ; preds = %bb %tmp2 = sext i32 %tmp to i64 %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 + %tmp5 = extractelement <4 x float> %tmp4, i32 1 + store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef + %cmp = fcmp ogt float %tmp5, 1.0 + br i1 %cmp, label %bb5, label %bb3 + +bb3: ; preds = %bb1 + %tmp6 = extractelement <4 x float> %tmp4, i32 2 + %tmp7 = fcmp olt float %tmp6, 0.000000e+00 + br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped + +bb4: ; preds = %bb3 + ret void + +bb5: ; preds = %bb3, %bb1 + ret void +} + +; OPT-LABEL: @uniform_annotate_ret_noloop( +; OPT-NOT: call i1 @llvm.amdgcn.loop + +; GCN-LABEL: {{^}}uniform_annotate_ret_noloop: +; GCN: s_cbranch_scc1 +; GCN: s_endpgm +; GCN: .Lfunc_end +define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 { +bb: + br label %bb1 + +bb1: ; preds = %bb + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16 br i1 undef, label %bb5, label %bb3 bb3: ; preds = %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 89c1eeb8381..cb010cf1530 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -4,16 +4,17 @@ ; GCN: v_cmp_eq_u32 ; GCN: s_and_saveexec_b64 ; 
GCN: s_xor_b64 -; GCN: ; mask branch [[RET:BB[0-9]+]] -; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]] -; GCN: [[RET]] -; GCN: s_or_b64 exec, exec -; GCN: s_endpgm - -; GCN: [[UNREACHABLE]]: +; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock +; GCN-NEXT: s_or_b64 exec, exec +; GCN: s_endpgm + define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -29,18 +30,19 @@ ret: } ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order: -; GCN: v_cmp_eq_u32 +; GCN: v_cmp_ne_u32 ; GCN: s_and_saveexec_b64 ; GCN: s_xor_b64 -; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]] +; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]] -; GCN-NEXT: ; %ret -; GCN-NEXT: s_endpgm - -; GCN-NEXT: [[UNREACHABLE]]: -; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable ; GCN: ds_write_b32 +; GCN: ; divergent unreachable ; GCN: s_waitcnt + +; GCN: [[RETURN]]: +; GCN-NEXT: s_or_b64 exec, exec +; GCN-NEXT: s_endpgm define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() @@ -55,7 +57,29 @@ unreachable: unreachable } -; Function Attrs: nounwind readnone +; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator: +; GCN: s_cmp_lg_u32 +; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]] + +; GCN-NEXT: BB#{{[0-9]+}}: ; %ret +; GCN-NEXT: s_endpgm + +; GCN: [[UNREACHABLE]]: +; GCN: ds_write_b32 +; GCN: s_waitcnt +define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 { +bb: + %tmp63 = icmp eq i32 %arg0, 32 + br i1 %tmp63, label %unreachable, label %ret + +unreachable: + store volatile i32 0, i32 addrspace(3)* undef, align 4 + unreachable + +ret: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 41220ff8f87..aad260c3e36 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -64,29 +64,100 @@ end: ret void } -; SI-LABEL: @simple_test_v_if +; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] -; SI: BB{{[0-9]+_[0-9]+}}: +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI: s_endpgm +; SI-NEXT: s_waitcnt -; SI: BB1_2: +; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + br label %exit + +exit: + ret void +} + +; FIXME: It would be better to endpgm in the then block. 
+ +; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: +; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] + +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[EXIT]]: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; Final block has more than a ret to execute. This was miscompiled +; before function exit blocks were unified since the endpgm would +; terminate the then wavefront before reaching the store. + +; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: +; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit +; SI: ds_write_b32 +; SI: s_waitcnt + +; SI-NEXT: {{^}}[[FLOW]]: +; SI-NEXT: s_or_saveexec_b64 +; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock +; SI: s_or_b64 exec, exec +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit -store: +then: %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid store i32 999, i32 addrspace(1)* %gep ret void exit: + store volatile i32 7, i32 addrspace(3)* undef ret void } |