2 files changed, 137 insertions, 35 deletions
diff --git a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
index 3cf248a3114..8447dc87069 100644
--- a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
@@ -218,9 +218,11 @@ struct DivergencePropagator {
   template <typename SuccessorIterable>
   std::unique_ptr<ConstBlockSet>
   computeJoinPoints(const BasicBlock &RootBlock,
-                    SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) {
+                    SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
     assert(JoinBlocks);
 
+    LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " << (ParentLoop ? ParentLoop->getName() : "<null>") << "\n" );
+
     // bootstrap with branch targets
     for (const auto *SuccBlock : NodeSuccessors) {
       DefMap.emplace(SuccBlock, SuccBlock);
@@ -228,13 +230,19 @@ struct DivergencePropagator {
       if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
         // immediate loop exit from node.
         ReachedLoopExits.insert(SuccBlock);
-        continue;
       } else {
         // regular successor
         PendingUpdates.insert(SuccBlock);
       }
     }
 
+    LLVM_DEBUG(
+      dbgs() << "SDA: rpo order:\n";
+      for (const auto * RpoBlock : FuncRPOT) {
+        dbgs() << "- " << RpoBlock->getName() << "\n";
+      }
+    );
+
     auto ItBeginRPO = FuncRPOT.begin();
 
     // skip until term (TODO RPOT won't let us start at @term directly)
@@ -245,16 +253,18 @@ struct DivergencePropagator {
 
     // propagate definitions at the immediate successors of the node in RPO
     auto ItBlockRPO = ItBeginRPO;
-    while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+    while ((++ItBlockRPO != ItEndRPO) &&
+           !PendingUpdates.empty()) {
       const auto *Block = *ItBlockRPO;
+      LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n");
 
-      // skip @block if not pending update
+      // skip Block if not pending update
       auto ItPending = PendingUpdates.find(Block);
       if (ItPending == PendingUpdates.end())
         continue;
       PendingUpdates.erase(ItPending);
 
-      // propagate definition at @block to its successors
+      // propagate definition at Block to its successors
       auto ItDef = DefMap.find(Block);
       const auto *DefBlock = ItDef->second;
       assert(DefBlock);
@@ -278,6 +288,8 @@ struct DivergencePropagator {
       }
     }
 
+    LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs()));
+
     // We need to know the definition at the parent loop header to decide
     // whether the definition at the header is different from the definition at
     // the loop exits, which would indicate a divergent loop exits.
@@ -292,24 +304,17 @@ struct DivergencePropagator {
     // |
     // proper exit from both loops
     //
-    // D post-dominates B as it is the only proper exit from the "A loop".
-    // If C has a divergent branch, propagation will therefore stop at D.
-    // That implies that B will never receive a definition.
-    // But that definition can only be the same as at D (D itself in thise case)
-    // because all paths to anywhere have to pass through D.
-    //
-    const BasicBlock *ParentLoopHeader =
-        ParentLoop ? ParentLoop->getHeader() : nullptr;
-    if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
-      DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
-    }
-
     // analyze reached loop exits
     if (!ReachedLoopExits.empty()) {
+      const BasicBlock *ParentLoopHeader =
+          ParentLoop ? ParentLoop->getHeader() : nullptr;
+
       assert(ParentLoop);
-      const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+      auto ItHeaderDef = DefMap.find(ParentLoopHeader);
+      const auto *HeaderDefBlock = (ItHeaderDef == DefMap.end()) ? nullptr : ItHeaderDef->second;
+
       LLVM_DEBUG(printDefs(dbgs()));
-      assert(HeaderDefBlock && "no definition in header of carrying loop");
+      assert(HeaderDefBlock && "no definition at header of carrying loop");
 
       for (const auto *ExitBlock : ReachedLoopExits) {
         auto ItExitDef = DefMap.find(ExitBlock);
@@ -339,19 +344,10 @@ const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
     return *ItCached->second;
   }
 
-  // dont propagte beyond the immediate post dom of the loop
-  const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Loop.getHeader()));
-  const auto *IpdNode = PdNode->getIDom();
-  const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
-  while (PdBoundBlock && Loop.contains(PdBoundBlock)) {
-    IpdNode = IpdNode->getIDom();
-    PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
-  }
-
   // compute all join points
   DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
   auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
-      *Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock);
+      *Loop.getHeader(), LoopExits, Loop.getParentLoop());
 
   auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
   assert(ItInserted.second);
@@ -370,16 +366,11 @@ SyncDependenceAnalysis::join_blocks(const Instruction &Term) {
   if (ItCached != CachedBranchJoins.end())
     return *ItCached->second;
 
-  // dont propagate beyond the immediate post dominator of the branch
-  const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Term.getParent()));
-  const auto *IpdNode = PdNode->getIDom();
-  const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
-
   // compute all join points
   DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
   const auto &TermBlock = *Term.getParent();
   auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
-      TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock);
+      TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
 
   auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
   assert(ItInserted.second);
diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
new file mode 100644
index 00000000000..cb3e42de363
--- /dev/null
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/b42473-r1-crash.ll
@@ -0,0 +1,111 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+declare i32 @gf2(i32)
+declare i32 @gf1(i32)
+
+define  void @tw1(i32 addrspace(4)* noalias nocapture readonly %A, i32 addrspace(4)* noalias nocapture %B) local_unnamed_addr #2 {
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'tw1':
+; CHECK: DIVERGENT: i32 addrspace(4)* %A
+; CHECK: DIVERGENT: i32 addrspace(4)* %B
+entry:
+; CHECK: DIVERGENT:       %call = tail call i32 @gf2(i32 0) #0
+; CHECK: DIVERGENT:       %cmp = icmp ult i32 %call, 16
+; CHECK: DIVERGENT:       br i1 %cmp, label %if.then, label %new_exit
+  %call = tail call  i32 @gf2(i32 0) #3
+  %cmp = icmp ult i32 %call, 16
+  br i1 %cmp, label %if.then, label %new_exit
+
+if.then:
+; CHECK: DIVERGENT:       %call1 = tail call i32 @gf1(i32 0) #0
+; CHECK: DIVERGENT:       %arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
+; CHECK: DIVERGENT:       %0 = load i32, i32 addrspace(4)* %arrayidx, align 4
+; CHECK: DIVERGENT:       %cmp225 = icmp sgt i32 %0, 0
+; CHECK: DIVERGENT:       %arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
+; CHECK: DIVERGENT:       br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge
+  %call1 = tail call  i32 @gf1(i32 0) #4
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
+  %0 = load i32, i32 addrspace(4)* %arrayidx, align 4
+  %cmp225 = icmp sgt i32 %0, 0
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
+  br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge
+
+while.body.preheader:
+  br label %while.body
+
+if.then.while.end_crit_edge:
+; CHECK: DIVERGENT:       %.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
+  %.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
+  br label %while.end
+
+while.body:
+; CHECK-NOT: DIVERGENT:                  %i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
+; CHECK: DIVERGENT:       %call3 = tail call i32 @gf1(i32 0) #0
+; CHECK: DIVERGENT:       %cmp4 = icmp ult i32 %call3, 10
+; CHECK: DIVERGENT:       %arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
+; CHECK: DIVERGENT:       %1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
+; CHECK: DIVERGENT:       br i1 %cmp4, label %if.then5, label %if.else
+  %i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
+  %call3 = tail call  i32 @gf1(i32 0) #4
+  %cmp4 = icmp ult i32 %call3, 10
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
+  %1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
+  br i1 %cmp4, label %if.then5, label %if.else
+
+if.then5:
+; CHECK: DIVERGENT:       %mul = shl i32 %1, 1
+; CHECK: DIVERGENT:       %2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
+; CHECK: DIVERGENT:       %add = add nsw i32 %2, %mul
+  %mul = shl i32 %1, 1
+  %2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
+  %add = add nsw i32 %2, %mul
+  br label %if.end
+
+if.else:
+; CHECK: DIVERGENT:       %mul9 = shl i32 %1, 2
+; CHECK: DIVERGENT:       %3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
+; CHECK: DIVERGENT:       %add11 = add nsw i32 %3, %mul9
+  %mul9 = shl i32 %1, 2
+  %3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
+  %add11 = add nsw i32 %3, %mul9
+  br label %if.end
+
+if.end:
+; CHECK: DIVERGENT:       %storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
+; CHECK: DIVERGENT:       store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
+; CHECK-NOT: DIVERGENT:                  %inc = add nuw nsw i32 %i.026, 1
+; CHECK: DIVERGENT:       %exitcond = icmp ne i32 %inc, %0
+; CHECK: DIVERGENT:       br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit
+  %storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
+  store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
+  %inc = add nuw nsw i32 %i.026, 1
+  %exitcond = icmp ne i32 %inc, %0
+  br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit
+
+if.end.while.body_crit_edge:
+  br label %while.body
+
+while.end.loopexit:
+; CHECK: DIVERGENT:       %storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
+  %storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
+  br label %while.end
+
+while.end:
+; CHECK: DIVERGENT:       %4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
+; CHECK: DIVERGENT:       %i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
+; CHECK: DIVERGENT:       %sub = sub nsw i32 %4, %i.0.lcssa
+; CHECK: DIVERGENT:       store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
+  %4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
+  %i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
+  %sub = sub nsw i32 %4, %i.0.lcssa
+  store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
+  br label %new_exit
+
+new_exit:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind readnone }