diff options
7 files changed, 18 insertions, 236 deletions
diff --git a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h index 80f3e5fdcd4..7e827e8737b 100644 --- a/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h +++ b/llvm/include/llvm/Analysis/LoopUnrollAnalyzer.h @@ -89,7 +89,6 @@ private: bool visitLoad(LoadInst &I); bool visitCastInst(CastInst &I); bool visitCmpInst(CmpInst &I); - bool visitPHINode(PHINode &PN); }; } #endif diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp index a044e6bd330..fe73fe05724 100644 --- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp +++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp @@ -189,13 +189,3 @@ bool UnrolledInstAnalyzer::visitCmpInst(CmpInst &I) { return Base::visitCmpInst(I); } - -bool UnrolledInstAnalyzer::visitPHINode(PHINode &PN) { - // Run base visitor first. This way we can gather some useful for later - // analysis information. - if (Base::visitPHINode(PN)) - return true; - - // The loop induction PHI nodes are definitionally free. - return PN.getParent() == L->getHeader(); -} diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 27827ffaaa1..95ec12d5cb8 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -185,40 +185,6 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( } namespace { -/// A struct to densely store the state of an instruction after unrolling at -/// each iteration. -/// -/// This is designed to work like a tuple of <Instruction *, int> for the -/// purposes of hashing and lookup, but to be able to associate two boolean -/// states with each key. -struct UnrolledInstState { - Instruction *I; - int Iteration : 30; - unsigned IsFree : 1; - unsigned IsCounted : 1; -}; - -/// Hashing and equality testing for a set of the instruction states. -struct UnrolledInstStateKeyInfo { - typedef DenseMapInfo<Instruction *> PtrInfo; - typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo; - static inline UnrolledInstState getEmptyKey() { - return {PtrInfo::getEmptyKey(), 0, 0, 0}; - } - static inline UnrolledInstState getTombstoneKey() { - return {PtrInfo::getTombstoneKey(), 0, 0, 0}; - } - static inline unsigned getHashValue(const UnrolledInstState &S) { - return PairInfo::getHashValue({S.I, S.Iteration}); - } - static inline bool isEqual(const UnrolledInstState &LHS, - const UnrolledInstState &RHS) { - return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); - } -}; -} - -namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. int UnrolledCost; @@ -252,25 +218,18 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && "The unroll iterations max is too large!"); - // Only analyze inner loops. We can't properly estimate cost of nested loops - // and we won't visit inner loops again anyway. - if (!L->empty()) - return None; - // Don't simulate loops with a big or unknown tripcount if (!UnrollMaxIterationsCountToAnalyze || !TripCount || TripCount > UnrollMaxIterationsCountToAnalyze) return None; SmallSetVector<BasicBlock *, 16> BBWorklist; - SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist; DenseMap<Value *, Constant *> SimplifiedValues; SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. int UnrolledCost = 0; - // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic @@ -278,97 +237,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // unrolling. int RolledDynamicCost = 0; - // We track the simplification of each instruction in each iteration. We use - // this to recursively merge costs into the unrolled cost on-demand so that - // we don't count the cost of any dead code. This is essentially a map from - // <instruction, int> to <bool, bool>, but stored as a densely packed struct. - DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap; - - // A small worklist used to accumulate cost of instructions from each - // observable and reached root in the loop. - SmallVector<Instruction *, 16> CostWorklist; - - // PHI-used worklist used between iterations while accumulating cost. - SmallVector<Instruction *, 4> PHIUsedList; - - // Helper function to accumulate cost for instructions in the loop. - auto AddCostRecursively = [&](Instruction &RootI, int Iteration) { - assert(Iteration >= 0 && "Cannot have a negative iteration!"); - assert(CostWorklist.empty() && "Must start with an empty cost list"); - assert(PHIUsedList.empty() && "Must start with an empty phi used list"); - CostWorklist.push_back(&RootI); - for (;; --Iteration) { - do { - Instruction *I = CostWorklist.pop_back_val(); - - // InstCostMap only uses I and Iteration as a key, the other two values - // don't matter here. - auto CostIter = InstCostMap.find({I, Iteration, 0, 0}); - if (CostIter == InstCostMap.end()) - // If an input to a PHI node comes from a dead path through the loop - // we may have no cost data for it here. What that actually means is - // that it is free. - continue; - auto &Cost = *CostIter; - if (Cost.IsCounted) - // Already counted this instruction. - continue; - - // Mark that we are counting the cost of this instruction now. - Cost.IsCounted = true; - - // If this is a PHI node in the loop header, just add it to the PHI set. - if (auto *PhiI = dyn_cast<PHINode>(I)) - if (PhiI->getParent() == L->getHeader()) { - assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they " - "inherently simplify during unrolling."); - if (Iteration == 0) - continue; - - // Push the incoming value from the backedge into the PHI used list - // if it is an in-loop instruction. We'll use this to populate the - // cost worklist for the next iteration (as we count backwards). - if (auto *OpI = dyn_cast<Instruction>( - PhiI->getIncomingValueForBlock(L->getLoopLatch()))) - if (L->contains(OpI)) - PHIUsedList.push_back(OpI); - continue; - } - - // First accumulate the cost of this instruction. - if (!Cost.IsFree) { - UnrolledCost += TTI.getUserCost(I); - DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration - << "): "); - DEBUG(I->dump()); - } - - // We must count the cost of every operand which is not free, - // recursively. If we reach a loop PHI node, simply add it to the set - // to be considered on the next iteration (backwards!). - for (Value *Op : I->operands()) { - // Check whether this operand is free due to being a constant or - // outside the loop. - auto *OpI = dyn_cast<Instruction>(Op); - if (!OpI || !L->contains(OpI)) - continue; - - // Otherwise accumulate its cost. - CostWorklist.push_back(OpI); - } - } while (!CostWorklist.empty()); - - if (PHIUsedList.empty()) - // We've exhausted the search. - break; - - assert(Iteration > 0 && - "Cannot track PHI-used values past the first iteration!"); - CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end()); - PHIUsedList.clear(); - } - }; - // Ensure that we don't violate the loop structure invariants relied on by // this analysis. assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); @@ -423,32 +291,22 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - // Track this instruction's expected baseline cost when executing the - // rolled loop form. - RolledDynamicCost += TTI.getUserCost(&I); + int InstCost = TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, - // and if the visitor returns true, mark the instruction as free after - // unrolling and continue. - bool IsFree = Analyzer.visit(I); - bool Inserted = InstCostMap.insert({&I, (int)Iteration, IsFree, - /*IsCounted*/ false}) - .second; - (void)Inserted; - assert(Inserted && "Cannot have a state for an unvisited instruction!"); - - if (IsFree) - continue; + // and if the visitor returns false, include this instruction in the + // unrolled cost. + if (!Analyzer.visit(I)) + UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } - // If the instruction might have a side-effect recursively account for - // the cost of it and all the instructions leading up to it. - if (I.mayHaveSideEffects()) - AddCostRecursively(I, Iteration); - - // Can't properly model a cost of a call. - // FIXME: With a proper cost model we should be able to do it. - if(isa<CallInst>(&I)) - return None; + // Also track this instructions expected cost when executing the rolled + // loop form. + RolledDynamicCost += InstCost; // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { @@ -477,8 +335,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); if (L->contains(Succ)) BBWorklist.insert(Succ); - else - ExitWorklist.insert({BB, Succ}); continue; } } @@ -494,8 +350,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, .getCaseSuccessor(); if (L->contains(Succ)) BBWorklist.insert(Succ); - else - ExitWorklist.insert({BB, Succ}); continue; } } @@ -504,8 +358,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, for (BasicBlock *Succ : successors(BB)) if (L->contains(Succ)) BBWorklist.insert(Succ); - else - ExitWorklist.insert({BB, Succ}); } // If we found no optimization opportunities on the first iteration, we @@ -516,23 +368,6 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, return None; } } - - while (!ExitWorklist.empty()) { - BasicBlock *ExitingBB, *ExitBB; - std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val(); - - for (Instruction &I : *ExitBB) { - auto *PN = dyn_cast<PHINode>(&I); - if (!PN) - break; - - Value *Op = PN->getIncomingValueForBlock(ExitingBB); - if (auto *OpI = dyn_cast<Instruction>(Op)) - if (L->contains(OpI)) - AddCostRecursively(*OpI, TripCount - 1); - } - } - DEBUG(dbgs() << "Analysis finished:\n" << "UnrolledCost: " << UnrolledCost << ", " << "RolledDynamicCost: " << RolledDynamicCost << "\n"); diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll index f62141d9c37..5df48e8c380 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll deleted file mode 100644 index 6ee73b6fe4f..00000000000 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 - -; If a load becomes a constant after loop unrolling, we sometimes can simplify -; CFG. This test verifies that we handle such cases. -; After one operand in an instruction is constant-folded and the -; instruction is simplified, the other operand might become dead. -; In this test we have:: -; for i in 1..10: -; r += A[i] * B[i] -; A[i] is 0 almost at every iteration, so there is no need in loading B[i] at -; all. - - -; CHECK-LABEL: @unroll_dce -; CHECK-NOT: br i1 %exitcond, label %for.end, label %for.body -define i32 @unroll_dce(i32* noalias nocapture readonly %b) { -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %iv.0 = phi i64 [ 0, %entry ], [ %iv.1, %for.body ] - %r.0 = phi i32 [ 0, %entry ], [ %r.1, %for.body ] - %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv.0 - %x1 = load i32, i32* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv.0 - %x2 = load i32, i32* %arrayidx2, align 4 - %mul = mul i32 %x1, %x2 - %r.1 = add i32 %mul, %r.0 - %iv.1 = add nuw nsw i64 %iv.0, 1 - %exitcond = icmp eq i64 %iv.1, 10 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret i32 %r.1 -} diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll index 723a384ea2d..bc08029d4f8 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the diff --git a/llvm/unittests/Analysis/UnrollAnalyzer.cpp b/llvm/unittests/Analysis/UnrollAnalyzer.cpp index 6d11ab1f2f1..83d57f52469 100644 --- a/llvm/unittests/Analysis/UnrollAnalyzer.cpp +++ b/llvm/unittests/Analysis/UnrollAnalyzer.cpp @@ -134,7 +134,6 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) { " br label %outer.loop\n" "outer.loop:\n" " %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %outer.loop.latch ]\n" - " %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n" " br label %inner.loop\n" "inner.loop:\n" " %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]\n" @@ -142,6 +141,7 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) { " %exitcond.inner = icmp eq i64 %iv.inner.next, 1000\n" " br i1 %exitcond.inner, label %outer.loop.latch, label %inner.loop\n" "outer.loop.latch:\n" + " %iv.outer.next = add nuw nsw i64 %iv.outer, 1\n" " %exitcond.outer = icmp eq i64 %iv.outer.next, 40\n" " br i1 %exitcond.outer, label %exit, label %outer.loop\n" "exit:\n" @@ -163,15 +163,11 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) { BasicBlock *InnerBody = &*FI++; BasicBlock::iterator BBI = Header->begin(); - BBI++; - Instruction *Y1 = &*BBI; + Instruction *Y1 = &*BBI++; BBI = InnerBody->begin(); - BBI++; - Instruction *Y2 = &*BBI; + Instruction *Y2 = &*BBI++; // Check that we can simplify IV of the outer loop, but can't simplify the IV // of the inner loop if we only know the iteration number of the outer loop. - // - // Y1 is %iv.outer.next, Y2 is %iv.inner.next auto I1 = SimplifiedValuesVector[0].find(Y1); EXPECT_TRUE(I1 != SimplifiedValuesVector[0].end()); auto I2 = SimplifiedValuesVector[0].find(Y2); |