author | Sam Parker <sam.parker@arm.com> | 2019-02-07 13:32:54 +0000
committer | Sam Parker <sam.parker@arm.com> | 2019-02-07 13:32:54 +0000
commit | 67756c09f21ada07a3686601538e88da2ad1771e (patch)
tree | 0107d915bd5ec41cee6080448ef0fd2a674f28e2 /llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
parent | bb3b372aa118ff010fd044d0431ceda984475b10 (diff)
[LSR] Generate cross iteration indexes
Modify GenerateConstantOffsetsImpl to create offsets that can be used
by indexed addressing modes. If formulae can be generated which
result in the constant offset being the same size as the recurrence
step, we can generate a pre-indexed access. This allows the pointer
to be updated via the single pre-indexed access so that (hopefully) no
add/subs are required to update it for the next iteration. For small
cores, this can significantly improve the performance of DSP-like
loops.
Differential Revision: https://reviews.llvm.org/D55373
llvm-svn: 353403
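As a rough illustration of the idea (this example is not part of the patch; the function names, data, and the Arm mnemonic are invented for flavor), the second loop below biases the base pointer by one step so each access can advance the pointer and load in a single pre-indexed operation, which is exactly what the generated `G - step` formula enables:

```cpp
// Hedged sketch of the access pattern the patch targets. On a target with
// pre-indexed addressing (e.g. Arm's "ldr r0, [r1, #4]!"), the *++base
// access below lowers to a single load-with-writeback, leaving no separate
// add/sub to update the pointer between iterations.
#include <cstdint>
#include <iostream>

// Plain form: the address is rebuilt from the induction variable each
// iteration, typically costing an extra add on small cores.
int64_t sum_plain(const int32_t *data, int n) {
  int64_t s = 0;
  for (int i = 0; i < n; ++i)
    s += data[i];
  return s;
}

// Pre-indexed form: 'base' points one element before the data (the G - step
// bias), so the first access is *(base + 1), matching the ((G - 8) + 8),+,8
// shape in the patch's comment; every access both advances and dereferences
// the pointer.
int64_t sum_preindexed(const int32_t *base, int n) {
  int64_t s = 0;
  while (n-- > 0)
    s += *++base;
  return s;
}

int main() {
  int32_t buf[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; // buf[0] is a padding slot
  std::cout << sum_plain(buf + 1, 8) << '\n';   // 36
  std::cout << sum_preindexed(buf, 8) << '\n';  // 36: biased base = data - 1
}
```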
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 90
1 file changed, 67 insertions, 23 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index eb6b1f24a7f..04a25052635 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -154,6 +154,10 @@ static cl::opt<bool> FilterSameScaledReg(
     cl::desc("Narrow LSR search space by filtering non-optimal formulae"
              " with the same ScaledReg and Scale"));

+static cl::opt<bool> EnableBackedgeIndexing(
+  "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+  cl::desc("Enable the generation of cross iteration indexed memops"));
+
 static cl::opt<unsigned> ComplexityLimit(
   "lsr-complexity-limit", cl::Hidden,
   cl::init(std::numeric_limits<uint16_t>::max()),
@@ -1052,12 +1056,12 @@ public:
   void dump() const;

 private:
-  void RateRegister(const SCEV *Reg,
+  void RateRegister(const Formula &F, const SCEV *Reg,
                     SmallPtrSetImpl<const SCEV *> &Regs,
                     const Loop *L,
                     ScalarEvolution &SE, DominatorTree &DT,
                     const TargetTransformInfo &TTI);
-  void RatePrimaryRegister(const SCEV *Reg,
+  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                            SmallPtrSetImpl<const SCEV *> &Regs,
                            const Loop *L,
                            ScalarEvolution &SE, DominatorTree &DT,
@@ -1208,7 +1212,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  Instruction *Fixup = nullptr);

 /// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                         SmallPtrSetImpl<const SCEV *> &Regs,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT,
@@ -1235,16 +1239,24 @@ void Cost::RateRegister(const SCEV *Reg,
     }

     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+        TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+
+      // If the step size matches the base offset, we could use pre-indexed
+      // addressing.
+      if (TTI.shouldFavorBackedgeIndex(L)) {
+        if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+          if (Step->getAPInt() == F.BaseOffset)
+            LoopCost = 0;
+      }
+
+      if (TTI.shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-              SE.isLoopInvariant(LoopStart, L))
-            LoopCost = 0;
+              SE.isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1254,7 +1266,7 @@ void Cost::RateRegister(const SCEV *Reg,
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
+        RateRegister(F, AR->getOperand(1), Regs, L, SE, DT, TTI);
         if (isLoser())
           return;
       }
@@ -1278,7 +1290,7 @@ void Cost::RateRegister(const SCEV *Reg,
 /// Record this register in the set. If we haven't seen it before, rate
 /// it. Optional LoserRegs provides a way to declare any formula that refers to
 /// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
                                const Loop *L,
                                ScalarEvolution &SE, DominatorTree &DT,
@@ -1289,7 +1301,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(Reg, Regs, L, SE, DT, TTI);
+    RateRegister(F, Reg, Regs, L, SE, DT, TTI);
     if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
@@ -1313,7 +1325,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
     if (isLoser())
       return;
   }
@@ -1322,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
     if (isLoser())
       return;
   }
@@ -1889,6 +1901,7 @@ class LSRInstance {
   LoopInfo &LI;
   const TargetTransformInfo &TTI;
   Loop *const L;
+  bool FavorBackedgeIndex = false;
   bool Changed = false;

   /// This is the insert position that the current loop's induction variable
@@ -2803,7 +2816,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool isProfitableChain(IVChain &Chain,
                               SmallPtrSetImpl<Instruction*> &Users,
-                              ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+                              ScalarEvolution &SE) {
   if (StressIVChain)
     return true;

@@ -3063,7 +3076,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+                           ChainUsersVec[UsersIdx].FarUsers, SE))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -3077,7 +3090,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
   LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-
+
   for (const IVInc &Inc : Chain) {
     LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
     auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3737,10 +3750,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
                    LU.AccessTy, F)) {
       // Add the offset to the base register.
@@ -3760,7 +3774,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(

       (void)InsertFormula(LU, LUIdx, F);
     }
-  }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate pre-inc
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we generate a G - 8 base which would require the first access
+  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // that a single pre-indexed access can be generated to become the new base
+  // pointer for each iteration of the loop, resulting in no extra add/sub
+  // instructions for pointer updating.
+  if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+    if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
+  }
+
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);

   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
@@ -4417,7 +4459,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
 /// When there are many registers for expressions like A, A+1, A+2, etc.,
 /// allocate a single register for them.
 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
-  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
     return;

   LLVM_DEBUG(
@@ -5378,7 +5420,9 @@ void LSRInstance::ImplementSolution(
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                          DominatorTree &DT, LoopInfo &LI,
                          const TargetTransformInfo &TTI)
-    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L),
+      FavorBackedgeIndex(EnableBackedgeIndexing &&
+                         TTI.shouldFavorBackedgeIndex(L)) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
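To make the new `GenerateOffset` flow concrete, here is a standalone sketch (not LLVM code; the worklist values and the step are invented for illustration) of how the worklist is now walked twice: first with the recurrence step subtracted, producing the backedge-indexing candidates such as `G - step`, and then unmodified, so all the original formulae are still generated:

```cpp
// Minimal sketch of the offset biasing added to GenerateConstantOffsetsImpl.
// Assumptions: a constant recurrence step of 8 and a small worklist of
// candidate offsets; the real code operates on SCEV expressions and LSR
// formulae instead of printing.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> Worklist = {0, 8, 16}; // invented offsets
  const int64_t Step = 8;                           // assumed constant step

  // Stand-in for the patch's GenerateOffset lambda, which builds and
  // legality-checks a formula with the given base offset.
  auto GenerateOffset = [](int64_t Offset) {
    std::cout << "candidate base: G + " << Offset << '\n';
  };

  // Backedge-indexing candidates come first, mirroring the patched code:
  // offsets biased by -Step yield bases a pre-indexed access can reuse.
  for (int64_t Offset : Worklist)
    GenerateOffset(Offset - Step);

  // The original candidates are still generated afterwards.
  for (int64_t Offset : Worklist)
    GenerateOffset(Offset);
}
```

Note how the biased pass emits `G + -8` for the first access with offset 0: that is the `G - 8` base from the comment in the patch, whose first use becomes `((G - 8) + 8)` and can be lowered as a pre-indexed access.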