diff options
| author | Nico Weber <nicolasweber@gmx.de> | 2019-08-21 19:53:42 +0000 |
|---|---|---|
| committer | Nico Weber <nicolasweber@gmx.de> | 2019-08-21 19:53:42 +0000 |
| commit | ed18e70c86f6de353cebe0a8faa961a341c74d27 (patch) | |
| tree | 504dce17a661d8e817fd245c90a45754f2fa0a57 /llvm/lib | |
| parent | dde8a25a4bd0b8e194ebb9277d656f6069c26449 (diff) | |
| download | bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.tar.gz bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.zip | |
Revert r367389 (and follow-up r368404); it caused PR43073.
llvm-svn: 369567
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMParallelDSP.cpp | 126 |
1 files changed, 76 insertions, 50 deletions
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 1f15ddb85d4..5717a7102b6 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// +//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,10 +18,13 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -68,7 +71,7 @@ namespace { } LoadInst *getBaseLoad() const { - return VecLd.front(); + return cast<LoadInst>(LHS); } }; @@ -155,11 +158,13 @@ namespace { } }; - class ARMParallelDSP : public FunctionPass { + class ARMParallelDSP : public LoopPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; + LoopInfo *LI; + Loop *L; const DataLayout *DL; Module *M; std::map<LoadInst*, LoadInst*> LoadPairs; @@ -180,38 +185,63 @@ namespace { /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. - bool MatchSMLAD(Function &F); + bool MatchSMLAD(Loop *L); public: static char ID; - ARMParallelDSP() : FunctionPass(ID) { } + ARMParallelDSP() : LoopPass(ID) { } + + bool doInitialization(Loop *L, LPPassManager &LPM) override { + LoadPairs.clear(); + WideLoads.clear(); + return true; + } void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); + LoopPass::getAnalysisUsage(AU); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); AU.setPreservesCFG(); } - bool runOnFunction(Function &F) override { + bool runOnLoop(Loop *TheLoop, LPPassManager &) override { if (DisableParallelDSP) return false; - if (skipFunction(F)) + if (skipLoop(TheLoop)) return false; + L = TheLoop; SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &TPC = getAnalysis<TargetPassConfig>(); + BasicBlock *Header = TheLoop->getHeader(); + if (!Header) + return false; + + // TODO: We assume the loop header and latch to be the same block. + // This is not a fundamental restriction, but lifting this would just + // require more work to do the transformation and then patch up the CFG. + if (Header != TheLoop->getLoopLatch()) { + LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " + "running pass ARMParallelDSP\n"); + return false; + } + + if (!TheLoop->getLoopPreheader()) + InsertPreheaderForLoop(L, DT, LI, nullptr, true); + + Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -236,10 +266,17 @@ namespace { return false; } + LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); + LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - bool Changes = MatchSMLAD(F); + if (!RecordMemoryOps(Header)) { + LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); + return false; + } + + bool Changes = MatchSMLAD(L); return Changes; } }; @@ -300,8 +337,6 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) { bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector<LoadInst*, 8> Loads; SmallVector<Instruction*, 8> Writes; - LoadPairs.clear(); - WideLoads.clear(); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. @@ -379,7 +414,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { return LoadPairs.size() > 1; } -// The pass needs to identify integer add/sub reductions of 16-bit vector +// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -410,13 +445,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // If loop invariants are used instead of loads, these need to be packed // before the loop begins. // -bool ARMParallelDSP::MatchSMLAD(Function &F) { +bool ARMParallelDSP::MatchSMLAD(Loop *L) { // Search recursively back through the operands to find a tree of values that // form a multiply-accumulate chain. The search records the Add and Mul // instructions that form the reduction and allows us to find a single value // to be used as the initial input to the accumlator. - std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&] - (Value *V, BasicBlock *BB, Reduction &R) -> bool { + std::function<bool(Value*, Reduction&)> Search = [&] + (Value *V, Reduction &R) -> bool { // If we find a non-instruction, try to use it as the initial accumulator // value. This may have already been found during the search in which case @@ -425,9 +460,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { if (!I) return R.InsertAcc(V); - if (I->getParent() != BB) - return false; - switch (I->getOpcode()) { default: break; @@ -438,8 +470,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { // Adds should be adding together two muls, or another add and a mul to // be within the mac chain. One of the operands may also be the // accumulator value at which point we should stop searching. - bool ValidLHS = Search(I->getOperand(0), BB, R); - bool ValidRHS = Search(I->getOperand(1), BB, R); + bool ValidLHS = Search(I->getOperand(0), R); + bool ValidRHS = Search(I->getOperand(1), R); if (!ValidLHS && !ValidLHS) return false; else if (ValidLHS && ValidRHS) { @@ -465,40 +497,36 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { return false; } case Instruction::SExt: - return Search(I->getOperand(0), BB, R); + return Search(I->getOperand(0), R); } return false; }; bool Changed = false; + SmallPtrSet<Instruction*, 4> AllAdds; + BasicBlock *Latch = L->getLoopLatch(); - for (auto &BB : F) { - SmallPtrSet<Instruction*, 4> AllAdds; - if (!RecordMemoryOps(&BB)) + for (Instruction &I : reverse(*Latch)) { + if (I.getOpcode() != Instruction::Add) continue; - for (Instruction &I : reverse(BB)) { - if (I.getOpcode() != Instruction::Add) - continue; - - if (AllAdds.count(&I)) - continue; + if (AllAdds.count(&I)) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - Reduction R(&I); - if (!Search(&I, &BB, R)) - continue; + Reduction R(&I); + if (!Search(&I, R)) + continue; - if (!CreateParallelPairs(R)) - continue; + if (!CreateParallelPairs(R)) + continue; - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); - } + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); } return Changed; @@ -696,15 +724,13 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); - BaseSExt->replaceAllUsesWith(NewBaseSExt); + BaseSExt->setOperand(0, Bottom); IntegerType *OffsetTy = cast<IntegerType>(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); - OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + OffsetSExt->setOperand(0, Trunc); WideLoads.emplace(std::make_pair(Base, std::make_unique<WidenedLoad>(Loads, WideLoad))); @@ -718,6 +744,6 @@ Pass *llvm::createARMParallelDSPPass() { char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform functions to use DSP intrinsics", false, false) + "Transform loops to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform functions to use DSP intrinsics", false, false) + "Transform loops to use DSP intrinsics", false, false) |

