summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
author Nico Weber <nicolasweber@gmx.de> 2019-08-21 19:53:42 +0000
committer Nico Weber <nicolasweber@gmx.de> 2019-08-21 19:53:42 +0000
commit ed18e70c86f6de353cebe0a8faa961a341c74d27 (patch)
tree 504dce17a661d8e817fd245c90a45754f2fa0a57 /llvm/lib
parent dde8a25a4bd0b8e194ebb9277d656f6069c26449 (diff)
download bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.tar.gz
bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.zip
Revert r367389 (and follow-up r368404); it caused PR43073.
llvm-svn: 369567
Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 126
1 files changed, 76 insertions, 50 deletions
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 1f15ddb85d4..5717a7102b6 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,4 +1,4 @@
-//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
+//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -18,10 +18,13 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
@@ -68,7 +71,7 @@ namespace {
}
LoadInst *getBaseLoad() const {
- return VecLd.front();
+ return cast<LoadInst>(LHS);
}
};
@@ -155,11 +158,13 @@ namespace {
}
};
- class ARMParallelDSP : public FunctionPass {
+ class ARMParallelDSP : public LoopPass {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
DominatorTree *DT;
+ LoopInfo *LI;
+ Loop *L;
const DataLayout *DL;
Module *M;
std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -180,38 +185,63 @@ namespace {
/// products to a 32-bit accumulate operand. Optionally, the instruction can
/// exchange the halfwords of the second operand before performing the
/// arithmetic.
- bool MatchSMLAD(Function &F);
+ bool MatchSMLAD(Loop *L);
public:
static char ID;
- ARMParallelDSP() : FunctionPass(ID) { }
+ ARMParallelDSP() : LoopPass(ID) { }
+
+ bool doInitialization(Loop *L, LPPassManager &LPM) override {
+ LoadPairs.clear();
+ WideLoads.clear();
+ return true;
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
+ LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.setPreservesCFG();
}
- bool runOnFunction(Function &F) override {
+ bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
if (DisableParallelDSP)
return false;
- if (skipFunction(F))
+ if (skipLoop(TheLoop))
return false;
+ L = TheLoop;
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &TPC = getAnalysis<TargetPassConfig>();
+ BasicBlock *Header = TheLoop->getHeader();
+ if (!Header)
+ return false;
+
+ // TODO: We assume the loop header and latch to be the same block.
+ // This is not a fundamental restriction, but lifting this would just
+ // require more work to do the transformation and then patch up the CFG.
+ if (Header != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
+ "running pass ARMParallelDSP\n");
+ return false;
+ }
+
+ if (!TheLoop->getLoopPreheader())
+ InsertPreheaderForLoop(L, DT, LI, nullptr, true);
+
+ Function &F = *Header->getParent();
M = F.getParent();
DL = &M->getDataLayout();
@@ -236,10 +266,17 @@ namespace {
return false;
}
+ LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
+
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- bool Changes = MatchSMLAD(F);
+ if (!RecordMemoryOps(Header)) {
+ LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+ return false;
+ }
+
+ bool Changes = MatchSMLAD(L);
return Changes;
}
};
@@ -300,8 +337,6 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
SmallVector<Instruction*, 8> Writes;
- LoadPairs.clear();
- WideLoads.clear();
// Collect loads and instruction that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -379,7 +414,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
return LoadPairs.size() > 1;
}
-// The pass needs to identify integer add/sub reductions of 16-bit vector
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
@@ -410,13 +445,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
-bool ARMParallelDSP::MatchSMLAD(Function &F) {
+bool ARMParallelDSP::MatchSMLAD(Loop *L) {
// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
- std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
- (Value *V, BasicBlock *BB, Reduction &R) -> bool {
+ std::function<bool(Value*, Reduction&)> Search = [&]
+ (Value *V, Reduction &R) -> bool {
// If we find a non-instruction, try to use it as the initial accumulator
// value. This may have already been found during the search in which case
@@ -425,9 +460,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
if (!I)
return R.InsertAcc(V);
- if (I->getParent() != BB)
- return false;
-
switch (I->getOpcode()) {
default:
break;
@@ -438,8 +470,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
// Adds should be adding together two muls, or another add and a mul to
// be within the mac chain. One of the operands may also be the
// accumulator value at which point we should stop searching.
- bool ValidLHS = Search(I->getOperand(0), BB, R);
- bool ValidRHS = Search(I->getOperand(1), BB, R);
+ bool ValidLHS = Search(I->getOperand(0), R);
+ bool ValidRHS = Search(I->getOperand(1), R);
if (!ValidLHS && !ValidLHS)
return false;
else if (ValidLHS && ValidRHS) {
@@ -465,40 +497,36 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
return false;
}
case Instruction::SExt:
- return Search(I->getOperand(0), BB, R);
+ return Search(I->getOperand(0), R);
}
return false;
};
bool Changed = false;
+ SmallPtrSet<Instruction*, 4> AllAdds;
+ BasicBlock *Latch = L->getLoopLatch();
- for (auto &BB : F) {
- SmallPtrSet<Instruction*, 4> AllAdds;
- if (!RecordMemoryOps(&BB))
+ for (Instruction &I : reverse(*Latch)) {
+ if (I.getOpcode() != Instruction::Add)
continue;
- for (Instruction &I : reverse(BB)) {
- if (I.getOpcode() != Instruction::Add)
- continue;
-
- if (AllAdds.count(&I))
- continue;
+ if (AllAdds.count(&I))
+ continue;
- const auto *Ty = I.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
+ const auto *Ty = I.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
- Reduction R(&I);
- if (!Search(&I, &BB, R))
- continue;
+ Reduction R(&I);
+ if (!Search(&I, R))
+ continue;
- if (!CreateParallelPairs(R))
- continue;
+ if (!CreateParallelPairs(R))
+ continue;
- InsertParallelMACs(R);
- Changed = true;
- AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
- }
+ InsertParallelMACs(R);
+ Changed = true;
+ AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
}
return Changed;
@@ -696,15 +724,13 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
// Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
// TODO: Support big-endian as well.
Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
- Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
- BaseSExt->replaceAllUsesWith(NewBaseSExt);
+ BaseSExt->setOperand(0, Bottom);
IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
- Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
- OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+ OffsetSExt->setOperand(0, Trunc);
WideLoads.emplace(std::make_pair(Base,
std::make_unique<WidenedLoad>(Loads, WideLoad)));
@@ -718,6 +744,6 @@ Pass *llvm::createARMParallelDSPPass() {
char ARMParallelDSP::ID = 0;
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
- "Transform functions to use DSP intrinsics", false, false)
+ "Transform loops to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
- "Transform functions to use DSP intrinsics", false, false)
+ "Transform loops to use DSP intrinsics", false, false)
OpenPOWER on IntegriCloud