Diffstat (limited to 'llvm/lib/Target/ARM/ARMParallelDSP.cpp')
-rw-r--r--  llvm/lib/Target/ARM/ARMParallelDSP.cpp | 358
1 file changed, 174 insertions(+), 184 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 901753713f6..beb44fb12e9 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -63,21 +63,16 @@ namespace {
Instruction *Root;
ValueList AllValues;
MemInstList VecLd; // List of all load instructions.
- MemLocList MemLocs; // All memory locations read by this tree.
+ MemInstList Loads;
bool ReadOnly = true;
OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
virtual ~OpChain() = default;
- void SetMemoryLocations() {
- const auto Size = LocationSize::unknown();
+ void PopulateLoads() {
for (auto *V : AllValues) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (I->mayWriteToMemory())
- ReadOnly = false;
- if (auto *Ld = dyn_cast<LoadInst>(V))
- MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
- }
+ if (auto *Ld = dyn_cast<LoadInst>(V))
+ Loads.push_back(Ld);
}
}
@@ -140,12 +135,11 @@ namespace {
std::map<LoadInst*, LoadInst*> LoadPairs;
std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
- bool RecordSequentialLoads(BasicBlock *BB);
+ bool RecordMemoryOps(BasicBlock *BB);
bool InsertParallelMACs(Reduction &Reduction);
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
- LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
- SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy);
+ LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy);
void CreateParallelMACPairs(Reduction &R);
Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
SmallVectorImpl<LoadInst*> &VecLd1,
@@ -164,6 +158,12 @@ namespace {
ARMParallelDSP() : LoopPass(ID) { }
+ bool doInitialization(Loop *L, LPPassManager &LPM) override {
+ LoadPairs.clear();
+ WideLoads.clear();
+ return true;
+ }
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
LoopPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
@@ -228,7 +228,7 @@ namespace {
if (!ST->isLittle()) {
LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
- "ARMParallelDSP\n");
+ << "ARMParallelDSP\n");
return false;
}
@@ -237,7 +237,7 @@ namespace {
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
- if (!RecordSequentialLoads(Header)) {
+ if (!RecordMemoryOps(Header)) {
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
return false;
}
@@ -314,11 +314,18 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
return true;
}
-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<LoadInst*, 8> Loads;
+ SmallVector<Instruction*, 8> Writes;
+
+ // Collect loads and instructions that may write to memory. For now we only
+ // record loads which are simple, sign-extended and have a single user.
+ // TODO: Allow zero-extended loads.
for (auto &I : *BB) {
+ if (I.mayWriteToMemory())
+ Writes.push_back(&I);
auto *Ld = dyn_cast<LoadInst>(&I);
if (!Ld || !Ld->isSimple() ||
!Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
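
The filter above can be read as a standalone predicate. A minimal restatement (a sketch, not code from the patch; isCandidateLoad is a hypothetical helper name):

    // Sketch: a load is a widening candidate only if it is simple (not
    // atomic or volatile), has exactly one user, and that user is a
    // sign-extension.
    static bool isCandidateLoad(Instruction &I) {
      auto *Ld = dyn_cast<LoadInst>(&I);
      return Ld && Ld->isSimple() && Ld->hasOneUse() &&
             isa<SExtInst>(Ld->user_back());
    }
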
@@ -326,13 +333,54 @@ bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
Loads.push_back(Ld);
}
- for (auto *Ld0 : Loads) {
- for (auto *Ld1 : Loads) {
- if (Ld0 == Ld1)
+ using InstSet = std::set<Instruction*>;
+ using DepMap = std::map<Instruction*, InstSet>;
+ DepMap RAWDeps;
+
+ // Record any writes that may alias a load.
+ const auto Size = LocationSize::unknown();
+ for (auto Read : Loads) {
+ for (auto Write : Writes) {
+ MemoryLocation ReadLoc =
+ MemoryLocation(Read->getPointerOperand(), Size);
+
+ if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+ ModRefInfo::ModRef)))
continue;
+ if (DT->dominates(Write, Read))
+ RAWDeps[Read].insert(Write);
+ }
+ }
- if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
- LoadPairs[Ld0] = Ld1;
+ // Check that there is no write between the two loads which would
+ // prevent them from being safely merged.
+ auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+ LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+ LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+
+ if (RAWDeps.count(Dominated)) {
+ InstSet &WritesBefore = RAWDeps[Dominated];
+
+ for (auto Before : WritesBefore) {
+
+ // We can't move the second load backward, past a write, to merge
+ // with the first load.
+ if (DT->dominates(Dominator, Before))
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Record base, offset load pairs.
+ for (auto *Base : Loads) {
+ for (auto *Offset : Loads) {
+ if (Base == Offset)
+ continue;
+
+ if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
+ SafeToPair(Base, Offset)) {
+ LoadPairs[Base] = Offset;
break;
}
}
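
To see what SafeToPair guards against, consider an illustrative fragment (not from the patch; p is a hypothetical pointer that may alias a):

    // The store through p dominates the second load of a[] but not the
    // first, so pairing the two loads would hoist the second load above
    // a may-aliasing write; RecordMemoryOps must reject this pair.
    acc += int32_t(a[i]) * int32_t(b[i]);
    *p = 0;                         // potentially aliasing write
    acc += int32_t(a[i + 1]) * int32_t(b[i + 1]);
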
@@ -442,9 +490,9 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
for (auto &Pair : Reduction.PMACPairs) {
BinOpChain *PMul0 = Pair.first;
BinOpChain *PMul1 = Pair.second;
- LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
- dbgs() << "- "; PMul0->Root->dump();
- dbgs() << "- "; PMul1->Root->dump());
+ LLVM_DEBUG(dbgs() << "Found parallel MACs:\n"
+ << "- " << *PMul0->Root << "\n"
+ << "- " << *PMul1->Root << "\n");
Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
InsertAfter);
@@ -459,54 +507,6 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
return false;
}
-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
- ReductionList &Reductions) {
- RecurrenceDescriptor RecDesc;
- const bool HasFnNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
- const BasicBlock *Latch = TheLoop->getLoopLatch();
-
- for (PHINode &Phi : Header->phis()) {
- const auto *Ty = Phi.getType();
- if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
- continue;
-
- const bool IsReduction =
- RecurrenceDescriptor::AddReductionVar(&Phi,
- RecurrenceDescriptor::RK_IntegerAdd,
- TheLoop, HasFnNoNaNAttr, RecDesc);
- if (!IsReduction)
- continue;
-
- Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
- if (!Acc)
- continue;
-
- Reductions.push_back(Reduction(&Phi, Acc));
- }
-
- LLVM_DEBUG(
- dbgs() << "\nAccumulating integer additions (reductions) found:\n";
- for (auto &R : Reductions) {
- dbgs() << "- "; R.Phi->dump();
- dbgs() << "-> "; R.AccIntAdd->dump();
- }
- );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
- Instruction *Mul,
- Value *MulOp0, Value *MulOp1) {
- assert(Mul->getOpcode() == Instruction::Mul &&
- "expected mul instruction");
- ValueList LHS;
- ValueList RHS;
- if (IsNarrowSequence<16>(MulOp0, LHS) &&
- IsNarrowSequence<16>(MulOp1, RHS)) {
- Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
- }
-}
-
static void MatchParallelMACSequences(Reduction &R,
OpChainList &Candidates) {
Instruction *Acc = R.AccIntAdd;
@@ -528,8 +528,14 @@ static void MatchParallelMACSequences(Reduction &R,
case Instruction::Mul: {
Value *MulOp0 = I->getOperand(0);
Value *MulOp1 = I->getOperand(1);
- if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
- AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+ if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
+ ValueList LHS;
+ ValueList RHS;
+ if (IsNarrowSequence<16>(MulOp0, LHS) &&
+ IsNarrowSequence<16>(MulOp1, RHS)) {
+ Candidates.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+ }
+ }
return false;
}
case Instruction::SExt:
@@ -543,52 +549,6 @@ static void MatchParallelMACSequences(Reduction &R,
<< Candidates.size() << " candidates.\n");
}
-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
- Instructions &Writes) {
- for (auto &I : *Header) {
- if (I.mayReadFromMemory())
- Reads.push_back(&I);
- if (I.mayWriteToMemory())
- Writes.push_back(&I);
- }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
- Instructions &Writes, OpChainList &MACCandidates) {
- LLVM_DEBUG(dbgs() << "Alias checks:\n");
- for (auto &MAC : MACCandidates) {
- LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
- // At the moment, we allow only simple chains that only consist of reads,
- // accumulate their result with an integer add, and thus that don't write
- // memory, and simply bail if they do.
- if (!MAC->ReadOnly)
- return true;
-
- // Now for all writes in the basic block, check that they don't alias with
- // the memory locations accessed by our MAC-chain:
- for (auto *I : Writes) {
- LLVM_DEBUG(dbgs() << "- "; I->dump());
- assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
- for (auto &MemLoc : MAC->MemLocs) {
- if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
- ModRefInfo::ModRef))) {
- LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
- return true;
- }
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
- return false;
-}
-
static bool CheckMACMemory(OpChainList &Candidates) {
for (auto &C : Candidates) {
// A mul has 2 operands, and a narrow op consists of a sext and a load; thus
@@ -597,7 +557,7 @@ static bool CheckMACMemory(OpChainList &Candidates) {
LLVM_DEBUG(dbgs() << "Operand list too short.\n");
return false;
}
- C->SetMemoryLocations();
+ C->PopulateLoads();
ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
@@ -643,14 +603,36 @@ static bool CheckMACMemory(OpChainList &Candidates) {
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
- dbgs() << "Header block:\n"; Header->dump();
- dbgs() << "Loop info:\n\n"; L->dump());
- bool Changed = false;
+ auto FindReductions = [&](ReductionList &Reductions) {
+ RecurrenceDescriptor RecDesc;
+ const bool HasFnNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+ BasicBlock *Latch = L->getLoopLatch();
+
+ for (PHINode &Phi : Latch->phis()) {
+ const auto *Ty = Phi.getType();
+ if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+ continue;
+
+ const bool IsReduction = RecurrenceDescriptor::AddReductionVar(
+ &Phi, RecurrenceDescriptor::RK_IntegerAdd, L, HasFnNoNaNAttr, RecDesc);
+
+ if (!IsReduction)
+ continue;
+
+ Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+ if (!Acc)
+ continue;
+
+ Reductions.push_back(Reduction(&Phi, Acc));
+ }
+ return !Reductions.empty();
+ };
+
ReductionList Reductions;
- MatchReductions(F, L, Header, Reductions);
+ if (!FindReductions(Reductions))
+ return false;
for (auto &R : Reductions) {
OpChainList MACCandidates;
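
In source terms, the reductions the FindReductions lambda accepts have the classic dot-product shape; a hedged illustration (not from the patch):

    // A 32-bit integer accumulator carried by a loop PHI and updated by
    // an integer add in the latch, fed by sign-extended i16 multiplies.
    int32_t DotProduct(const int16_t *A, const int16_t *B, int N) {
      int32_t Acc = 0;              // becomes the reduction PHI
      for (int i = 0; i < N; ++i)
        Acc += int32_t(A[i]) * int32_t(B[i]);
      return Acc;
    }
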
@@ -666,72 +648,79 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
dbgs() << "\n";);
}
- // Collect all instructions that may read or write memory. Our alias
- // analysis checks bail out if any of these instructions aliases with an
- // instruction from the MAC-chain.
- Instructions Reads, Writes;
- AliasCandidates(Header, Reads, Writes);
-
+ bool Changed = false;
+ // Check whether statements in the basic block that write to memory alias
+ // with the memory locations accessed by the MAC-chains.
for (auto &R : Reductions) {
- if (AreAliased(AA, Reads, Writes, R.MACCandidates))
- return false;
CreateParallelMACPairs(R);
Changed |= InsertParallelMACs(R);
}
- LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
return Changed;
}
-LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
- SmallVectorImpl<LoadInst*> &Loads,
- IntegerType *LoadTy) {
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+ IntegerType *LoadTy) {
assert(Loads.size() == 2 && "currently only support widening two loads");
-
- const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
- Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
+
+ LoadInst *Base = Loads[0];
+ LoadInst *Offset = Loads[1];
+
+ Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
+ Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
+
+ assert((BaseSExt && OffsetSExt)
+ && "Loads should have a single, extending, user");
+
+ std::function<void(Value*, Value*)> MoveBefore =
+ [&](Value *A, Value *B) -> void {
+ if (!isa<Instruction>(A) || !isa<Instruction>(B))
+ return;
+
+ auto *Source = cast<Instruction>(A);
+ auto *Sink = cast<Instruction>(B);
+
+ if (DT->dominates(Source, Sink) ||
+ Source->getParent() != Sink->getParent() ||
+ isa<PHINode>(Source) || isa<PHINode>(Sink))
+ return;
+
+ Source->moveBefore(Sink);
+ for (auto &U : Source->uses())
+ MoveBefore(Source, U.getUser());
+ };
+
+ // Insert the load at the point of the original dominating load.
+ LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+ IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+ ++BasicBlock::iterator(DomLoad));
+
+ // Bitcast the pointer to a wider type and create the wide load, while
+ // maintaining the original alignment: this prevents an ldrd from being
+ // generated when memory alignment would make it illegal.
+ const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+ Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
- Loads[0]->getAlignment());
- // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
- Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
- Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
-
- assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
- "Loads should have a single, extending, user");
-
- std::function<void(Instruction*, Instruction*)> MoveAfter =
- [&](Instruction* Source, Instruction* Sink) -> void {
- if (DT->dominates(Source, Sink) ||
- Source->getParent() != Sink->getParent() ||
- isa<PHINode>(Source) || isa<PHINode>(Sink))
- return;
-
- Sink->moveAfter(Source);
- for (auto &U : Sink->uses())
- MoveAfter(Sink, cast<Instruction>(U.getUser()));
- };
+ Base->getAlignment());
+
+ // Make sure everything is in the correct order in the basic block.
+ MoveBefore(Base->getPointerOperand(), VecPtr);
+ MoveBefore(VecPtr, WideLoad);
// From the wide load, create two values that equal the original two loads.
- Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
- SExt0->setOperand(0, Bottom);
- if (auto *I = dyn_cast<Instruction>(Bottom)) {
- I->moveAfter(WideLoad);
- MoveAfter(I, SExt0);
- }
+ // Loads[0] needs a trunc while Loads[1] needs an lshr and a trunc.
+ // TODO: Support big-endian as well.
+ Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+ BaseSExt->setOperand(0, Bottom);
- IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
- Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+ IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+ Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
- if (auto *I = dyn_cast<Instruction>(Top))
- MoveAfter(WideLoad, I);
+ Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+ OffsetSExt->setOperand(0, Trunc);
- Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
- SExt1->setOperand(0, Trunc);
- if (auto *I = dyn_cast<Instruction>(Trunc))
- MoveAfter(I, SExt1);
-
- WideLoads.emplace(std::make_pair(Loads[0],
+ WideLoads.emplace(std::make_pair(Base,
make_unique<WidenedLoad>(Loads, WideLoad)));
return WideLoad;
}
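
On a little-endian target the trunc and lshr feeding the two sign-extends recover exactly the original narrow values; a scalar model of the transformation (a sketch assuming little-endian, with a hypothetical BasePtr):

    uint32_t Wide = *reinterpret_cast<const uint32_t *>(BasePtr); // wide load
    int16_t Bottom = int16_t(Wide);        // trunc: replaces Base
    int16_t Top    = int16_t(Wide >> 16);  // lshr + trunc: replaces Offset
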
@@ -748,15 +737,13 @@ Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
<< "- " << *Acc << "\n"
<< "- Exchange: " << Exchange << "\n");
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
-
// Replace the reduction chain with an intrinsic call
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
- WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+ WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
- WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+ WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
Value* Args[] = { WideLd0, WideLd1, Acc };
Function *SMLAD = nullptr;
if (Exchange)
@@ -767,6 +754,9 @@ Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
SMLAD = Acc->getType()->isIntegerTy(32) ?
Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+ IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+ ++BasicBlock::iterator(InsertAfter));
CallInst *Call = Builder.CreateCall(SMLAD, Args);
NumSMLAD++;
return Call;
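
For reference, smlad sums the two signed 16x16 products of its operands' halves into the accumulator, and smladx (the Exchange form) swaps the halves of the second operand; smlald is the analogous variant with a 64-bit accumulator. A scalar model (sketch):

    int32_t SMLAD(int32_t A, int32_t B, int32_t Acc, bool Exchange) {
      int16_t ALo = int16_t(A), AHi = int16_t(A >> 16);
      int16_t BLo = int16_t(B), BHi = int16_t(B >> 16);
      return Exchange ? Acc + ALo * BHi + AHi * BLo
                      : Acc + ALo * BLo + AHi * BHi;
    }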