Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/ARM/ARMParallelDSP.cpp | 358 |
1 file changed, 174 insertions(+), 184 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 901753713f6..beb44fb12e9 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -63,21 +63,16 @@ namespace {
     Instruction *Root;
     ValueList AllValues;
     MemInstList VecLd;    // List of all load instructions.
-    MemLocList MemLocs;   // All memory locations read by this tree.
+    MemInstList Loads;
    bool ReadOnly = true;

     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
     virtual ~OpChain() = default;

-    void SetMemoryLocations() {
-      const auto Size = LocationSize::unknown();
+    void PopulateLoads() {
       for (auto *V : AllValues) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          if (I->mayWriteToMemory())
-            ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
-            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
-        }
+        if (auto *Ld = dyn_cast<LoadInst>(V))
+          Loads.push_back(Ld);
       }
     }
@@ -140,12 +135,11 @@ namespace {
     std::map<LoadInst*, LoadInst*> LoadPairs;
     std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

-    bool RecordSequentialLoads(BasicBlock *BB);
+    bool RecordMemoryOps(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                            SmallVectorImpl<LoadInst*> &Loads,
-                            IntegerType *LoadTy);
+    LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                             IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
     Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
                                  SmallVectorImpl<LoadInst*> &VecLd1,
@@ -164,6 +158,12 @@ namespace {

     ARMParallelDSP() : LoopPass(ID) { }

+    bool doInitialization(Loop *L, LPPassManager &LPM) override {
+      LoadPairs.clear();
+      WideLoads.clear();
+      return true;
+    }
+
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       LoopPass::getAnalysisUsage(AU);
       AU.addRequired<AssumptionCacheTracker>();
@@ -228,7 +228,7 @@ namespace {

       if (!ST->isLittle()) {
         LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
-                          "ARMParallelDSP\n");
+                          << "ARMParallelDSP\n");
         return false;
       }
@@ -237,7 +237,7 @@ namespace {
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

-      if (!RecordSequentialLoads(Header)) {
+      if (!RecordMemoryOps(Header)) {
         LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
         return false;
       }
@@ -314,11 +314,18 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
   return true;
 }

-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
+  SmallVector<Instruction*, 8> Writes;
+
+  // Collect loads and instructions that may write to memory. For now we only
+  // record loads which are simple, sign-extended and have a single user.
+  // TODO: Allow zero-extended loads.
   for (auto &I : *BB) {
+    if (I.mayWriteToMemory())
+      Writes.push_back(&I);
     auto *Ld = dyn_cast<LoadInst>(&I);
     if (!Ld || !Ld->isSimple() || !Ld->hasOneUse() ||
         !isa<SExtInst>(Ld->user_back()))
@@ -326,13 +333,54 @@ bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
     Loads.push_back(Ld);
   }

-  for (auto *Ld0 : Loads) {
-    for (auto *Ld1 : Loads) {
-      if (Ld0 == Ld1)
+  using InstSet = std::set<Instruction*>;
+  using DepMap = std::map<Instruction*, InstSet>;
+  DepMap RAWDeps;
+
+  // Record any writes that may alias a load.
+  const auto Size = LocationSize::unknown();
+  for (auto Read : Loads) {
+    for (auto Write : Writes) {
+      MemoryLocation ReadLoc =
+        MemoryLocation(Read->getPointerOperand(), Size);
+
+      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+                                         ModRefInfo::ModRef)))
         continue;
+      if (DT->dominates(Write, Read))
+        RAWDeps[Read].insert(Write);
+    }
+  }

-      if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
-        LoadPairs[Ld0] = Ld1;
+  // Check that there isn't a write between the two loads which would
+  // prevent them from being safely merged.
+  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+    LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+
+    if (RAWDeps.count(Dominated)) {
+      InstSet &WritesBefore = RAWDeps[Dominated];
+
+      for (auto Before : WritesBefore) {
+
+        // We can't move the second load backward, past a write, to merge
+        // with the first load.
+        if (DT->dominates(Dominator, Before))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  // Record base, offset load pairs.
+  for (auto *Base : Loads) {
+    for (auto *Offset : Loads) {
+      if (Base == Offset)
+        continue;
+
+      if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
+          SafeToPair(Base, Offset)) {
+        LoadPairs[Base] = Offset;
         break;
       }
     }
@@ -442,9 +490,9 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
   for (auto &Pair : Reduction.PMACPairs) {
     BinOpChain *PMul0 = Pair.first;
     BinOpChain *PMul1 = Pair.second;
-    LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; PMul0->Root->dump();
-               dbgs() << "- "; PMul1->Root->dump());
+    LLVM_DEBUG(dbgs() << "Found parallel MACs:\n"
+               << "- " << *PMul0->Root << "\n"
+               << "- " << *PMul1->Root << "\n");

     Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
                           InsertAfter);
@@ -459,54 +507,6 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
   return false;
 }

-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
-                            ReductionList &Reductions) {
-  RecurrenceDescriptor RecDesc;
-  const bool HasFnNoNaNAttr =
-    F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-  const BasicBlock *Latch = TheLoop->getLoopLatch();
-
-  for (PHINode &Phi : Header->phis()) {
-    const auto *Ty = Phi.getType();
-    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
-      continue;
-
-    const bool IsReduction =
-      RecurrenceDescriptor::AddReductionVar(&Phi,
-                                            RecurrenceDescriptor::RK_IntegerAdd,
-                                            TheLoop, HasFnNoNaNAttr, RecDesc);
-    if (!IsReduction)
-      continue;
-
-    Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
-    if (!Acc)
-      continue;
-
-    Reductions.push_back(Reduction(&Phi, Acc));
-  }
-
-  LLVM_DEBUG(
-    dbgs() << "\nAccumulating integer additions (reductions) found:\n";
-    for (auto &R : Reductions) {
-      dbgs() << "- "; R.Phi->dump();
-      dbgs() << "-> "; R.AccIntAdd->dump();
-    }
-  );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
-                            Instruction *Mul,
-                            Value *MulOp0, Value *MulOp1) {
-  assert(Mul->getOpcode() == Instruction::Mul &&
-         "expected mul instruction");
-  ValueList LHS;
-  ValueList RHS;
-  if (IsNarrowSequence<16>(MulOp0, LHS) &&
-      IsNarrowSequence<16>(MulOp1, RHS)) {
-    Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
-  }
-}
-
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
@@ -528,8 +528,14 @@ static void MatchParallelMACSequences(Reduction &R,
     case Instruction::Mul: {
       Value *MulOp0 = I->getOperand(0);
       Value *MulOp1 = I->getOperand(1);
-      if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
-        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+      if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1)) {
+        ValueList LHS;
+        ValueList RHS;
+        if (IsNarrowSequence<16>(MulOp0, LHS) &&
+            IsNarrowSequence<16>(MulOp1, RHS)) {
+          Candidates.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+        }
+      }
       return false;
     }
     case Instruction::SExt:
@@ -543,52 +549,6 @@ static void MatchParallelMACSequences(Reduction &R,
                     << Candidates.size() << " candidates.\n");
 }

-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
-                            Instructions &Writes) {
-  for (auto &I : *Header) {
-    if (I.mayReadFromMemory())
-      Reads.push_back(&I);
-    if (I.mayWriteToMemory())
-      Writes.push_back(&I);
-  }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
-  LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
-    // At the moment, we allow only simple chains that only consist of reads,
-    // accumulate their result with an integer add, and thus that don't write
-    // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
-      return true;
-
-    // Now for all writes in the basic block, check that they don't alias with
-    // the memory locations accessed by our MAC-chain:
-    for (auto *I : Writes) {
-      LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
-        if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
-                                          ModRefInfo::ModRef))) {
-          LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
-          return true;
-        }
-      }
-    }
-  }
-
-  LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
-  return false;
-}
-
 static bool CheckMACMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
@@ -597,7 +557,7 @@ static bool CheckMACMemory(OpChainList &Candidates) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-    C->SetMemoryLocations();
+    C->PopulateLoads();

     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
@@ -643,14 +603,36 @@ static bool CheckMACMemory(OpChainList &Candidates) {
 // before the loop begins.
 //
 bool ARMParallelDSP::MatchSMLAD(Function &F) {
-  BasicBlock *Header = L->getHeader();
-  LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
-             dbgs() << "Header block:\n"; Header->dump();
-             dbgs() << "Loop info:\n\n"; L->dump());
-
-  bool Changed = false;
+  auto FindReductions = [&](ReductionList &Reductions) {
+    RecurrenceDescriptor RecDesc;
+    const bool HasFnNoNaNAttr =
+      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+    BasicBlock *Latch = L->getLoopLatch();
+
+    for (PHINode &Phi : Latch->phis()) {
+      const auto *Ty = Phi.getType();
+      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+        continue;
+
+      const bool IsReduction = RecurrenceDescriptor::AddReductionVar(
+        &Phi, RecurrenceDescriptor::RK_IntegerAdd, L, HasFnNoNaNAttr, RecDesc);
+
+      if (!IsReduction)
+        continue;
+
+      Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+      if (!Acc)
+        continue;
+
+      Reductions.push_back(Reduction(&Phi, Acc));
+    }
+    return !Reductions.empty();
+  };
+
   ReductionList Reductions;
-  MatchReductions(F, L, Header, Reductions);
+  if (!FindReductions(Reductions))
+    return false;

   for (auto &R : Reductions) {
     OpChainList MACCandidates;
@@ -666,72 +648,79 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
                dbgs() << "\n";);
   }

-  // Collect all instructions that may read or write memory. Our alias
-  // analysis checks bail out if any of these instructions aliases with an
-  // instruction from the MAC-chain.
-  Instructions Reads, Writes;
-  AliasCandidates(Header, Reads, Writes);
-
+  bool Changed = false;
+  // Check whether statements in the basic block that write to memory alias
+  // with the memory locations accessed by the MAC-chains.
   for (auto &R : Reductions) {
-    if (AreAliased(AA, Reads, Writes, R.MACCandidates))
-      return false;
     CreateParallelMACPairs(R);
     Changed |= InsertParallelMACs(R);
   }

-  LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
   return Changed;
 }

-LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                                        SmallVectorImpl<LoadInst*> &Loads,
-                                        IntegerType *LoadTy) {
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                                         IntegerType *LoadTy) {
   assert(Loads.size() == 2 && "currently only support widening two loads");
-
-  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
-  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
+
+  LoadInst *Base = Loads[0];
+  LoadInst *Offset = Loads[1];
+
+  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
+  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
+
+  assert((BaseSExt && OffsetSExt)
+         && "Loads should have a single, extending, user");
+
+  std::function<void(Value*, Value*)> MoveBefore =
+    [&](Value *A, Value *B) -> void {
+      if (!isa<Instruction>(A) || !isa<Instruction>(B))
+        return;
+
+      auto *Source = cast<Instruction>(A);
+      auto *Sink = cast<Instruction>(B);
+
+      if (DT->dominates(Source, Sink) ||
+          Source->getParent() != Sink->getParent() ||
+          isa<PHINode>(Source) || isa<PHINode>(Sink))
+        return;
+
+      Source->moveBefore(Sink);
+      for (auto &U : Source->uses())
+        MoveBefore(Source, U.getUser());
+    };
+
+  // Insert the load at the point of the original dominating load.
+  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+                          ++BasicBlock::iterator(DomLoad));
+
+  // Bitcast the pointer to a wider type and create the wide load, while
+  // making sure to maintain the original alignment as this prevents ldrd
+  // from being generated when it could be illegal due to memory alignment.
+  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
   LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
-                                             Loads[0]->getAlignment());
-  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
-  Instruction *SExt0 = dyn_cast<SExtInst>(Loads[0]->user_back());
-  Instruction *SExt1 = dyn_cast<SExtInst>(Loads[1]->user_back());
-
-  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
-         "Loads should have a single, extending, user");
-
-  std::function<void(Instruction*, Instruction*)> MoveAfter =
-    [&](Instruction* Source, Instruction* Sink) -> void {
-      if (DT->dominates(Source, Sink) ||
-          Source->getParent() != Sink->getParent() ||
-          isa<PHINode>(Source) || isa<PHINode>(Sink))
-        return;
-
-      Sink->moveAfter(Source);
-      for (auto &U : Sink->uses())
-        MoveAfter(Sink, cast<Instruction>(U.getUser()));
-    };
+                                             Base->getAlignment());
+
+  // Make sure everything is in the correct order in the basic block.
+  MoveBefore(Base->getPointerOperand(), VecPtr);
+  MoveBefore(VecPtr, WideLoad);

   // From the wide load, create two values that equal the original two loads.
-  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
-  SExt0->setOperand(0, Bottom);
-  if (auto *I = dyn_cast<Instruction>(Bottom)) {
-    I->moveAfter(WideLoad);
-    MoveAfter(I, SExt0);
-  }
+  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  // TODO: Support big-endian as well.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+  BaseSExt->setOperand(0, Bottom);

-  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
-  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
-  if (auto *I = dyn_cast<Instruction>(Top))
-    MoveAfter(WideLoad, I);
+  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+  OffsetSExt->setOperand(0, Trunc);

-  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
-  SExt1->setOperand(0, Trunc);
-  if (auto *I = dyn_cast<Instruction>(Trunc))
-    MoveAfter(I, SExt1);
-
-  WideLoads.emplace(std::make_pair(Loads[0],
+  WideLoads.emplace(std::make_pair(Base,
                                    make_unique<WidenedLoad>(Loads, WideLoad)));
   return WideLoad;
 }
@@ -748,15 +737,13 @@ Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
                     << "- " << *Acc << "\n"
                     << "- Exchange: " << Exchange << "\n");

-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                              ++BasicBlock::iterator(InsertAfter));
-
   // Replace the reduction chain with an intrinsic call
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
   LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
-    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+    WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
   LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
-    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+    WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
   Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
@@ -767,6 +754,9 @@ Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
     SMLAD = Acc->getType()->isIntegerTy(32) ?
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                              ++BasicBlock::iterator(InsertAfter));
   CallInst *Call = Builder.CreateCall(SMLAD, Args);
   NumSMLAD++;
   return Call;
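
For context on what the pass matches and emits (not part of the commit): the source-level pattern is a sign-extending 16-bit multiply-accumulate reduction, and the arm_smlad intrinsic performs two such multiplies per 32-bit operand. A minimal C++ sketch, assuming the standard SMLAD semantics; the function names here are illustrative only:

#include <cstdint>

// The kind of loop MatchSMLAD looks for: a 32-bit integer add recurrence
// (the reduction PHI) fed by products of sign-extended i16 loads.
int32_t dot_product(const int16_t *a, const int16_t *b, int n) {
  int32_t acc = 0;
  for (int i = 0; i < n; ++i)
    acc += int32_t(a[i]) * int32_t(b[i]);  // mul(sext(load), sext(load))
  return acc;
}

// Assumed model of arm_smlad: multiply the bottom halves and the top halves
// of the two packed operands, accumulating both products into 'acc'.
// arm_smlald is the same with a 64-bit accumulator; the Exchange variant
// (smladx) swaps the halves of the second operand first.
int32_t smlad_model(uint32_t a, uint32_t b, int32_t acc) {
  int32_t lo = int16_t(a) * int16_t(b);
  int32_t hi = int16_t(a >> 16) * int16_t(b >> 16);
  return acc + lo + hi;
}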
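CreateWideLoad recovers the two original values from the single wide load with a trunc for the bottom half and an lshr plus trunc for the top half; this equivalence only holds on little-endian targets, which is why the pass bails out when !ST->isLittle(). A small standalone sketch of that equivalence (illustrative, not from the patch):

#include <cstdint>
#include <cstring>

// Two sequential narrow loads...
void narrow_loads(const int16_t *p, int16_t &lo, int16_t &hi) {
  lo = p[0];  // corresponds to the Base load (Loads[0])
  hi = p[1];  // corresponds to the Offset load (Loads[1])
}

// ...replaced by one wide load plus trunc / lshr+trunc, mirroring the
// CreateTrunc and CreateLShr calls in CreateWideLoad. The memcpy models
// the i32 load through the bitcast pointer.
void widened_load(const int16_t *p, int16_t &lo, int16_t &hi) {
  uint32_t wide;
  std::memcpy(&wide, p, sizeof wide);  // single i32 load (little-endian)
  lo = int16_t(wide);                  // trunc        -> bottom half
  hi = int16_t(wide >> 16);            // lshr 16 + trunc -> top half
}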
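The RAWDeps/SafeToPair logic in RecordMemoryOps exists because widening effectively hoists the dominated load up to the dominating one, which is unsound if a may-aliasing store sits between them. A hypothetical loop (not from the commit) showing the hazard being guarded against:

#include <cstdint>

// If 'dst' can alias 'src', merging the loads of src[i] and src[i + 1]
// into one wide load would move the src[i + 1] read above the store to
// dst[i], changing the value observed. SafeToPair rejects such a pair
// because the recorded write is dominated by the first load.
void hazard(int16_t *dst, const int16_t *src, int n, int32_t &acc) {
  for (int i = 0; i < n - 1; i += 2) {
    int32_t x = src[i];      // dominating load
    dst[i] = 0;              // intervening write that may alias src[i + 1]
    int32_t y = src[i + 1];  // dominated load: must not be hoisted
    acc += x * y;
  }
}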