Revert r367389 (and follow-up r368404); it caused PR43073.

llvm-svn: 369567
author: Nico Weber <nicolasweber@gmx.de> 2019-08-21 19:53:42 +0000
committer: Nico Weber <nicolasweber@gmx.de> 2019-08-21 19:53:42 +0000
commit: ed18e70c86f6de353cebe0a8faa961a341c74d27 (patch)
tree: 504dce17a661d8e817fd245c90a45754f2fa0a57 /llvm/lib
parent: dde8a25a4bd0b8e194ebb9277d656f6069c26449 (diff)
download: bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.tar.gz
bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.zip
1 files changed, 76 insertions, 50 deletions
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index 1f15ddb85d4..5717a7102b6 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -1,4 +1,4 @@
-//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
+//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -18,10 +18,13 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Pass.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/PassSupport.h"
@@ -68,7 +71,7 @@ namespace {
     }
 
     LoadInst *getBaseLoad() const {
-      return VecLd.front();
+      return cast<LoadInst>(LHS);
     }
   };
 
@@ -155,11 +158,13 @@ namespace {
     }
   };
 
-  class ARMParallelDSP : public FunctionPass {
+  class ARMParallelDSP : public LoopPass {
     ScalarEvolution   *SE;
     AliasAnalysis     *AA;
     TargetLibraryInfo *TLI;
     DominatorTree     *DT;
+    LoopInfo          *LI;
+    Loop              *L;
     const DataLayout  *DL;
     Module            *M;
     std::map<LoadInst*, LoadInst*> LoadPairs;
@@ -180,38 +185,63 @@ namespace {
     /// products to a 32-bit accumulate operand. Optionally, the instruction can
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
-    bool MatchSMLAD(Function &F);
+    bool MatchSMLAD(Loop *L);
 
   public:
     static char ID;
 
-    ARMParallelDSP() : FunctionPass(ID) { }
+    ARMParallelDSP() : LoopPass(ID) { }
+
+    bool doInitialization(Loop *L, LPPassManager &LPM) override {
+      LoadPairs.clear();
+      WideLoads.clear();
+      return true;
+    }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      FunctionPass::getAnalysisUsage(AU);
+      LoopPass::getAnalysisUsage(AU);
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<ScalarEvolutionWrapperPass>();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<TargetPassConfig>();
-      AU.addPreserved<ScalarEvolutionWrapperPass>();
-      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
       AU.setPreservesCFG();
     }
 
-    bool runOnFunction(Function &F) override {
+    bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
       if (DisableParallelDSP)
         return false;
-      if (skipFunction(F))
+      if (skipLoop(TheLoop))
         return false;
 
+      L = TheLoop;
       SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
       auto &TPC = getAnalysis<TargetPassConfig>();
 
+      BasicBlock *Header = TheLoop->getHeader();
+      if (!Header)
+        return false;
+
+      // TODO: We assume the loop header and latch to be the same block.
+      // This is not a fundamental restriction, but lifting this would just
+      // require more work to do the transformation and then patch up the CFG.
+      if (Header != TheLoop->getLoopLatch()) {
+        LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
+                             "running pass ARMParallelDSP\n");
+        return false;
+      }
+
+      if (!TheLoop->getLoopPreheader())
+        InsertPreheaderForLoop(L, DT, LI, nullptr, true);
+
+      Function &F = *Header->getParent();
       M = F.getParent();
       DL = &M->getDataLayout();
 
@@ -236,10 +266,17 @@ namespace {
         return false;
       }
 
+      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
+
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
 
-      bool Changes = MatchSMLAD(F);
+      if (!RecordMemoryOps(Header)) {
+        LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
+        return false;
+      }
+
+      bool Changes = MatchSMLAD(L);
       return Changes;
     }
   };
@@ -300,8 +337,6 @@ bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) {
 bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
   SmallVector<Instruction*, 8> Writes;
-  LoadPairs.clear();
-  WideLoads.clear();
 
   // Collect loads and instruction that may write to memory. For now we only
   // record loads which are simple, sign-extended and have a single user.
@@ -379,7 +414,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   return LoadPairs.size() > 1;
 }
 
-// The pass needs to identify integer add/sub reductions of 16-bit vector
+// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
 // To use SMLAD:
 // 1) we first need to find integer add then look for this pattern:
@@ -410,13 +445,13 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
 // If loop invariants are used instead of loads, these need to be packed
 // before the loop begins.
 //
-bool ARMParallelDSP::MatchSMLAD(Function &F) {
+bool ARMParallelDSP::MatchSMLAD(Loop *L) {
   // Search recursively back through the operands to find a tree of values that
   // form a multiply-accumulate chain. The search records the Add and Mul
   // instructions that form the reduction and allows us to find a single value
   // to be used as the initial input to the accumlator.
-  std::function<bool(Value*, BasicBlock*, Reduction&)> Search = [&]
-    (Value *V, BasicBlock *BB, Reduction &R) -> bool {
+  std::function<bool(Value*, Reduction&)> Search = [&]
+    (Value *V, Reduction &R) -> bool {
 
     // If we find a non-instruction, try to use it as the initial accumulator
     // value. This may have already been found during the search in which case
@@ -425,9 +460,6 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
     if (!I)
       return R.InsertAcc(V);
 
-    if (I->getParent() != BB)
-      return false;
-
     switch (I->getOpcode()) {
     default:
       break;
@@ -438,8 +470,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
       // Adds should be adding together two muls, or another add and a mul to
       // be within the mac chain. One of the operands may also be the
       // accumulator value at which point we should stop searching.
-      bool ValidLHS = Search(I->getOperand(0), BB, R);
-      bool ValidRHS = Search(I->getOperand(1), BB, R);
+      bool ValidLHS = Search(I->getOperand(0), R);
+      bool ValidRHS = Search(I->getOperand(1), R);
       if (!ValidLHS && !ValidLHS)
         return false;
       else if (ValidLHS && ValidRHS) {
@@ -465,40 +497,36 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
       return false;
     }
     case Instruction::SExt:
-      return Search(I->getOperand(0), BB, R);
+      return Search(I->getOperand(0), R);
     }
     return false;
   };
 
   bool Changed = false;
+  SmallPtrSet<Instruction*, 4> AllAdds;
+  BasicBlock *Latch = L->getLoopLatch();
 
-  for (auto &BB : F) {
-    SmallPtrSet<Instruction*, 4> AllAdds;
-    if (!RecordMemoryOps(&BB))
+  for (Instruction &I : reverse(*Latch)) {
+    if (I.getOpcode() != Instruction::Add)
       continue;
 
-    for (Instruction &I : reverse(BB)) {
-      if (I.getOpcode() != Instruction::Add)
-        continue;
-
-      if (AllAdds.count(&I))
-        continue;
+    if (AllAdds.count(&I))
+      continue;
 
-      const auto *Ty = I.getType();
-      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
-        continue;
+    const auto *Ty = I.getType();
+    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+      continue;
 
-      Reduction R(&I);
-      if (!Search(&I, &BB, R))
-        continue;
+    Reduction R(&I);
+    if (!Search(&I, R))
+      continue;
 
-      if (!CreateParallelPairs(R))
-        continue;
+    if (!CreateParallelPairs(R))
+      continue;
 
-      InsertParallelMACs(R);
-      Changed = true;
-      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
-    }
+    InsertParallelMACs(R);
+    Changed = true;
+    AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
   }
 
   return Changed;
@@ -696,15 +724,13 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
   // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
   // TODO: Support big-endian as well.
   Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
-  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
-  BaseSExt->replaceAllUsesWith(NewBaseSExt);
+  BaseSExt->setOperand(0, Bottom);
 
   IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
   Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
   Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
-  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
-  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
+  OffsetSExt->setOperand(0, Trunc);
 
   WideLoads.emplace(std::make_pair(Base,
                                    std::make_unique<WidenedLoad>(Loads, WideLoad)));
@@ -718,6 +744,6 @@ Pass *llvm::createARMParallelDSPPass() {
 char ARMParallelDSP::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
-                "Transform functions to use DSP intrinsics", false, false)
+                "Transform loops to use DSP intrinsics", false, false)
 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
-                "Transform functions to use DSP intrinsics", false, false)
+                "Transform loops to use DSP intrinsics", false, false)
author	Nico Weber <nicolasweber@gmx.de>	2019-08-21 19:53:42 +0000
committer	Nico Weber <nicolasweber@gmx.de>	2019-08-21 19:53:42 +0000
commit	ed18e70c86f6de353cebe0a8faa961a341c74d27 (patch)
tree	504dce17a661d8e817fd245c90a45754f2fa0a57 /llvm/lib
parent	dde8a25a4bd0b8e194ebb9277d656f6069c26449 (diff)
download	bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.tar.gz bcm5719-llvm-ed18e70c86f6de353cebe0a8faa961a341c74d27.zip