author    Jonas Paulsson <paulsson@linux.vnet.ibm.com>  2017-05-24 13:42:56 +0000
committer Jonas Paulsson <paulsson@linux.vnet.ibm.com>  2017-05-24 13:42:56 +0000
commit    8624b7e1cefbfc718bbe4dab10a8eac6fcd7db3b (patch)
tree      e97d9a9febe3a34ee549699cf9d2854e16f80f39 /llvm/lib/Transforms/Vectorize
parent    081b5a1e9d1d483f65936bedab4caf6b555ab3e2 (diff)
[LoopVectorizer] Let target prefer scalar addressing computations.
The loop vectorizer usually vectorizes any instruction it can and then
extracts the elements for a scalarized use. On SystemZ, all elements
containing addresses must be extracted into address registers (GRs). Since
this extraction is not free, it is better to have the address in a suitable
register to begin with. By forcing address arithmetic instructions and loads
of addresses to be scalar after vectorization, two benefits result:

* No need to extract the register
* LSR optimizations trigger (LSR isn't handling vector addresses currently)

Benchmarking shows improvements on SystemZ with this new behaviour. Any other
target could try this by returning false in the new hook
prefersVectorizedAddressing().

Review: Renato Golin, Elena Demikhovsky, Ulrich Weigand
https://reviews.llvm.org/D32422

llvm-svn: 303744
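As a minimal sketch of how a target would opt in to this behaviour (the class
name MyTargetTTIImpl is a placeholder, not part of this patch; only the hook
prefersVectorizedAddressing() itself is what D32422 adds, and SystemZ returns
false from the equivalent override in its own TTI implementation):

    // Hypothetical target TTI override ("MyTargetTTIImpl" is a placeholder).
    // Returning false tells the loop vectorizer to keep address computations,
    // and loads of addresses, scalar instead of extracting vector elements
    // into address registers.
    bool MyTargetTTIImpl::prefersVectorizedAddressing() { return false; }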
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 74
1 file changed, 74 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa8b613f934..2b83b8426d1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2092,6 +2092,10 @@ private:
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -5576,6 +5580,13 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
}
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ if (ForcedScalars.count(VF))
+ for (auto *I : ForcedScalars.find(VF)->second)
+ Worklist.insert(I);
+
// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
// expansion step in collectLoopUniforms(); however, here we're only
@@ -7151,9 +7162,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (VF > 1 && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);
+ // Forced scalars do not have any scalarization overhead.
+ if (VF > 1 && ForcedScalars.count(VF) &&
+ ForcedScalars.find(VF)->second.count(I))
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
+ // Note: Even if all instructions are scalarized, return true if any memory
+ // accesses appear in the loop to get benefits from address folding etc.
bool TypeNotScalarized =
VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
return VectorizationCostTy(C, TypeNotScalarized);
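An illustrative note on the forced-scalar cost in the hunk above (numbers are
hypothetical, not from the patch): with VF = 4 and a scalar cost of 1, the
early return yields a cost of 4, i.e. VF independent scalar copies with no
extract/insert overhead charged. A sketch of the formula, where ScalarCost
stands in for getInstructionCost(I, 1).first:

    // Sketch only: cost of a forced-scalar instruction at vectorization
    // factor VF; no scalarization (extract/insert) overhead is added.
    unsigned forcedScalarCost(unsigned ScalarCost, unsigned VF) {
      return ScalarCost * VF;
    }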
@@ -7230,6 +7248,62 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
setWideningDecision(&I, VF, Decision, Cost);
}
}
+
+ // Make sure that any load of address and any other address computation
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
+ for (auto *I : AddrDefs)
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second == true)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+ // Setting the desired widening decision should ideally be handled by
+ // cost functions, but since this involves the task of finding out
+ // if the loaded register is involved in an address computation, it is
+ // instead changed here when we know this is the case.
+ if (getWideningDecision(I, VF) == CM_Widen)
+ // Scalarize a widened load of address.
+ setWideningDecision(I, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(I, 1)));
+ else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
+ setWideningDecision(Member, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(Member, 1)));
+ }
+ }
+ } else
+ // Make sure I gets scalarized and a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
}
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
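For reference, a standalone sketch (plain C++, not LLVM code; Inst is a
stand-in for llvm::Instruction) of the backward worklist closure used in
setCostBasedWideningDecision above: starting from the pointer operands of
memory instructions, it transitively pulls in every same-block, non-PHI
instruction feeding them, so the entire address computation is forced scalar.

    #include <set>
    #include <vector>

    struct Inst {
      std::vector<Inst *> Operands; // instructions feeding this one
      int Block = 0;                // stand-in for the containing basic block
      bool IsPHI = false;
    };

    // Collect the address computation, staying within each instruction's
    // own block and stopping at PHI nodes, mirroring the AddrDefs
    // expansion in the patch.
    std::set<Inst *> collectAddrDefs(const std::vector<Inst *> &Roots) {
      std::set<Inst *> AddrDefs(Roots.begin(), Roots.end());
      std::vector<Inst *> Worklist(Roots.begin(), Roots.end());
      while (!Worklist.empty()) {
        Inst *I = Worklist.back();
        Worklist.pop_back();
        for (Inst *Op : I->Operands)
          if (Op && Op->Block == I->Block && !Op->IsPHI &&
              AddrDefs.insert(Op).second)
            Worklist.push_back(Op);
      }
      return AddrDefs;
    }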