Reapply r298620: [LV] Vectorize GEPs

This patch reapplies r298620. The original patch was reverted because of two issues. First, the patch exposed a bug in InstCombine that caused the Chromium builds to fail (PR32414). This issue was fixed in r299017. Second, the patch introduced a bug in the vectorizer's scalars analysis that caused test suite builds to fail on SystemZ. The scalars analysis was too aggressive and marked a memory instruction scalar, even though it was going to be vectorized. This issue has been fixed in the current patch and several new test cases for the scalars analysis have been added. llvm-svn: 299770
author: Matthew Simpson <mssimpso@codeaurora.org> 2017-04-07 14:15:34 +0000
committer: Matthew Simpson <mssimpso@codeaurora.org> 2017-04-07 14:15:34 +0000
commit: 11fe2e9f2b787646bbb4c5e8dc7698abf79761ce (patch)
tree: 2f95deeb422bd8f1f77691c001cc2b75face0805 /llvm/lib
parent: 9f6a5cd91d3acf3393b0bc8a2f40bc099718129e (diff)
download: bcm5719-llvm-11fe2e9f2b787646bbb4c5e8dc7698abf79761ce.tar.gz
bcm5719-llvm-11fe2e9f2b787646bbb4c5e8dc7698abf79761ce.zip
1 files changed, 206 insertions, 79 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af2d3f53064..814baca35f6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -277,32 +277,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-///   bitcast double** %var to i64* - can be skipped
-///   bitcast double** %var to i8*  - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
-  if (isa<GetElementPtrInst>(Ptr))
-    return cast<GetElementPtrInst>(Ptr);
-
-  if (isa<BitCastInst>(Ptr) &&
-      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
-    Type *BitcastTy = Ptr->getType();
-    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
-    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
-      return nullptr;
-    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
-    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
-    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
-    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
-      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
-  }
-  return nullptr;
-}
-
 // FIXME: The following helper functions have multiple implementations
 // in the project. They can be effectively organized in a common Load/Store
 // utilities unit.
@@ -2996,40 +2970,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   VectorParts VectorGep;
 
   // Handle consecutive loads/stores.
-  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   if (ConsecutiveStride) {
     Ptr = getScalarValue(Ptr, 0, 0);
   } else {
     // At this point we should vector version of GEP for Gather or Scatter
     assert(CreateGatherScatter && "The instruction should be scalarized");
-    if (Gep) {
-      // Vectorizing GEP, across UF parts. We want to get a vector value for base
-      // and each index that's defined inside the loop, even if it is
-      // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
-      // scalar.
-      SmallVector<VectorParts, 4> OpsV;
-      for (Value *Op : Gep->operands()) {
-        Instruction *SrcInst = dyn_cast<Instruction>(Op);
-        if (SrcInst && OrigLoop->contains(SrcInst))
-          OpsV.push_back(getVectorValue(Op));
-        else
-          OpsV.push_back(VectorParts(UF, Op));
-      }
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Ops;
-        Value *GEPBasePtr = OpsV[0][Part];
-        for (unsigned i = 1; i < Gep->getNumOperands(); i++)
-          Ops.push_back(OpsV[i][Part]);
-        Value *NewGep =  Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
-        cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
-        assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
-        NewGep =
-            Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
-        VectorGep.push_back(NewGep);
-      }
-    } else
-      VectorGep = getVectorValue(Ptr);
+    VectorGep = getVectorValue(Ptr);
   }
 
   VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -4789,7 +4735,72 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
       widenPHIInstruction(&I, UF, VF);
       continue;
     } // End of PHI.
+    case Instruction::GetElementPtr: {
+      // Construct a vector GEP by widening the operands of the scalar GEP as
+      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+      // results in a vector of pointers when at least one operand of the GEP
+      // is vector-typed. Thus, to keep the representation compact, we only use
+      // vector-typed operands for loop-varying values.
+      auto *GEP = cast<GetElementPtrInst>(&I);
+      VectorParts Entry(UF);
+
+      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+        // If we are vectorizing, but the GEP has only loop-invariant operands,
+        // the GEP we build (by only using vector-typed operands for
+        // loop-varying values) would be a scalar pointer. Thus, to ensure we
+        // produce a vector of pointers, we need to either arbitrarily pick an
+        // operand to broadcast, or broadcast a clone of the original GEP.
+        // Here, we broadcast a clone of the original.
+        //
+        // TODO: If at some point we decide to scalarize instructions having
+        //       loop-invariant operands, this special case will no longer be
+        //       required. We would add the scalarization decision to
+        //       collectLoopScalars() and teach getVectorValue() to broadcast
+        //       the lane-zero scalar value.
+        auto *Clone = Builder.Insert(GEP->clone());
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+      } else {
+        // If the GEP has at least one loop-varying operand, we are sure to
+        // produce a vector of pointers. But if we are only unrolling, we want
+        // to produce a scalar GEP for each unroll part. Thus, the GEP we
+        // produce with the code below will be scalar (if VF == 1) or vector
+        // (otherwise). Note that for the unroll-only case, we still maintain
+        // values in the vector mapping with initVector, as we do for other
+        // instructions.
+        for (unsigned Part = 0; Part < UF; ++Part) {
+
+          // The pointer operand of the new GEP. If it's loop-invariant, we
+          // won't broadcast it.
+          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                          ? GEP->getPointerOperand()
+                          : getVectorValue(GEP->getPointerOperand())[Part];
+
+          // Collect all the indices for the new GEP. If any index is
+          // loop-invariant, we won't broadcast it.
+          SmallVector<Value *, 4> Indices;
+          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+            if (OrigLoop->isLoopInvariant(U.get()))
+              Indices.push_back(U.get());
+            else
+              Indices.push_back(getVectorValue(U.get())[Part]);
+          }
+
+          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+          // but it should be a vector, otherwise.
+          auto *NewGEP = GEP->isInBounds()
+                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                             : Builder.CreateGEP(Ptr, Indices);
+          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+                 "NewGEP is not a pointer vector");
+          Entry[Part] = NewGEP;
+        }
+      }
 
+      VectorLoopValueMap.initVector(&I, Entry);
+      addMetadata(Entry, GEP);
+      break;
+    }
     case Instruction::UDiv:
     case Instruction::SDiv:
     case Instruction::SRem:
@@ -5469,46 +5480,158 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
 
-  // We should not collect Scalars more than once per VF. Right now,
-  // this function is called from collectUniformsAndScalars(), which 
-  // already does this check. Collecting Scalars for VF=1 does not make any
-  // sense.
-
+  // We should not collect Scalars more than once per VF. Right now, this
+  // function is called from collectUniformsAndScalars(), which already does
+  // this check. Collecting Scalars for VF=1 does not make any sense.
   assert(VF >= 2 && !Scalars.count(VF) &&
          "This function should not be visited twice for the same VF");
 
-  // If an instruction is uniform after vectorization, it will remain scalar.
-  Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
+  SmallSetVector<Instruction *, 8> Worklist;
+
+  // These sets are used to seed the analysis with pointers used by memory
+  // accesses that will remain scalar.
+  SmallSetVector<Instruction *, 8> ScalarPtrs;
+  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+  // The pointer operands of loads and stores will be scalar as long as the
+  // memory access is not a gather or scatter operation. The value operand of a
+  // store will remain scalar if the store is scalarized.
+  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+    assert(WideningDecision != CM_Unknown &&
+           "Widening decision should be ready at this moment");
+    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+      if (Ptr == Store->getValueOperand())
+        return WideningDecision == CM_Scalarize;
+    assert(Ptr == getPointerOperand(MemAccess) &&
+           "Ptr is neither a value or pointer operand");
+    return WideningDecision != CM_GatherScatter;
+  };
+
+  // A helper that returns true if the given value is a bitcast or
+  // getelementptr instruction contained in the loop.
+  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+            isa<GetElementPtrInst>(V)) &&
+           !TheLoop->isLoopInvariant(V);
+  };
+
+  // A helper that evaluates a memory access's use of a pointer. If the use
+  // will be a scalar use, and the pointer is only used by memory accesses, we
+  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+  // PossibleNonScalarPtrs.
+  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+
+    // We only care about bitcast and getelementptr instructions contained in
+    // the loop.
+    if (!isLoopVaryingBitCastOrGEP(Ptr))
+      return;
 
-  // Collect the getelementptr instructions that will not be vectorized. A
-  // getelementptr instruction is only vectorized if it is used for a legal
-  // gather or scatter operation.
+    // If the pointer has already been identified as scalar (e.g., if it was
+    // also identified as uniform), there's nothing to do.
+    auto *I = cast<Instruction>(Ptr);
+    if (Worklist.count(I))
+      return;
+
+    // If the use of the pointer will be a scalar use, and all users of the
+    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+    // place the pointer in PossibleNonScalarPtrs.
+    if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+          return isa<LoadInst>(U) || isa<StoreInst>(U);
+        }))
+      ScalarPtrs.insert(I);
+    else
+      PossibleNonScalarPtrs.insert(I);
+  };
+
+  // We seed the scalars analysis with three classes of instructions: (1)
+  // instructions marked uniform-after-vectorization, (2) bitcast and
+  // getelementptr instructions used by memory accesses requiring a scalar use,
+  // and (3) pointer induction variables and their update instructions (we
+  // currently only scalarize these).
+  //
+  // (1) Add to the worklist all instructions that have been identified as
+  // uniform-after-vectorization.
+  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+  // (2) Add to the worklist all bitcast and getelementptr instructions used by
+  // memory accesses requiring a scalar use. The pointer operands of loads and
+  // stores will be scalar as long as the memory accesses is not a gather or
+  // scatter operation. The value operand of a store will remain scalar if the
+  // store is scalarized.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        Scalars[VF].insert(GEP);
-        continue;
+      if (auto *Load = dyn_cast<LoadInst>(&I)) {
+        evaluatePtrUse(Load, Load->getPointerOperand());
+      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+        evaluatePtrUse(Store, Store->getPointerOperand());
+        evaluatePtrUse(Store, Store->getValueOperand());
       }
-      auto *Ptr = getPointerOperand(&I);
-      if (!Ptr)
-        continue;
-      auto *GEP = getGEPInstruction(Ptr);
-      if (GEP && getWideningDecision(&I, VF) == CM_GatherScatter)
-        Scalars[VF].erase(GEP);
+    }
+  for (auto *I : ScalarPtrs)
+    if (!PossibleNonScalarPtrs.count(I)) {
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+      Worklist.insert(I);
     }
 
+  // (3) Add to the worklist all pointer induction variables and their update
+  // instructions.
+  //
+  // TODO: Once we are able to vectorize pointer induction variables we should
+  //       no longer insert them into the worklist here.
+  auto *Latch = TheLoop->getLoopLatch();
+  for (auto &Induction : *Legal->getInductionVars()) {
+    auto *Ind = Induction.first;
+    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+      continue;
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+  }
+
+  // Expand the worklist by looking through any bitcasts and getelementptr
+  // instructions we've already identified as scalar. This is similar to the
+  // expansion step in collectLoopUniforms(); however, here we're only
+  // expanding to include additional bitcasts and getelementptr instructions.
+  unsigned Idx = 0;
+  while (Idx != Worklist.size()) {
+    Instruction *Dst = Worklist[Idx++];
+    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+      continue;
+    auto *Src = cast<Instruction>(Dst->getOperand(0));
+    if (all_of(Src->users(), [&](User *U) -> bool {
+          auto *J = cast<Instruction>(U);
+          return !TheLoop->contains(J) || Worklist.count(J) ||
+                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+                  isScalarUse(J, Src));
+        })) {
+      Worklist.insert(Src);
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+    }
+  }
+
   // An induction variable will remain scalar if all users of the induction
   // variable and induction variable update remain scalar.
-  auto *Latch = TheLoop->getLoopLatch();
   for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
+    // We already considered pointer induction variables, so there's no reason
+    // to look at their users again.
+    //
+    // TODO: Once we are able to vectorize pointer induction variables we
+    //       should no longer skip over them here.
+    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+      continue;
+
     // Determine if all users of the induction variable are scalar after
     // vectorization.
     auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Scalars[VF].count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarInd)
       continue;
@@ -5517,15 +5640,19 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
     // scalar after vectorization.
     auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Scalars[VF].count(I);
+      return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarIndUpdate)
       continue;
 
     // The induction variable and its update instruction will remain scalar.
-    Scalars[VF].insert(Ind);
-    Scalars[VF].insert(IndUpdate);
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
   }
+
+  Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
author	Matthew Simpson <mssimpso@codeaurora.org>	2017-04-07 14:15:34 +0000
committer	Matthew Simpson <mssimpso@codeaurora.org>	2017-04-07 14:15:34 +0000
commit	11fe2e9f2b787646bbb4c5e8dc7698abf79761ce (patch)
tree	2f95deeb422bd8f1f77691c001cc2b75face0805 /llvm/lib
parent	9f6a5cd91d3acf3393b0bc8a2f40bc099718129e (diff)
download	bcm5719-llvm-11fe2e9f2b787646bbb4c5e8dc7698abf79761ce.tar.gz bcm5719-llvm-11fe2e9f2b787646bbb4c5e8dc7698abf79761ce.zip