diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 38 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 159 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 87 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 4 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp | 11 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h | 3 |
6 files changed, 287 insertions, 15 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f23347e9cf6..b77108d598f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -516,6 +516,18 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { return false; } + // Check whether we are able to set up outer loop induction. + if (!setupOuterLoopInductions()) { + LLVM_DEBUG( + dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n"); + ORE->emit(createMissedAnalysis("UnsupportedPhi") + << "Unsupported outer loop Phi(s)"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + return Result; } @@ -571,6 +583,32 @@ void LoopVectorizationLegality::addInductionPhi( LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n"); } +bool LoopVectorizationLegality::setupOuterLoopInductions() { + BasicBlock *Header = TheLoop->getHeader(); + + // Returns true if a given Phi is a supported induction. + auto isSupportedPhi = [&](PHINode &Phi) -> bool { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) && + ID.getKind() == InductionDescriptor::IK_IntInduction) { + addInductionPhi(&Phi, ID, AllowedExit); + return true; + } else { + // Bail out for any Phi in the outer loop header that is not a supported + // induction. + LLVM_DEBUG( + dbgs() + << "LV: Found unsupported PHI for outer loop vectorization.\n"); + return false; + } + }; + + if (llvm::all_of(Header->phis(), isSupportedPhi)) + return true; + else + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *Header = TheLoop->getHeader(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e9f46377280..d5708d0c8f0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,6 +58,7 @@ #include "LoopVectorizationPlanner.h" #include "VPRecipeBuilder.h" #include "VPlanHCFGBuilder.h" +#include "VPlanHCFGTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -234,7 +235,7 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); -static cl::opt<bool> EnableVPlanNativePath( +cl::opt<bool> EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); @@ -419,6 +420,9 @@ public: /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// Fix the non-induction PHIs in the OrigPHIsToFix vector. + void fixNonInductionPHIs(void); + protected: friend class LoopVectorizationPlanner; @@ -686,6 +690,10 @@ protected: // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; + + // Vector of original scalar PHIs whose corresponding widened PHIs need to be + // fixed up at the end of vector code generation. + SmallVector<PHINode *, 8> OrigPHIsToFix; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -888,6 +896,12 @@ public: /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return false; + auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -898,6 +912,12 @@ public: bool isUniformAfterVectorization(Instruction *I, unsigned VF) const { if (VF == 1) return true; + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return false; + auto UniformsPerVF = Uniforms.find(VF); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); @@ -908,6 +928,12 @@ public: bool isScalarAfterVectorization(Instruction *I, unsigned VF) const { if (VF == 1) return true; + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return false; + auto ScalarsPerVF = Scalars.find(VF); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); @@ -962,6 +988,12 @@ public: /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, unsigned VF) { assert(VF >= 2 && "Expected VF >=2"); + + // Cost model is not run in the VPlan-native path - return conservative + // result until this changes. + if (EnableVPlanNativePath) + return CM_GatherScatter; + std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) @@ -1397,8 +1429,16 @@ struct LoopVectorize : public FunctionPass { AU.addRequired<LoopAccessLegacyAnalysis>(); AU.addRequired<DemandedBitsWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); + + // We currently do not preserve loopinfo/dominator analyses with outer loop + // vectorization. Until this is addressed, mark these analyses as preserved + // only for non-VPlan-native path. + // TODO: Preserve Loop and Dominator analyses for VPlan-native path. + if (!EnableVPlanNativePath) { + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + AU.addPreserved<BasicAAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -1749,8 +1789,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(!V->getType()->isVectorTy() && "Can't widen a vector"); assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - // If we have a stride that is replaced by one, do it here. - if (Legal->hasStride(V)) + // If we have a stride that is replaced by one, do it here. Defer this for + // the VPlan-native path until we start running Legal checks in that path. + if (!EnableVPlanNativePath && Legal->hasStride(V)) V = ConstantInt::get(V->getType(), 1); // If we have a vector mapped to this value, return it. @@ -2416,6 +2457,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { } void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { + // VPlan-native path does not do any analysis for runtime checks currently. + if (EnableVPlanNativePath) + return; + BasicBlock *BB = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the @@ -3060,6 +3105,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() { if (VF > 1) truncateToMinimalBitwidths(); + // Fix widened non-induction PHIs by setting up the PHI operands. + if (OrigPHIsToFix.size()) { + assert(EnableVPlanNativePath && + "Unexpected non-induction PHIs for fixup in non VPlan-native path"); + fixNonInductionPHIs(); + } + // At this point every instruction in the original loop is widened to a // vector form. Now we need to fix the recurrences in the loop. These PHI // nodes are currently empty because we did not want to introduce cycles. @@ -3532,12 +3584,62 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } +void InnerLoopVectorizer::fixNonInductionPHIs() { + for (PHINode *OrigPhi : OrigPHIsToFix) { + PHINode *NewPhi = + cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); + unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); + + SmallVector<BasicBlock *, 2> ScalarBBPredecessors( + predecessors(OrigPhi->getParent())); + SmallVector<BasicBlock *, 2> VectorBBPredecessors( + predecessors(NewPhi->getParent())); + assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && + "Scalar and Vector BB should have the same number of predecessors"); + + // The insertion point in Builder may be invalidated by the time we get + // here. Force the Builder insertion point to something valid so that we do + // not run into issues during insertion point restore in + // getOrCreateVectorValue calls below. + Builder.SetInsertPoint(NewPhi); + + // The predecessor order is preserved and we can rely on mapping between + // scalar and vector block predecessors. + for (unsigned i = 0; i < NumIncomingValues; ++i) { + BasicBlock *NewPredBB = VectorBBPredecessors[i]; + + // When looking up the new scalar/vector values to fix up, use incoming + // values from original phi. + Value *ScIncV = + OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); + + // Scalar incoming value may need a broadcast + Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); + NewPhi->addIncoming(NewIncV, NewPredBB); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { + PHINode *P = cast<PHINode>(PN); + if (EnableVPlanNativePath) { + // Currently we enter here in the VPlan-native path for non-induction + // PHIs where all control flow is uniform. We simply widen these PHIs. + // Create a vector phi with no operands - the vector phi operands will be + // set at the end of vector code generation. + Type *VecTy = + (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); + Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); + VectorLoopValueMap.setVectorValue(P, 0, VecPhi); + OrigPHIsToFix.push_back(P); + + return; + } + assert(PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"); - PHINode *P = cast<PHINode>(PN); // In order to support recurrences we need to be able to vectorize Phi nodes. // Phi nodes have cycles, so we need to vectorize them in two stages. This is // stage #1: We create a new vector PHI node with no incoming edges. We'll use @@ -3893,6 +3995,10 @@ void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); + // DT is not kept up-to-date for outer loop vectorization + if (EnableVPlanNativePath) + return; + // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); @@ -6527,6 +6633,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) { VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildHierarchicalCFG(); + SmallPtrSet<Instruction *, 1> DeadInstructions; + VPlanHCFGTransforms::VPInstructionsToVPRecipes( + Plan, Legal->getInductionVars(), DeadInstructions); + + for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) + Plan->addVF(VF); + return Plan; } @@ -6728,11 +6841,26 @@ static bool processLoopInVPlanNativePath( Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); // Plan how to best vectorize, return the best VF and its cost. - LVP.planInVPlanNativePath(OptForSize, UserVF); + VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); - // Returning false. We are currently not generating vector code in the VPlan - // native path. - return false; + // If we are stress testing VPlan builds, do not attempt to generate vector + // code. + if (VPlanBuildStressTest) + return false; + + LVP.setBestPlan(VF.Width, 1); + + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL, + &CM); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" + << L->getHeader()->getParent()->getName() << "\"\n"); + LVP.executePlan(LB, DT); + + // Mark the loop as already vectorized to avoid vectorizing again. + Hints.setAlreadyVectorized(); + + LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); + return true; } bool LoopVectorizePass::processLoop(Loop *L) { @@ -7123,8 +7251,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserve<LoopAnalysis>(); - PA.preserve<DominatorTreeAnalysis>(); + + // We currently do not preserve loopinfo/dominator analyses with outer loop + // vectorization. Until this is addressed, mark these analyses as preserved + // only for non-VPlan-native path. + // TODO: Preserve Loop and Dominator analyses for VPlan-native path. + if (!EnableVPlanNativePath) { + PA.preserve<LoopAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + } PA.preserve<BasicAA>(); PA.preserve<GlobalsAA>(); return PA; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 0780e70809d..511b31a28c3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -44,6 +44,7 @@ #include <vector> using namespace llvm; +extern cl::opt<bool> EnableVPlanNativePath; #define DEBUG_TYPE "vplan" @@ -124,6 +125,20 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock(); auto &PredVPSuccessors = PredVPBB->getSuccessors(); BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB]; + + // In outer loop vectorization scenario, the predecessor BBlock may not yet + // be visited(backedge). Mark the VPBasicBlock for fixup at the end of + // vectorization. We do not encounter this case in inner loop vectorization + // as we start out by building a loop skeleton with the vector loop header + // and latch blocks. As a result, we never enter this function for the + // header block in the non VPlan-native path. + if (!PredBB) { + assert(EnableVPlanNativePath && + "Unexpected null predecessor in non VPlan-native path"); + CFG.VPBBsToFix.push_back(PredVPBB); + continue; + } + assert(PredBB && "Predecessor basic-block not found building successor."); auto *PredBBTerminator = PredBB->getTerminator(); LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n'); @@ -185,6 +200,35 @@ void VPBasicBlock::execute(VPTransformState *State) { for (VPRecipeBase &Recipe : Recipes) Recipe.execute(*State); + VPValue *CBV; + if (EnableVPlanNativePath && (CBV = getCondBit())) { + Value *IRCBV = CBV->getUnderlyingValue(); + assert(IRCBV && "Unexpected null underlying value for condition bit"); + + // Delete the condition bit at this point - it should be no longer needed. + delete CBV; + setCondBit(nullptr); + + // Condition bit value in a VPBasicBlock is used as the branch selector. In + // the VPlan-native path case, since all branches are uniform we generate a + // branch instruction using the condition value from vector lane 0 and dummy + // successors. The successors are fixed later when the successor blocks are + // visited. + Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0); + NewCond = State->Builder.CreateExtractElement(NewCond, + State->Builder.getInt32(0)); + + // Replace the temporary unreachable terminator with the new conditional + // branch. + auto *CurrentTerminator = NewBB->getTerminator(); + assert(isa<UnreachableInst>(CurrentTerminator) && + "Expected to replace unreachable terminator with conditional " + "branch."); + auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); + } + LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB); } @@ -194,6 +238,20 @@ void VPRegionBlock::execute(VPTransformState *State) { if (!isReplicator()) { // Visit the VPBlocks connected to "this", starting from it. for (VPBlockBase *Block : RPOT) { + if (EnableVPlanNativePath) { + // The inner loop vectorization path does not represent loop preheader + // and exit blocks as part of the VPlan. In the VPlan-native path, skip + // vectorizing loop preheader block. In future, we may replace this + // check with the check for loop preheader. + if (Block->getNumPredecessors() == 0) + continue; + + // Skip vectorizing loop exit block. In future, we may replace this + // check with the check for loop exit. + if (Block->getNumSuccessors() == 0) + continue; + } + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); Block->execute(State); } @@ -319,11 +377,32 @@ void VPlan::execute(VPTransformState *State) { for (VPBlockBase *Block : depth_first(Entry)) Block->execute(State); + // Setup branch terminator successors for VPBBs in VPBBsToFix based on + // VPBB's successors. + for (auto VPBB : State->CFG.VPBBsToFix) { + assert(EnableVPlanNativePath && + "Unexpected VPBBsToFix in non VPlan-native path"); + BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB]; + assert(BB && "Unexpected null basic block for VPBB"); + + unsigned Idx = 0; + auto *BBTerminator = BB->getTerminator(); + + for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) { + VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock(); + BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]); + ++Idx; + } + } + // 3. Merge the temporary latch created with the last basic-block filled. BasicBlock *LastBB = State->CFG.PrevBB; // Connect LastBB to VectorLatchBB to facilitate their merge. - assert(isa<UnreachableInst>(LastBB->getTerminator()) && - "Expected VPlan CFG to terminate with unreachable"); + assert((EnableVPlanNativePath || + isa<UnreachableInst>(LastBB->getTerminator())) && + "Expected InnerLoop VPlan CFG to terminate with unreachable"); + assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) && + "Expected VPlan CFG to terminate with branch in NativePath"); LastBB->getTerminator()->eraseFromParent(); BranchInst::Create(VectorLatchBB, LastBB); @@ -333,7 +412,9 @@ void VPlan::execute(VPTransformState *State) { assert(Merged && "Could not merge last basic block with latch."); VectorLatchBB = LastBB; - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + // We do not attempt to preserve DT for outer loop vectorization currently. + if (!EnableVPlanNativePath) + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); } void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 883e6f52369..8d544034e94 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -293,6 +293,10 @@ struct VPTransformState { /// of replication, maps the BasicBlock of the last replica created. SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; + /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed + /// up at the end of vector code generation. + SmallVector<VPBasicBlock *, 8> VPBBsToFix; + CFGState() = default; } CFG; diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp index e3cbab077e6..910d0f509ce 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -24,6 +24,17 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry()); ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry()); + + // Condition bit VPValues get deleted during transformation to VPRecipes. + // Create new VPValues and save away as condition bits. These will be deleted + // after finalizing the vector IR basic blocks. + for (VPBlockBase *Base : RPOT) { + VPBasicBlock *VPBB = Base->getEntryBasicBlock(); + if (auto *CondBit = VPBB->getCondBit()) { + auto *NCondBit = new VPValue(CondBit->getUnderlyingValue()); + VPBB->setCondBit(NCondBit); + } + } for (VPBlockBase *Base : RPOT) { // Do not widen instructions in pre-header and exit blocks. if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0) diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 08f142915b4..a81044a5e1b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,6 +38,9 @@ class VPUser; // and live-outs which the VPlan will need to fix accordingly. class VPValue { friend class VPBuilder; + friend class VPlanHCFGTransforms; + friend class VPBasicBlock; + private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). |