diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 217 |
1 files changed, 210 insertions, 7 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e4cf9281bb0..a7f20051818 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -518,6 +518,10 @@ protected: /// induction variable will first be truncated to the corresponding type. void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr); + /// Returns true if an instruction \p I should be scalarized instead of + /// vectorized for the chosen vectorization factor. + bool shouldScalarizeInstruction(Instruction *I) const; + /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; @@ -1907,6 +1911,15 @@ public: return MinBWs; } + /// \returns True if it is more profitable to scalarize instruction \p I for + /// vectorization factor \p VF. + bool isProfitableToScalarize(Instruction *I, unsigned VF) const { + auto Scalars = InstsToScalarize.find(VF); + assert(Scalars != InstsToScalarize.end() && + "VF not yet analyzed for scalarization profitability"); + return Scalars->second.count(I); + } + private: /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1949,6 +1962,29 @@ private: /// to this type. MapVector<Instruction *, uint64_t> MinBWs; + /// A type representing the costs for instructions if they were to be + /// scalarized rather than vectorized. The entries are Instruction-Cost + /// pairs. + typedef DenseMap<Instruction *, unsigned> ScalarCostsTy; + + /// A map holding scalar costs for different vectorization factors. The + /// presence of a cost for an instruction in the mapping indicates that the + /// instruction will be scalarized when vectorizing with the associated + /// vectorization factor. The entries are VF-ScalarCostTy pairs. + DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; + + /// Returns the expected difference in cost from scalarizing the expression + /// feeding a predicated instruction \p PredInst. The instructions to + /// scalarize and their scalar costs are collected in \p ScalarCosts. A + /// non-negative return value implies the expression will be scalarized. + /// Currently, only single-use chains are considered for scalarization. + int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + unsigned VF); + + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2183,12 +2219,17 @@ void InnerLoopVectorizer::createVectorIntInductionPHI( VecInd->addIncoming(LastInduction, LoopVectorLatch); } +bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { + return Legal->isScalarAfterVectorization(I) || + Cost->isProfitableToScalarize(I, VF); +} + bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { - if (Legal->isScalarAfterVectorization(IV)) + if (shouldScalarizeInstruction(IV)) return true; auto isScalarInst = [&](User *U) -> bool { auto *I = cast<Instruction>(U); - return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I)); + return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); }; return any_of(IV->users(), isScalarInst); } @@ -2229,7 +2270,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) { // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (VF > 1 && IV->getType() == Induction->getType() && Step && - !Legal->isScalarAfterVectorization(EntryVal)) { + !shouldScalarizeInstruction(EntryVal)) { createVectorIntInductionPHI(ID, EntryVal); VectorizedIV = true; } @@ -4648,10 +4689,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; // Scalarize instructions that should remain scalar after vectorization. - if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) || + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && - Legal->isScalarAfterVectorization(&I)) { - scalarizeInstruction(&I); + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); continue; } @@ -6124,6 +6166,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); Factor.Width = UserVF; + collectInstsToScalarize(UserVF); return Factor; } @@ -6530,10 +6573,160 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { return RUs; } +void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { + + // If we aren't vectorizing the loop, or if we've already collected the + // instructions to scalarize, there's nothing to do. Collection may already + // have occurred if we have a user-selected VF and are now computing the + // expected cost for interleaving. + if (VF < 2 || InstsToScalarize.count(VF)) + return; + + // Initialize a mapping for VF in InstsToScalalarize. If we find that it's + // not profitable to scalarize any instructions, the presence of VF in the + // map will indicate that we've analyzed it already. + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + + // Find all the instructions that are scalar with predication in the loop and + // determine if it would be better to not if-convert the blocks they are in. + // If so, we also record the instructions to scalarize. + for (BasicBlock *BB : TheLoop->blocks()) { + if (!Legal->blockNeedsPredication(BB)) + continue; + for (Instruction &I : *BB) + if (Legal->isScalarWithPredication(&I)) { + ScalarCostsTy ScalarCosts; + if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + } + } +} + +int LoopVectorizationCostModel::computePredInstDiscount( + Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, + unsigned VF) { + + assert(!Legal->isUniformAfterVectorization(PredInst) && + "Instruction marked uniform-after-vectorization will be predicated"); + + // Initialize the discount to zero, meaning that the scalar version and the + // vector version cost the same. + int Discount = 0; + + // Holds instructions to analyze. The instructions we visit are mapped in + // ScalarCosts. Those instructions are the ones that would be scalarized if + // we find that the scalar version costs less. + SmallVector<Instruction *, 8> Worklist; + + // Returns true if the given instruction can be scalarized. + auto canBeScalarized = [&](Instruction *I) -> bool { + + // We only attempt to scalarize instructions forming a single-use chain + // from the original predicated block that would otherwise be vectorized. + // Although not strictly necessary, we give up on instructions we know will + // already be scalar to avoid traversing chains that are unlikely to be + // beneficial. + if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || + Legal->isScalarAfterVectorization(I)) + return false; + + // If the instruction is scalar with predication, it will be analyzed + // separately. We ignore it within the context of PredInst. + if (Legal->isScalarWithPredication(I)) + return false; + + // If any of the instruction's operands are uniform after vectorization, + // the instruction cannot be scalarized. This prevents, for example, a + // masked load from being scalarized. + // + // We assume we will only emit a value for lane zero of an instruction + // marked uniform after vectorization, rather than VF identical values. + // Thus, if we scalarize an instruction that uses a uniform, we would + // create uses of values corresponding to the lanes we aren't emitting code + // for. This behavior can be changed by allowing getScalarValue to clone + // the lane zero values for uniforms rather than asserting. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) + if (Legal->isUniformAfterVectorization(J)) + return false; + + // Otherwise, we can scalarize the instruction. + return true; + }; + + // Returns true if an operand that cannot be scalarized must be extracted + // from a vector. We will account for this scalarization overhead below. Note + // that the non-void predicated instructions are placed in their own blocks, + // and their return values are inserted into vectors. Thus, an extract would + // still be required. + auto needsExtract = [&](Instruction *I) -> bool { + return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I); + }; + + // Compute the expected cost discount from scalarizing the entire expression + // feeding the predicated instruction. We currently only consider expressions + // that are single-use instruction chains. + Worklist.push_back(PredInst); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + + // If we've already analyzed the instruction, there's nothing to do. + if (ScalarCosts.count(I)) + continue; + + // Compute the cost of the vector instruction. Note that this cost already + // includes the scalarization overhead of the predicated instruction. + unsigned VectorCost = getInstructionCost(I, VF).first; + + // Compute the cost of the scalarized instruction. This cost is the cost of + // the instruction as if it wasn't if-converted and instead remained in the + // predicated block. We will scale this cost by block probability after + // computing the scalarization overhead. + unsigned ScalarCost = VF * getInstructionCost(I, 1).first; + + // Compute the scalarization overhead of needed insertelement instructions + // and phi nodes. + if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true, + false, TTI); + ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); + } + + // Compute the scalarization overhead of needed extractelement + // instructions. For each of the instruction's operands, if the operand can + // be scalarized, add it to the worklist; otherwise, account for the + // overhead. + for (Use &U : I->operands()) + if (auto *J = dyn_cast<Instruction>(U.get())) { + assert(VectorType::isValidElementType(J->getType()) && + "Instruction has non-scalar type"); + if (canBeScalarized(J)) + Worklist.push_back(J); + else if (needsExtract(J)) + ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF), + false, true, TTI); + } + + // Scale the total scalar cost by block probability. + ScalarCost /= getReciprocalPredBlockProb(); + + // Compute the discount. A non-negative discount means the vector version + // of the instruction costs more, and scalarizing would be beneficial. + Discount += VectorCost - ScalarCost; + ScalarCosts[I] = ScalarCost; + } + + return Discount; +} + LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + collectInstsToScalarize(VF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -6641,6 +6834,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (Legal->isUniformAfterVectorization(I)) VF = 1; + if (VF > 1 && isProfitableToScalarize(I, VF)) + return VectorizationCostTy(InstsToScalarize[VF][I], false); + Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); @@ -7007,7 +7203,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - // Insert values known to be scalar into VecValuesToIgnore. + // Insert values known to be scalar into VecValuesToIgnore. This is a + // conservative estimation of the values that will later be scalarized. + // + // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may + // still be scalarized. For example, we may find an instruction to be + // more profitable for a given vectorization factor if it were to be + // scalarized. But at this point, we haven't yet computed the + // vectorization factor. for (auto *BB : TheLoop->getBlocks()) for (auto &I : *BB) if (Legal->isScalarAfterVectorization(&I)) |