Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/IR/IRBuilder.cpp                        |  85
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  | 280
2 files changed, 254 insertions, 111 deletions
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 44741293633..217577e090e 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -201,18 +201,17 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) { return createCallHelper(FnAssume, Ops, this); } -/// Create a call to a Masked Load intrinsic. -/// Ptr - the base pointer for the load -/// Align - alignment of the source location -/// Mask - an vector of booleans which indicates what vector lanes should -/// be accessed in memory -/// PassThru - a pass-through value that is used to fill the masked-off lanes -/// of the result -/// Name - name of the result variable +/// \brief Create a call to a Masked Load intrinsic. +/// \p Ptr - base pointer for the load +/// \p Align - alignment of the source location +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory +/// \p PassThru - pass-through value that is used to fill the masked-off lanes +/// of the result +/// \p Name - name of the result variable CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name) { - assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type"); // DataTy is the overloaded type Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType(); assert(DataTy->isVectorTy() && "Ptr should point to a vector"); @@ -222,12 +221,12 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align, return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name); } -/// Create a call to a Masked Store intrinsic. -/// Val - the data to be stored, -/// Ptr - the base pointer for the store -/// Align - alignment of the destination location -/// Mask - an vector of booleans which indicates what vector lanes should -/// be accessed in memory +/// \brief Create a call to a Masked Store intrinsic. +/// \p Val - data to be stored, +/// \p Ptr - base pointer for the store +/// \p Align - alignment of the destination location +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask) { Value *Ops[] = { Val, Ptr, getInt32(Align), Mask }; @@ -247,6 +246,62 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, return createCallHelper(TheFn, Ops, this, Name); } +/// \brief Create a call to a Masked Gather intrinsic. +/// \p Ptrs - vector of pointers for loading +/// \p Align - alignment for one element +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory +/// \p PassThru - pass-through value that is used to fill the masked-off lanes +/// of the result +/// \p Name - name of the result variable +CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align, + Value *Mask, Value *PassThru, + const Twine& Name) { + auto PtrsTy = cast<VectorType>(Ptrs->getType()); + auto PtrTy = cast<PointerType>(PtrsTy->getElementType()); + unsigned NumElts = PtrsTy->getVectorNumElements(); + Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts); + + if (!Mask) + Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context), + NumElts)); + + Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)}; + + // We specify only one type when we create this intrinsic. Types of other + // arguments are derived from this type. 
+ return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name); +} + +/// \brief Create a call to a Masked Scatter intrinsic. +/// \p Data - data to be stored, +/// \p Ptrs - the vector of pointers, where the \p Data elements should be +/// stored +/// \p Align - alignment for one element +/// \p Mask - vector of booleans which indicates what vector lanes should +/// be accessed in memory +CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, + unsigned Align, Value *Mask) { + auto PtrsTy = cast<VectorType>(Ptrs->getType()); + auto DataTy = cast<VectorType>(Data->getType()); + + auto PtrTy = cast<PointerType>(PtrsTy->getElementType()); + unsigned NumElts = PtrsTy->getVectorNumElements(); + + assert(NumElts == DataTy->getVectorNumElements() && + PtrTy->getElementType() == DataTy->getElementType() && + "Incompatible pointer and data types"); + + if (!Mask) + Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context), + NumElts)); + Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask}; + + // We specify only one type when we create this intrinsic. Types of other + // arguments are derived from this type. + return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy); +} + template <typename T0, typename T1, typename T2, typename T3> static std::vector<Value *> getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a32a0ad0ff5..4648f264b5f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1282,6 +1282,17 @@ public: bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } + /// Returns true if the target machine supports masked scatter operation + /// for the given \p DataType. + bool isLegalMaskedScatter(Type *DataType) { + return TTI->isLegalMaskedScatter(DataType); + } + /// Returns true if the target machine supports masked gather operation + /// for the given \p DataType. + bool isLegalMaskedGather(Type *DataType) { + return TTI->isLegalMaskedGather(DataType); + } + /// Returns true if vector representation of the instruction \p I /// requires mask. bool isMaskRequired(const Instruction* I) { @@ -2379,70 +2390,107 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); - // If the pointer is loop invariant or if it is non-consecutive, - // scalarize the load. + // If the pointer is loop invariant scalarize the load. + if (LI && Legal->isUniform(Ptr)) + return scalarizeInstruction(Instr); + + // If the pointer is non-consecutive and gather/scatter is not supported + // scalarize the instruction. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; - bool UniformLoad = LI && Legal->isUniform(Ptr); - if (!ConsecutiveStride || UniformLoad) + bool CreateGatherScatter = !ConsecutiveStride && + ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) || + (SI && Legal->isLegalMaskedScatter(ScalarDataTy))); + + if (!ConsecutiveStride && !CreateGatherScatter) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); VectorParts &Entry = WidenMap.get(Instr); + VectorParts VectorGep; // Handle consecutive loads/stores. 
GetElementPtrInst *Gep = getGEPInstruction(Ptr); - if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { - setDebugLocFromInst(Builder, Gep); - Value *PtrOperand = Gep->getPointerOperand(); - Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; - FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(0, FirstBasePtr); - Gep2->setName("gep.indvar.base"); - Ptr = Builder.Insert(Gep2); - } else if (Gep) { - setDebugLocFromInst(Builder, Gep); - assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), - OrigLoop) && - "Base ptr must be invariant"); - - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - - for (unsigned i = 0; i < NumOperands; ++i) { - Value *GepOperand = Gep->getOperand(i); - Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); - - // Update last index or loop invariant instruction anchored in loop. - if (i == InductionOperand || - (GepOperandInst && OrigLoop->contains(GepOperandInst))) { - assert((i == InductionOperand || - PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), - OrigLoop)) && - "Must be last index or loop invariant"); - - VectorParts &GEPParts = getVectorValue(GepOperand); - Value *Index = GEPParts[0]; - Index = Builder.CreateExtractElement(Index, Zero); - Gep2->setOperand(i, Index); - Gep2->setName("gep.indvar.idx"); + if (ConsecutiveStride) { + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + setDebugLocFromInst(Builder, Gep); + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + setDebugLocFromInst(Builder, Gep); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + + for (unsigned i = 0; i < NumOperands; ++i) { + Value *GepOperand = Gep->getOperand(i); + Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); + + // Update last index or loop invariant instruction anchored in loop. 
+ if (i == InductionOperand || + (GepOperandInst && OrigLoop->contains(GepOperandInst))) { + assert((i == InductionOperand || + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && + "Must be last index or loop invariant"); + + VectorParts &GEPParts = getVectorValue(GepOperand); + Value *Index = GEPParts[0]; + Index = Builder.CreateExtractElement(Index, Zero); + Gep2->setOperand(i, Index); + Gep2->setName("gep.indvar.idx"); + } + } + Ptr = Builder.Insert(Gep2); + } else { // No GEP + // Use the induction element ptr. + assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } + } else { + // At this point we should vector version of GEP for Gather or Scatter + assert(CreateGatherScatter && "The instruction should be scalarized"); + if (Gep) { + SmallVector<VectorParts, 4> OpsV; + // Vectorizing GEP, across UF parts, we want to keep each loop-invariant + // base or index of GEP scalar + for (Value *Op : Gep->operands()) { + if (PSE.getSE()->isLoopInvariant(PSE.getSCEV(Op), OrigLoop)) + OpsV.push_back(VectorParts(UF, Op)); + else + OpsV.push_back(getVectorValue(Op)); + } + + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector<Value*, 4> Ops; + Value *GEPBasePtr = OpsV[0][Part]; + for (unsigned i = 1; i < Gep->getNumOperands(); i++) + Ops.push_back(OpsV[i][Part]); + Value *NewGep = Builder.CreateGEP(nullptr, GEPBasePtr, Ops, + "VectorGep"); + assert(NewGep->getType()->isVectorTy() && "Expected vector GEP"); + NewGep = Builder.CreateBitCast(NewGep, + VectorType::get(Ptr->getType(), VF)); + VectorGep.push_back(NewGep); + } + } else + VectorGep = getVectorValue(Ptr); } - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); - setDebugLocFromInst(Builder, Ptr); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -2455,30 +2503,37 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts StoredVal = getVectorValue(SI->getValueOperand()); for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = + Instruction *NewSI = nullptr; + if (CreateGatherScatter) { + Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr; + NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part], + Alignment, MaskPart); + } else { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - if (Reverse) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); - } + if (Reverse) { + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. 
+ StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); + } - Value *VecPtr = Builder.CreateBitCast(PartPtr, - DataTy->getPointerTo(AddressSpace)); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); - Instruction *NewSI; - if (Legal->isMaskRequired(SI)) - NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, - Mask[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + if (Legal->isMaskRequired(SI)) + NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + Mask[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, + Alignment); + } propagateMetadata(NewSI, SI); } return; @@ -2488,29 +2543,36 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { assert(LI && "Must have a load instruction"); setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = + Instruction* NewLI; + if (CreateGatherScatter) { + Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr; + NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, + MaskPart, 0, "wide.masked.gather"); + Entry[Part] = NewLI; + } else { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide load needs to start at the last vector element. - PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); - Mask[Part] = reverseVector(Mask[Part]); - } + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide load needs to start at the last vector element. + PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); + } - Instruction* NewLI; - Value *VecPtr = Builder.CreateBitCast(PartPtr, - DataTy->getPointerTo(AddressSpace)); - if (Legal->isMaskRequired(LI)) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); - else - NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + Value *VecPtr = Builder.CreateBitCast(PartPtr, + DataTy->getPointerTo(AddressSpace)); + if (Legal->isMaskRequired(LI)) + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + UndefValue::get(DataTy), + "wide.masked.load"); + else + NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; + } propagateMetadata(NewLI, LI); - Entry[Part] = Reverse ? 
reverseVector(NewLI) : NewLI; } } @@ -4520,7 +4582,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (!LI) return false; if (!SafePtrs.count(LI->getPointerOperand())) { - if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { + if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) || + isLegalMaskedGather(LI->getType())) { MaskedOp.insert(LI); continue; } @@ -4545,7 +4608,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), - SI->getPointerOperand()); + SI->getPointerOperand()) || + isLegalMaskedScatter(SI->getValueOperand()->getType()); if (isLegalMaskedOp) { --NumPredStores; MaskedOp.insert(SI); @@ -5281,6 +5345,19 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } +/// \brief Check if the load/store instruction \p I may be translated into +/// gather/scatter during vectorization. +/// +/// Pointer \p Ptr specifies address in memory for the given scalar memory +/// instruction. We need it to retrieve data type. +/// Using gather/scatter is possible when it is supported by target. +static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr, + LoopVectorizationLegality *Legal) { + Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType(); + return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) || + (isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy)); +} + /// \brief Check whether the address computation for a non-consecutive memory /// access looks like an unlikely candidate for being merged into the indexing /// mode. @@ -5500,11 +5577,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool UseGatherOrScatter = (ConsecutiveStride == 0) && + isGatherOrScatterLegal(I, Ptr, Legal); + bool Reverse = ConsecutiveStride < 0; const DataLayout &DL = I->getModule()->getDataLayout(); unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + if ((!ConsecutiveStride && !UseGatherOrScatter) || + ScalarAllocatedSize != VectorElementSize) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; @@ -5528,8 +5609,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return Cost; } - // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); + if (UseGatherOrScatter) { + assert(ConsecutiveStride == 0 && + "Gather/Scatter are not used for consecutive stride"); + return Cost + + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(I), Alignment); + } + // Wide load/stores. if (Legal->isMaskRequired(I)) Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); |
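For context, here is a minimal sketch of how the two new IRBuilder helpers might be driven from client code. Only the `CreateMaskedGather`/`CreateMaskedScatter` calls come from the patch above; the helper function, its name, and the assumption that `F` already has an entry block plus two `<4 x i32*>` arguments are hypothetical scaffolding for illustration.

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: assumes F already has an entry block and two
// <4 x i32*> arguments (a vector of source pointers and a vector of
// destination pointers).
static void emitGatherScatter(Function &F) {
  IRBuilder<> Builder(&F.getEntryBlock());

  auto ArgIt = F.arg_begin();
  Value *SrcPtrs = &*ArgIt++; // <4 x i32*> addresses to load from
  Value *DstPtrs = &*ArgIt;   // <4 x i32*> addresses to store to

  // Gather four i32 values; a null mask is expanded to an all-ones mask
  // inside CreateMaskedGather, i.e. every lane is loaded.
  Value *Loaded = Builder.CreateMaskedGather(SrcPtrs, /*Align=*/4,
                                             /*Mask=*/nullptr,
                                             /*PassThru=*/nullptr, "gathered");

  // Scatter the gathered vector to the destination pointers, again unmasked.
  Builder.CreateMaskedScatter(Loaded, DstPtrs, /*Align=*/4, /*Mask=*/nullptr);
}
```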
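On the LoopVectorize.cpp side, the change lets the vectorizer widen non-consecutive, non-uniform memory accesses into `llvm.masked.gather`/`llvm.masked.scatter` calls on a vector GEP instead of scalarizing them, provided TTI reports the operation as legal for the element type (for example on targets with hardware gather/scatter support). The sketch below shows the kind of source loops this targets; the function and array names are illustrative, and actual vectorization still depends on the usual legality and cost-model checks.

```cpp
// Indexed load: Table[Idx[i]] is neither consecutive nor uniform, so the
// vectorizer previously scalarized it. With masked-gather support it can
// be widened into one llvm.masked.gather per unrolled part.
void gather_example(int *Out, const int *Table, const int *Idx, int N) {
  for (int i = 0; i < N; ++i)
    Out[i] = Table[Idx[i]];
}

// Indexed store: Out[Idx[i]] is non-consecutive, so it maps to
// llvm.masked.scatter when the target supports it.
void scatter_example(int *Out, const int *Idx, const int *In, int N) {
  for (int i = 0; i < N; ++i)
    Out[Idx[i]] = In[i];
}
```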