author    | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2014-11-23 08:07:43 +0000
committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2014-11-23 08:07:43 +0000
commit    | 9e5089a9384c146c0d277b4962c2345e92ff4a88 (patch)
tree      | 3a01ebfa31ad635c396e816fedc8dce9f24d38e0 /llvm/lib/Transforms/Vectorize
parent    | 2a495975ed265fd249044afb83e4416df68410b9 (diff)
Masked Vector Load and Store Intrinsics.
Introduced new target-independent intrinsics to support masked vector loads and stores. The loop vectorizer now optimizes loops containing conditional memory accesses by generating these intrinsics for targets that support them (currently AVX2 and AVX-512); it queries the target about the availability of masked vector loads and stores.
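For illustration, a hypothetical loop of the kind this change targets (example mine, not from the patch): the store is guarded by a condition, so a plain wide store would be unsafe, but with masked stores the vectorizer can emit the guard as the per-lane mask instead of scalarizing.

// Hypothetical example (not from the patch). The guarded store would
// previously be scalarized; with masked stores the whole body can be
// vectorized, with (Trigger[i] > 0) becoming the <N x i1> mask operand.
void foo(int *A, const int *B, const int *Trigger, int N) {
  for (int i = 0; i < N; ++i)
    if (Trigger[i] > 0)
      A[i] = B[i] + 5;
}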
Added SDNodes for masked operations and lowering patterns for the X86 code generator.
Examples:
declare <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32> %passthru, i32 4 /* align */, <16 x i1> %mask)
declare void @llvm.masked.store.v8f64(i8* %addr, <8 x double> %value, i32 4, <8 x i1> %mask)
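Per-lane semantics of the intrinsics above, sketched as scalar C++ (an illustrative model, not actual API; the helper names are mine): masked-off lanes of a load never touch memory and yield the passthru element, and masked-off lanes of a store leave memory unchanged.

// Illustrative scalar model of the masked intrinsics (hypothetical helpers):
void maskedLoadModel(const int *Addr, const int *Passthru, const bool *Mask,
                     int *Result, int Lanes) {
  for (int i = 0; i < Lanes; ++i)
    Result[i] = Mask[i] ? Addr[i] : Passthru[i]; // masked lanes never read
}
void maskedStoreModel(double *Addr, const double *Value, const bool *Mask,
                      int Lanes) {
  for (int i = 0; i < Lanes; ++i)
    if (Mask[i])
      Addr[i] = Value[i]; // masked lanes leave memory untouched
}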
A scalarizer for other targets (those without AVX2/AVX-512) will be added in a separate patch.
http://reviews.llvm.org/D6191
llvm-svn: 222632
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 98
1 file changed, 83 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35b2ecf99ce..de4cb262575 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -580,9 +580,10 @@ public:
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI,
-                            AliasAnalysis *AA, Function *F)
+                            AliasAnalysis *AA, Function *F,
+                            const TargetTransformInfo *TTI)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
+        DT(DT), TLI(TLI), AA(AA), TheFunction(F), TTI(TTI), Induction(nullptr),
         WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
   }
@@ -768,6 +769,15 @@ public:
   }
   SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
+  bool canPredicateStore(Type *DataType, Value *Ptr) {
+    return TTI->isLegalPredicatedStore(DataType, isConsecutivePtr(Ptr));
+  }
+  bool canPredicateLoad(Type *DataType, Value *Ptr) {
+    return TTI->isLegalPredicatedLoad(DataType, isConsecutivePtr(Ptr));
+  }
+  bool setMaskedOp(const Instruction* I) {
+    return (MaskedOp.find(I) != MaskedOp.end());
+  }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -840,6 +850,8 @@ private:
   AliasAnalysis *AA;
   /// Parent function
   Function *TheFunction;
+  /// Target Transform Info
+  const TargetTransformInfo *TTI;

   // ---  vectorization state --- //
@@ -871,6 +883,10 @@ private:
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
+
+  /// While vectorizing these instructions we have to generate a
+  /// call to an appropriate masked intrinsic
+  std::set<const Instruction*> MaskedOp;
 };

 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1375,7 +1391,7 @@ struct LoopVectorize : public FunctionPass {
     }

     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -1763,7 +1779,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;

-  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
+      !Legal->setMaskedOp(SI))
     return scalarizeInstruction(Instr, true);

   if (ScalarAllocatedSize != VectorElementSize)
@@ -1857,8 +1874,25 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
       Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                             DataTy->getPointerTo(AddressSpace));
-      StoreInst *NewSI =
-        Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+
+      Instruction *NewSI;
+      if (Legal->setMaskedOp(SI)) {
+        Type *I8PtrTy =
+          Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
+
+        Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
+
+        VectorParts Cond = createEdgeMask(SI->getParent()->getSinglePredecessor(),
+                                          SI->getParent());
+        SmallVector<Value *, 8> Ops;
+        Ops.push_back(I8Ptr);
+        Ops.push_back(StoredVal[Part]);
+        Ops.push_back(Builder.getInt32(Alignment));
+        Ops.push_back(Cond[Part]);
+        NewSI = Builder.CreateMaskedStore(Ops);
+      }
+      else
+        NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
       propagateMetadata(NewSI, SI);
     }
     return;
@@ -1873,14 +1907,31 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
     if (Reverse) {
       // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
+      // wide load needs to start at the last vector element.
       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
     }

-    Value *VecPtr = Builder.CreateBitCast(PartPtr,
-                                          DataTy->getPointerTo(AddressSpace));
-    LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    Instruction* NewLI;
+    if (Legal->setMaskedOp(LI)) {
+      Type *I8PtrTy =
+        Builder.getInt8PtrTy(PartPtr->getType()->getPointerAddressSpace());
+
+      Value *I8Ptr = Builder.CreateBitCast(PartPtr, I8PtrTy);
+
+      VectorParts SrcMask = createBlockInMask(LI->getParent());
+      SmallVector<Value *, 8> Ops;
+      Ops.push_back(I8Ptr);
+      Ops.push_back(UndefValue::get(DataTy));
+      Ops.push_back(Builder.getInt32(Alignment));
+      Ops.push_back(SrcMask[Part]);
+      NewLI = Builder.CreateMaskedLoad(Ops);
+    }
+    else {
+      Value *VecPtr = Builder.CreateBitCast(PartPtr,
+                                            DataTy->getPointerTo(AddressSpace));
+      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    }
     propagateMetadata(NewLI, LI);
     Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
   }
@@ -5304,8 +5355,15 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     // We might be able to hoist the load.
     if (it->mayReadFromMemory()) {
       LoadInst *LI = dyn_cast<LoadInst>(it);
-      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
+      if (!LI)
+        return false;
+      if (!SafePtrs.count(LI->getPointerOperand())) {
+        if (canPredicateLoad(LI->getType(), LI->getPointerOperand())) {
+          MaskedOp.insert(LI);
+          continue;
+        }
         return false;
+      }
     }

     // We don't predicate stores at the moment.
@@ -5313,10 +5371,20 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
       StoreInst *SI = dyn_cast<StoreInst>(it);
       // We only support predication of stores in basic blocks with one
       // predecessor.
-      if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+      if (!SI)
+        return false;
+
+      if (++NumPredStores > NumberOfStoresToPredicate ||
           !SafePtrs.count(SI->getPointerOperand()) ||
-          !SI->getParent()->getSinglePredecessor())
+          !SI->getParent()->getSinglePredecessor()) {
+        if (canPredicateStore(SI->getValueOperand()->getType(),
+                              SI->getPointerOperand())) {
+          MaskedOp.insert(SI);
+          --NumPredStores;
+          continue;
+        }
         return false;
+      }
     }
     if (it->mayThrow())
       return false;
@@ -5380,7 +5448,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     MaxVectorSize = 1;
   }

-  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
          " into one vector!");

   unsigned VF = MaxVectorSize;
@@ -5441,7 +5509,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     // the vector elements.
     float VectorCost = expectedCost(i) / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
-          (int)VectorCost << ".\n");
+          VectorCost << ".\n");
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
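The target-side gate queried above (TTI->isLegalPredicatedLoad / TTI->isLegalPredicatedStore) receives the data type and the result of isConsecutivePtr(). A rough sketch of the shape of such a hook follows (names and details assumed for illustration; this is not the actual X86TargetTransformInfo code from this commit):

// Hedged sketch of a target hook (assumed details, not the real X86
// implementation): masked ops are reported legal only for consecutive
// forward accesses on a subtarget with masked vector moves (e.g. AVX2).
// isConsecutivePtr() returns 1 for stride +1, -1 for reverse, 0 otherwise.
bool isLegalPredicatedLoadSketch(int Consecutive, bool HasMaskedMoves) {
  return HasMaskedMoves && Consecutive == 1;
}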