Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 44
1 file changed, 37 insertions, 7 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af3ab1b0fb3..4dff402ba70 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7281,7 +7281,7 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
-///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
 ///
 /// Into:
@@ -7292,6 +7292,17 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// st3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+///                  <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
@@ -7302,9 +7313,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   assert(VecTy->getVectorNumElements() % Factor == 0 &&
          "Invalid interleaved store");
 
-  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
   Type *EltTy = VecTy->getVectorElementType();
-  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -7329,7 +7340,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
-    SubVecTy = VectorType::get(IntTy, NumSubElts);
+    SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
@@ -7343,9 +7354,28 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   SmallVector<Value *, 5> Ops;
 
   // Split the shufflevector operands into sub vectors for the new stN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+  auto Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0) {
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+    } else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < LaneLen; j++) {
+        if (Mask[j*Factor + i] >= 0) {
+          StartMask = Mask[j*Factor + i] - j;
+          break;
+        }
+      }
+      // Note: If all elements in a chunk are undefs, StartMask=0!
+      // Note: Filling undef gaps with random elements is ok, since
+      // those elements were being written anyway (with undefs).
+      // In the case of all undefs we're defaulting to using elems from 0
+      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+    }
+  }
 
   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
   Builder.CreateCall(StNFunc, Ops);
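To make the new mask-splitting logic easier to follow outside the patch, here is a minimal standalone C++ sketch of what the rewritten loop computes for each of the Factor fields: use Mask[i] as the start of the sequential sub-mask when that element is defined, otherwise scan later lanes of the same field for the first defined element and back-compute the start from it, defaulting to 0 when the whole field is undef. The splitStartIndices helper, the std::vector<int> mask, and the -1 encoding of undef lanes are illustrative stand-ins rather than LLVM API; in the patch itself the mask comes from SVI->getShuffleMask() and each start index feeds getSequentialMask(Builder, StartMask, LaneLen).

// Standalone illustration (not LLVM code) of the start-index computation
// performed by the generalized lowerInterleavedStore loop.
// Undef mask elements are encoded as -1, as in LLVM shuffle masks.
#include <cassert>
#include <cstdio>
#include <vector>

static std::vector<int> splitStartIndices(const std::vector<int> &Mask,
                                          unsigned Factor) {
  assert(Mask.size() % Factor == 0 && "mask must cover whole lanes");
  unsigned LaneLen = Mask.size() / Factor;
  std::vector<int> Starts(Factor, 0);

  for (unsigned i = 0; i < Factor; i++) {
    if (Mask[i] >= 0) {
      // First element of field i is defined: it is the start directly.
      Starts[i] = Mask[i];
    } else {
      // Scan later lanes of field i for the first defined element and
      // back-compute the start; default to 0 if the whole field is undef.
      int StartMask = 0;
      for (unsigned j = 1; j < LaneLen; j++) {
        if (Mask[j * Factor + i] >= 0) {
          StartMask = Mask[j * Factor + i] - (int)j;
          break;
        }
      }
      Starts[i] = StartMask;
    }
  }
  return Starts;
}

int main() {
  // The mask from the new doc-comment example (Factor = 3, LaneLen = 4),
  // with the first element of each field replaced by undef (-1).
  std::vector<int> Mask = {-1, -1, -1, 5, 33, 17, 6, 34, 18, 7, 35, 19};
  for (int S : splitStartIndices(Mask, 3))
    std::printf("sub-vector starts at element %d\n", S); // prints 4, 32, 16
  return 0;
}

For the fully defined mask in the original doc-comment example (<0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>), the same computation yields starts 0, 4, 8, i.e. it degenerates to the old LaneLen * i behaviour that the removed loop hard-coded.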