summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorAlina Sbirlea <asbirlea@google.com>2016-12-13 19:32:36 +0000
committerAlina Sbirlea <asbirlea@google.com>2016-12-13 19:32:36 +0000
commit77c5eaaedac7dde060b72a67514c41d2fa9df466 (patch)
treec211bcc74fdca5b533add6e0d3f1004a995ae935 /llvm/lib
parentc74da7ce581f0c42cb92b6a186c3e005f7b159d1 (diff)
downloadbcm5719-llvm-77c5eaaedac7dde060b72a67514c41d2fa9df466.tar.gz
bcm5719-llvm-77c5eaaedac7dde060b72a67514c41d2fa9df466.zip
Generalize strided store pattern in interleave access pass
Summary: This patch aims to generalize matching of the strided store accesses to more general masks. The more general rule is to have consecutive accesses based on the stride: [x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...] The elements in the mask need not form a contiguous space; there may be gaps. As before, undefs are allowed and filled in with adjacent element loads. Reviewers: HaoLiu, mssimpso Subscribers: mkuper, delena, llvm-commits Differential Revision: https://reviews.llvm.org/D23646 llvm-svn: 289573
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp88
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp44
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp42
3 files changed, 145 insertions, 29 deletions
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index e559eed33c9..c8f79d7fb71 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -162,12 +162,17 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
return false;
}
-/// \brief Check if the mask is RE-interleave mask for an interleaved store.
-///
-/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
+/// \brief Check if the mask can be used in an interleaved store.
+///
+/// It checks for a more general pattern than the RE-interleave mask.
+/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
+/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
-/// E.g. The RE-interleave mask (Factor = 2) could be:
-/// <0, 4, 1, 5, 2, 6, 3, 7>
+/// The particular case of an RE-interleave mask is:
+/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
unsigned MaxFactor) {
unsigned NumElts = Mask.size();
@@ -179,21 +184,72 @@ static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
if (NumElts % Factor)
continue;
- unsigned NumSubElts = NumElts / Factor;
- if (!isPowerOf2_32(NumSubElts))
+ unsigned LaneLen = NumElts / Factor;
+ if (!isPowerOf2_32(LaneLen))
continue;
- // Check whether each element matchs the RE-interleaved rule. Ignore undef
- // elements.
- unsigned i = 0;
- for (; i < NumElts; i++)
- if (Mask[i] >= 0 &&
- static_cast<unsigned>(Mask[i]) !=
- (i % Factor) * NumSubElts + i / Factor)
+ // Check whether each element matches the general interleaved rule.
+ // Ignore undef elements, as long as the defined elements match the rule.
+ // Outer loop processes all factors (x, y, z in the above example)
+ unsigned I = 0, J;
+ for (; I < Factor; I++) {
+ unsigned SavedLaneValue;
+ unsigned SavedNoUndefs = 0;
+
+ // Inner loop processes consecutive accesses (x, x+1... in the example)
+ for (J = 0; J < LaneLen - 1; J++) {
+ // Lane computes x's position in the Mask
+ unsigned Lane = J * Factor + I;
+ unsigned NextLane = Lane + Factor;
+ int LaneValue = Mask[Lane];
+ int NextLaneValue = Mask[NextLane];
+
+ // If both are defined, values must be sequential
+ if (LaneValue >= 0 && NextLaneValue >= 0 &&
+ LaneValue + 1 != NextLaneValue)
+ break;
+
+ // If the next value is undef, save the current one as reference
+ if (LaneValue >= 0 && NextLaneValue < 0) {
+ SavedLaneValue = LaneValue;
+ SavedNoUndefs = 1;
+ }
+
+ // Undefs are allowed, but defined elements must still be consecutive:
+ // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
+ // Verify this by storing the last non-undef followed by an undef
+ // Check that following non-undef masks are incremented with the
+ // corresponding distance.
+ if (SavedNoUndefs > 0 && LaneValue < 0) {
+ SavedNoUndefs++;
+ if (NextLaneValue >= 0 &&
+ SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
+ break;
+ }
+ }
+
+ if (J < LaneLen - 1)
break;
- // Find a RE-interleaved mask of current factor.
- if (i == NumElts)
+ int StartMask = 0;
+ if (Mask[I] >= 0) {
+      // The first element of the lane (J=0) is defined (non-undef); use it
+ StartMask = Mask[I];
+ } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
+ // StartMask defined by the last value in lane
+ StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
+ } else if (SavedNoUndefs > 0) {
+      // StartMask defined by the last non-undef value seen in the j loop
+ StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
+ }
+ // else StartMask remains set to 0, i.e. all elements are undefs
+
+ if (StartMask < 0)
+ break;
+ }
+
+ // Found an interleaved mask of current factor.
+ if (I == Factor)
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af3ab1b0fb3..4dff402ba70 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7281,7 +7281,7 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
-/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
@@ -7292,6 +7292,17 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
@@ -7302,9 +7313,9 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");
- unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -7329,7 +7340,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, NumSubElts);
+ SubVecTy = VectorType::get(IntTy, LaneLen);
}
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
@@ -7343,9 +7354,28 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SmallVector<Value *, 5> Ops;
// Split the shufflevector operands into sub vectors for the new stN call.
- for (unsigned i = 0; i < Factor; i++)
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++) {
+ if (Mask[i] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ if (Mask[j*Factor + i] >= 0) {
+ StartMask = Mask[j*Factor + i] - j;
+ break;
+ }
+ }
+ // Note: If all elements in a chunk are undefs, StartMask=0!
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // In the case of all undefs we're defaulting to using elems from 0
+ // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+ }
+ }
Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
Builder.CreateCall(StNFunc, Ops);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index de4b7f7dfe5..4751d24a9fb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13191,6 +13191,17 @@ static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
@@ -13201,9 +13212,9 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");
- unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -13230,7 +13241,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, NumSubElts);
+ SubVecTy = VectorType::get(IntTy, LaneLen);
}
static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
@@ -13246,9 +13257,28 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SI->getModule(), StoreInts[Factor - 2], Tys);
// Split the shufflevector operands into sub vectors for the new vstN call.
- for (unsigned i = 0; i < Factor; i++)
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++) {
+ if (Mask[i] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ if (Mask[j*Factor + i] >= 0) {
+ StartMask = Mask[j*Factor + i] - j;
+ break;
+ }
+ }
+ // Note: If all elements in a chunk are undefs, StartMask=0!
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // In the case of all undefs we're defaulting to using elems from 0
+ // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+ }
+ }
Ops.push_back(Builder.getInt32(SI->getAlignment()));
Builder.CreateCall(VstNFunc, Ops);
OpenPOWER on IntegriCloud