summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/ARM/ARMISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp193
1 files changed, 140 insertions, 53 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 76df9a57015..4b6d0e1a34b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13282,6 +13282,13 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Addr});
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+static unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) {
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
/// \brief Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -13310,8 +13317,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
// Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vldN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+ // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0) ||
+ EltIs64Bits)
return false;
// Skip if the vector has f16 elements: even though we could do an i16 vldN,
@@ -13319,43 +13329,87 @@ bool ARMTargetLowering::lowerInterleavedLoad(
if (EltTy->isHalfTy())
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
- static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
- Intrinsic::arm_neon_vld3,
- Intrinsic::arm_neon_vld4};
-
IRBuilder<> Builder(LI);
- SmallVector<Value *, 2> Ops;
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
- Type *Tys[] = { VecTy, Int8Ptr };
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SV = Shuffles[i];
- unsigned Index = Indices[i];
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
+
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
+
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+ CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SV = Shuffles[i];
+ unsigned Index = Indices[i];
+
+ Value *SubVec = Builder.CreateExtractValue(VldN, Index);
- Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+ SubVecs[SV].push_back(SubVec);
+ }
+ }
- SV->replaceAllUsesWith(SubVec);
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
}
return true;
@@ -13406,8 +13460,10 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
// Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vstN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+ // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can
+ // "legalize" wide vector types into multiple interleaved accesses as long as
+ // the vector types are divisible by 128.
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0) ||
EltIs64Bits)
return false;
@@ -13416,6 +13472,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
if (EltTy->isHalfTy())
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -13434,46 +13492,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
+
+ auto Mask = SVI->getShuffleMask();
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
Intrinsic::arm_neon_vst3,
Intrinsic::arm_neon_vst4};
- SmallVector<Value *, 6> Ops;
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
+ // If we generating more than one store, we compute the base address of
+ // subsequent stores as an offset from the previous.
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
- Type *Tys[] = { Int8Ptr, SubVecTy };
- Function *VstNFunc = Intrinsic::getDeclaration(
- SI->getModule(), StoreInts[Factor - 2], Tys);
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
- // Split the shufflevector operands into sub vectors for the new vstN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ // Split the shufflevector operands into sub vectors for the new vstN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+ // Note: If all elements in a chunk are undefs, StartMask=0!
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // In the case of all undefs we're defaulting to using elems from 0
+ // Note: StartMask cannot be negative, it's checked in
+ // isReInterleaveMask
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- }
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ }
return true;
}
OpenPOWER on IntegriCloud