Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp        | 121
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h          |   2
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp |  25
3 files changed, 107 insertions, 41 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a33535ecd17..e359756b7bf 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -16681,15 +16681,20 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- VectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
+ return false;
+
// Ensure the vector doesn't have f16 elements. Even though we could do an
// i16 vldN, we can't hold the f16 vectors and will end up converting via
// f32.
- if (VecTy->getElementType()->isHalfTy())
+ if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
+ return false;
+ if (Subtarget->hasMVEIntegerOps() && Factor == 3)
return false;
// Ensure the number of vector elements is greater than 1.
@@ -16702,12 +16707,16 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
- return VecSize == 64 || VecSize % 128 == 0;
+ if (Subtarget->hasNEON() && VecSize == 64)
+ return true;
+ return VecSize % 128 == 0;
}
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
if (Subtarget->hasNEON())
return 4;
+ if (Subtarget->hasMVEIntegerOps())
+ return 4;
return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}
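Before the load/store hooks below, it is worth pinning down what the new legality rules accept. The following standalone sketch mirrors the size arithmetic above using the LLVM C++ API; the DataLayout string, the helper name, and the driver function are assumptions for illustration, and the f16 and element-count checks from the hunk above are omitted for brevity.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical mirror of isLegalInterleavedAccessType: NEON accepts 64-bit
// vectors outright; otherwise the total size must be a multiple of 128 bits,
// and MVE rejects Factor == 3 since there is no vld3q/vst3q instruction.
static bool sketchIsLegal(bool HasNEON, bool HasMVE, unsigned Factor,
                          VectorType *VecTy, const DataLayout &DL) {
  if (!HasNEON && !HasMVE)
    return false;
  if (HasMVE && Factor == 3)
    return false;
  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  if (HasNEON && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

void demo() {
  LLVMContext Ctx;
  DataLayout DL("e-p:32:32"); // assumed 32-bit layout; only sizes matter here
  VectorType *V2I32 = VectorType::get(Type::getInt32Ty(Ctx), 2); // 64 bits
  VectorType *V4I32 = VectorType::get(Type::getInt32Ty(Ctx), 4); // 128 bits
  bool A = sketchIsLegal(true, false, 2, V2I32, DL);  // true: NEON vld2
  bool B = sketchIsLegal(false, true, 2, V2I32, DL);  // false: MVE wants 128
  bool C = sketchIsLegal(false, true, 3, V4I32, DL);  // false: no MVE vld3q
  (void)A; (void)B; (void)C;
}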
@@ -16739,7 +16748,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// Skip if we do not have NEON or MVE and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
@@ -16771,13 +16780,37 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Type *Tys[] = {VecTy, Int8Ptr};
- static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
- Intrinsic::arm_neon_vld3,
- Intrinsic::arm_neon_vld4};
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+ auto createLoadIntrinsic = [&](Value *BaseAddr) {
+ if (Subtarget->hasNEON()) {
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID LoadInts =
+ Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
+ Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, VecEltTy};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
+
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
+ return Builder.CreateCall(VldnFunc, Ops, "vldN");
+ }
+ };
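For a factor-2 load of <4 x i32>, the two branches above produce calls shaped roughly as follows; the type-mangled intrinsic suffixes and value names are illustrative guesses, not taken from the patch. The NEON form passes an i8* base and an explicit alignment operand, while the MVE form takes only a pointer to the element type:

%vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* %base, i32 4)
%vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* %base)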
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
@@ -16792,11 +16825,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
VecTy->getVectorNumElements() * Factor);
- SmallVector<Value *, 2> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
-
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+ CallInst *VldN = createLoadIntrinsic(BaseAddr);
// Replace uses of each shufflevector with the corresponding vector loaded
// by ldN.
@@ -16875,7 +16904,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// Skip if we do not have NEON or MVE and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+ if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
return false;
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
@@ -16919,11 +16948,46 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
auto Mask = SVI->getShuffleMask();
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Type *Tys[] = {Int8Ptr, SubVecTy};
- static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
- Intrinsic::arm_neon_vst3,
- Intrinsic::arm_neon_vst4};
+ auto createStoreIntrinsic = [&](Value *BaseAddr,
+ SmallVectorImpl<Value *> &Shuffles) {
+ if (Subtarget->hasNEON()) {
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
+
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ } else {
+ assert((Factor == 2 || Factor == 4) &&
+ "expected interleave factor of 2 or 4 for MVE");
+ Intrinsic::ID StoreInts =
+ Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
+ Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace());
+ Type *Tys[] = {EltPtrTy, SubVecTy};
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
+
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
+ for (auto S : Shuffles)
+ Ops.push_back(S);
+ for (unsigned F = 0; F < Factor; F++) {
+ Ops.push_back(Builder.getInt32(F));
+ Builder.CreateCall(VstNFunc, Ops);
+ Ops.pop_back();
+ }
+ }
+ };
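Unlike NEON's single vstN call, the MVE vst2q/vst4q intrinsics take a trailing immediate selecting which interleaving stage to write, so the loop above issues Factor calls over the same shuffled operands. For Factor == 2 the result is shaped roughly like this (mangled names and value names again illustrative):

call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %base, <4 x i32> %v0, <4 x i32> %v1, i32 0)
call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %base, <4 x i32> %v0, <4 x i32> %v1, i32 1)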
for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
// If we're generating more than one store, we compute the base address of
@@ -16932,17 +16996,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
BaseAddr, LaneLen * Factor);
- SmallVector<Value *, 6> Ops;
- Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
-
- Function *VstNFunc =
- Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+ SmallVector<Value *, 4> Shuffles;
// Split the shufflevector operands into sub vectors for the new vstN call.
for (unsigned i = 0; i < Factor; i++) {
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
@@ -16959,13 +17019,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// In the case of all undefs we're defaulting to using elems from 0
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
+ Shuffles.push_back(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
}
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ createStoreIntrinsic(BaseAddr, Shuffles);
}
return true;
}
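Taken together, these hooks let the InterleavedAccess pass fire on MVE targets. As a sketch of the standard pattern the pass matches (value names illustrative), a deinterleaving load such as

%wide = load <8 x i32>, <8 x i32>* %ptr, align 4
%even = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%odd  = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

is now rewritten into a single @llvm.arm.mve.vld2q call whose two results replace %even and %odd; the store path handles the matching re-interleaving shufflevector-plus-store pattern with vst2q/vst4q.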
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 367a40b8968..afb4750ee35 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -604,7 +604,7 @@ class VectorType;
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
- bool isLegalInterleavedAccessType(VectorType *VecTy,
+ bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
const DataLayout &DL) const;
bool alignLoopsWithOptSize() const override;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cc5fae4a869..10faeb75b33 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -755,13 +755,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return BaseCost * LT.first;
}
-int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -776,9 +773,19 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one vldN/vstN instruction.
+ int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
if (NumElts % Factor == 0 &&
- TLI->isLegalInterleavedAccessType(SubVecTy, DL))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+ TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+ return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+
+ // Some smaller than legal interleaved patterns are cheap as we can make
+ // use of the vmovn or vrev patterns to interleave a standard load. This is
+ // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
+ // promoted differently). The cost of 2 here is then a load and vrev or
+ // vmovn.
+ if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
+ VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
+ return 2 * BaseCost;
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace,
UseMaskForCond, UseMaskForGaps);
}
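A worked example of the cost rules above, written as C++ comments; the value of getMVEVectorCostFactor() is assumed to be 2 here purely for illustration:

// Factor-2 load of <16 x i16> on MVE: SubVecTy is <8 x i16> (128 bits,
// legal), giving one interleaved access, so the cost is
// Factor * BaseCost * NumAccesses = 2 * 2 * 1 = 4.
//
// Factor-2 load of <8 x i8> on MVE: SubVecTy is <4 x i8> (32 bits, not a
// legal interleaved type), but Factor == 2, NumElts / Factor = 4 > 2, the
// element type is integer, and 32 <= 64, so the new early-out returns
// 2 * BaseCost = 4: one full-width load plus one vrev/vmovn-style fixup
// instead of a scalarized fallback.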