summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorHao Liu <Hao.Liu@arm.com>2015-06-26 02:45:36 +0000
committerHao Liu <Hao.Liu@arm.com>2015-06-26 02:45:36 +0000
commit2cd34bb5857b74561277cc146a2558c2fb93e91c (patch)
treebaa9932626e18ff354a6a4ff65023732ad3cab9b /llvm/lib
parent7ec8ee311942d45b2362cfbd1da322cd38cb8a48 (diff)
downloadbcm5719-llvm-2cd34bb5857b74561277cc146a2558c2fb93e91c.tar.gz
bcm5719-llvm-2cd34bb5857b74561277cc146a2558c2fb93e91c.zip
[ARM] Lower interleaved memory accesses to vldN/vstN intrinsics.
This patch also adds a function to calculate the cost of interleaved memory accesses. E.g. Lower an interleaved load: %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> into: %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 E.g. Lower an interleaved store: %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 into: %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) Differential Revision: http://reviews.llvm.org/D10533 llvm-svn: 240755
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp161
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h9
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp25
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h5
5 files changed, 204 insertions, 0 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index ac4233cf92e..4b2105b7442 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -11404,6 +11404,167 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Addr});
}
+/// \brief Lower an interleaved load into a vldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
+/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
+///
+/// Into:
+/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
+/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
+/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
+bool ARMTargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ VectorType *VecTy = Shuffles[0]->getType();
+ Type *EltTy = VecTy->getVectorElementType();
+
+ const DataLayout *DL = getDataLayout();
+ unsigned VecSize = DL->getTypeAllocSizeInBits(VecTy);
+ bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+
+ // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't
+ // support i64/f64 element).
+ if ((VecSize != 64 && VecSize != 128) || EltIs64Bits)
+ return false;
+
+ // A pointer vector can not be the return type of the ldN intrinsics. Need to
+ // load integer vectors first and then convert to pointer vectors.
+ if (EltTy->isPointerTy())
+ VecTy = VectorType::get(DL->getIntPtrType(EltTy),
+ VecTy->getVectorNumElements());
+
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+ Intrinsic::arm_neon_vld3,
+ Intrinsic::arm_neon_vld4};
+
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
+
+ IRBuilder<> Builder(LI);
+ SmallVector<Value *, 2> Ops;
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+ CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SV = Shuffles[i];
+ unsigned Index = Indices[i];
+
+ Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+ SV->replaceAllUsesWith(SubVec);
+ }
+
+ return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumElts) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumElts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a vstN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
+///
+/// Into:
+/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// vst3 instruction in CodeGen.
+bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ VectorType *VecTy = SVI->getType();
+ assert(VecTy->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ Type *EltTy = VecTy->getVectorElementType();
+ VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+ const DataLayout *DL = getDataLayout();
+ unsigned SubVecSize = DL->getTypeAllocSizeInBits(SubVecTy);
+ bool EltIs64Bits = DL->getTypeAllocSizeInBits(EltTy) == 64;
+
+ // Skip illegal sub vector types and vector types of i64/f64 element (vstN
+ // doesn't support i64/f64 element).
+ if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits)
+ return false;
+
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+ IRBuilder<> Builder(SI);
+
+ // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+ // vectors to integer vectors.
+ if (EltTy->isPointerTy()) {
+ Type *IntTy = DL->getIntPtrType(EltTy);
+
+ // Convert to the corresponding integer vector.
+ Type *IntVecTy =
+ VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+ Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+ SubVecTy = VectorType::get(IntTy, NumSubElts);
+ }
+
+ static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+ Intrinsic::arm_neon_vst3,
+ Intrinsic::arm_neon_vst4};
+ Function *VstNFunc = Intrinsic::getDeclaration(
+ SI->getModule(), StoreInts[Factor - 2], SubVecTy);
+
+ SmallVector<Value *, 6> Ops;
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+
+ // Split the shufflevector operands into sub vectors for the new vstN call.
+ for (unsigned i = 0; i < Factor; i++)
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ return true;
+}
+
enum HABaseType {
HA_UNKNOWN = 0,
HA_FLOAT,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index c0b329c5a1e..74396392f8e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -433,6 +433,15 @@ namespace llvm {
Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
bool IsStore, bool IsLoad) const override;
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+ bool lowerInterleavedLoad(LoadInst *LI,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices,
+ unsigned Factor) const override;
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicRMWExpansionKind
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 104a34f97e5..6e81bd2d349 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -332,6 +332,10 @@ void ARMPassConfig::addIRPasses() {
}));
TargetPassConfig::addIRPasses();
+
+ // Match interleaved memory accesses to ldN/stN intrinsics.
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ addPass(createInterleavedAccessPass(TM));
}
bool ARMPassConfig::addPreISel() {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 4e1b371640b..f4901fc24e4 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -478,3 +478,28 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
return LT.first;
}
+
+unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ assert(Factor >= 2 && "Invalid interleave factor");
+ assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+ // vldN/vstN doesn't support vector types of i64/f64 element.
+ bool EltIs64Bits = DL->getTypeAllocSizeInBits(VecTy->getScalarType()) == 64;
+
+ if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+ unsigned NumElts = VecTy->getVectorNumElements();
+ Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+ unsigned SubVecSize = TLI->getDataLayout()->getTypeAllocSize(SubVecTy);
+
+ // vldN/vstN only support legal vector types of size 64 or 128 in bits.
+ if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+ return Factor;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 9479d7693eb..f2e5db655cc 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -126,6 +126,11 @@ public:
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
+ unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
OpenPOWER on IntegriCloud