Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp         | 66
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h           |  2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  | 24
3 files changed, 84 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 88404804154..0c72f2ebee1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -742,6 +742,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
+
+ setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2673,6 +2675,68 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// Custom lower truncating stores of v4i8 vectors, since v4i8 is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+ EVT VT, EVT MemVT,
+ SelectionDAG &DAG) {
+ assert(VT.isVector() && "VT should be a vector type");
+ assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+ SDValue Value = ST->getValue();
+
+ // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
+ // the word lane that represents the v4i8 subvector. This optimizes the store
+ // to:
+ //
+ // xtn v0.8b, v0.8h
+ // str s0, [x0]
+
+ SDValue Undef = DAG.getUNDEF(MVT::i16);
+ SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+ {Undef, Undef, Undef, Undef});
+
+ SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+ Value, UndefVec);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+ Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+ SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+ return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+ ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for stores, vector or scalar, truncating or not. Currently
+// we only custom lower truncating stores from v4i16 to v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc Dl(Op);
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+ assert (StoreNode && "Can only custom lower store nodes");
+
+ SDValue Value = StoreNode->getValue();
+
+ EVT VT = Value.getValueType();
+ EVT MemVT = StoreNode->getMemoryVT();
+
+ assert (VT.isVector() && "Can only custom lower vector store types");
+
+ unsigned AS = StoreNode->getAddressSpace();
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+ return scalarizeVectorStore(StoreNode, DAG);
+ }
+
+ if (StoreNode->isTruncatingStore()) {
+ return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2784,6 +2848,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerMULH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
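
As a reference for what the new lowering computes, here is a minimal standalone C++ sketch (the helper name truncStoreV4i16ToV4i8 is hypothetical, not part of the patch): each 16-bit lane of the promoted value is truncated to 8 bits and the four resulting bytes are stored contiguously, which is what the xtn v0.8b, v0.8h / str s0, [x0] sequence described in LowerTruncateVectorStore performs. Without the custom lowering, such a store would instead be scalarized into per-element extracts and byte stores.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar reference of a v4i16 -> v4i8 truncating store: truncate each
// 16-bit lane to 8 bits, then store the four bytes as one 32-bit chunk.
static void truncStoreV4i16ToV4i8(const uint16_t in[4], uint8_t *dst) {
  uint8_t lanes[4];
  for (int i = 0; i < 4; ++i)
    lanes[i] = static_cast<uint8_t>(in[i]); // keep the low 8 bits of each lane
  std::memcpy(dst, lanes, 4);               // single 32-bit-wide store (str s0)
}

int main() {
  uint16_t v[4] = {0x1234, 0x00ff, 0xabcd, 0x0080};
  uint8_t out[4];
  truncStoreV4i16ToV4i8(v, out);
  std::printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
  // prints: 34 ff cd 80
  return 0;
}
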
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c8ebf4d9398..8d883c14c2c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@ private:
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 42639edb4a0..d75fef7b017 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
- Ty->getVectorNumElements() < 8) {
- // We scalarize the loads/stores because there is not v.4b register and we
- // have to promote the elements to v.4h.
- unsigned NumVecElts = Ty->getVectorNumElements();
- unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
- // We generate 2 instructions per vector element.
- return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+ unsigned ProfitableNumElements;
+ if (Opcode == Instruction::Store)
+ // We use a custom trunc store lowering so v.4b should be profitable.
+ ProfitableNumElements = 4;
+ else
+ // We scalarize the loads because there is no v.4b register and we
+ // have to promote the elements to v.4h.
+ ProfitableNumElements = 8;
+
+ if (Ty->getVectorNumElements() < ProfitableNumElements) {
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+ // We generate 2 instructions per vector element.
+ return NumVectorizableInstsToAmortize * NumVecElts * 2;
+ }
}
return LT.first;
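
A minimal sketch of the amortization formula above, using the same constants (scalarizedI8VecCost is a hypothetical standalone name, not a TTI hook): with the custom truncating-store lowering, v4i8 stores no longer hit the scalarization penalty, since 4 is not below the new store threshold of 4, and fall through to the legalization cost LT.first, while v4i8 loads and v2i8 stores still pay it.

#include <cstdio>

// Amortized scalarization cost for small i8 vectors: two instructions are
// generated per element, and twice the element count is assumed necessary
// to amortize the vectorization overhead.
static unsigned scalarizedI8VecCost(unsigned NumVecElts) {
  unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
  return NumVectorizableInstsToAmortize * NumVecElts * 2;
}

int main() {
  // Loads are profitable only at 8+ elements, so v4i8 loads stay expensive.
  std::printf("v4i8 load cost : %u\n", scalarizedI8VecCost(4)); // 64
  // Stores are profitable at 4+ elements; only v2i8 stores still hit the
  // scalarization formula.
  std::printf("v2i8 store cost: %u\n", scalarizedI8VecCost(2)); // 16
  return 0;
}
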