Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/AArch64/AArch64ISelLowering.cpp           66
-rw-r--r--   llvm/lib/Target/AArch64/AArch64ISelLowering.h              2
-rw-r--r--   llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp    24
3 files changed, 84 insertions, 8 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 88404804154..0c72f2ebee1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -742,6 +742,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2673,6 +2675,68 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 }
 
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
+  // the word lane which represents the v4i8 subvector. This optimizes the
+  // store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar and/or default or with
+// a truncate operation. Currently we only custom lower truncating stores
+// from vector v4i16 to v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2784,6 +2848,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c8ebf4d9398..8d883c14c2c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@ private:
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 42639edb4a0..d75fef7b017 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Ty->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+    unsigned ProfitableNumElements;
+    if (Opcode == Instruction::Store)
+      // We use a custom trunc store lowering so v.4b should be profitable.
+      ProfitableNumElements = 4;
+    else
+      // We scalarize the loads because there is no v.4b register and we
+      // have to promote the elements to v.4h.
+      ProfitableNumElements = 8;
+
+    if (Ty->getVectorNumElements() < ProfitableNumElements) {
+      unsigned NumVecElts = Ty->getVectorNumElements();
+      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+      // We generate 2 instructions per vector element.
+      return NumVectorizableInstsToAmortize * NumVecElts * 2;
+    }
   }
 
   return LT.first;
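
For illustration only (not part of the commit): a C-level source pattern that can produce the <4 x i16> -> <4 x i8> truncating store this patch targets, assuming the vectorizer picks the v4i8 form now that the cost model above no longer penalizes it. The function name is made up; the expected code in the comment mirrors the xtn/str sequence shown in LowerTruncateVectorStore.

// Hypothetical example: a narrowing copy that may be vectorized into a
// <4 x i16> -> <4 x i8> truncating store on AArch64. With this patch the
// vector store side should lower to roughly
//   xtn  v0.8b, v0.8h    // narrow the promoted v4i16 value to bytes
//   str  s0, [x1]        // store the low 32-bit lane holding the v4i8 payload
// instead of being scalarized into four single-byte stores.
void narrow_store4(const unsigned short *src, unsigned char *dst) {
  for (int i = 0; i < 4; ++i)
    dst[i] = (unsigned char)src[i];
}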
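
Likewise, a small self-contained sketch (my own model, not LLVM API code) of the cost rule the AArch64TargetTransformInfo.cpp hunk introduces: i8-vector stores are only penalized below 4 elements because of the new trunc-store lowering, while loads keep the 8-element threshold; below the threshold the cost is the same amortized two-instructions-per-element estimate as before. The names and the baseCost parameter are illustrative (the real code returns LT.first).

#include <cstdio>

// Illustrative model of the new getMemoryOpCost rule for <N x i8> vectors.
unsigned i8VectorMemOpCost(bool isStore, unsigned numElts, unsigned baseCost) {
  // Stores now have a custom v4i8 trunc-store lowering, so only vectors with
  // fewer than 4 elements get the scalarization penalty; loads still need 8.
  unsigned profitableNumElements = isStore ? 4 : 8;
  if (numElts < profitableNumElements) {
    unsigned instsToAmortize = numElts * 2; // 2 instructions per element
    return instsToAmortize * numElts * 2;
  }
  return baseCost;
}

int main() {
  std::printf("v4i8 store cost: %u\n", i8VectorMemOpCost(true, 4, 1));  // 1
  std::printf("v4i8 load cost:  %u\n", i8VectorMemOpCost(false, 4, 1)); // 64
  return 0;
}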

