diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-15 13:20:41 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-15 13:20:41 +0000 |
| commit | 5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5 (patch) | |
| tree | c8cf8ce3c97fc6f05ef11f87d067c19502dd1946 /llvm/lib | |
| parent | 10ec5c8c285174826e4124e2dacf5d7a324bc8be (diff) | |
| download | bcm5719-llvm-5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5.tar.gz bcm5719-llvm-5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5.zip | |
[ARM][NEON] Improve vector popcnt lowering with PADDL (PR39281)
As I suggested on PR39281, this patch uses PADDL pairwise addition to widen from the vXi8 CTPOP result to the target vector type.
This is a blocker for moving more x86 code to generic vector CTPOP expansion (P32655 + D53258). ARM's vXi64 CTPOP currently expands, which would generate a vXi64 MUL, but ARM's custom lowering expands the general MUL case and vectors aren't well handled in LegalizeDAG. Improving the CTPOP lowering was a lot easier than fixing the MUL lowering for this one case.
Differential Revision: https://reviews.llvm.org/D53257
llvm-svn: 344512
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 156 |
1 files changed, 26 insertions, 130 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bfff368a8fe..3527d049f50 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -669,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); - setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); - setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); + setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); @@ -5409,10 +5409,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, // Compute with: cttz(x) = ctpop(lsb - 1) - // Since we can only compute the number of bits in a byte with vcnt.8, we - // have to gather the result with pairwise addition (vpaddl) for i16, i32, - // and i64. - // Compute LSB - 1. SDValue Bits; if (ElemTy == MVT::i64) { @@ -5425,32 +5421,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } - - // Count #bits with vcnt.8. - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); - SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); - - // Gather the #bits with vpaddl (pairwise add.) - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; - SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt8); - if (ElemTy == MVT::i16) - return Cnt16; - - EVT VT32Bit = VT.is64BitVector() ? 
MVT::v2i32 : MVT::v4i32; - SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt16); - if (ElemTy == MVT::i32) - return Cnt32; - - assert(ElemTy == MVT::i64); - SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt32); - return Cnt64; + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if (!ST->hasV6T2Ops()) @@ -5460,112 +5431,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } -/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count -/// for each 16-bit element from operand, repeated. The basic idea is to -/// leverage vcnt to get the 8-bit counts, gather and add the results. -/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) -/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) -/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] -/// [b0 b1 b2 b3 b4 b5 b6 b7] -/// +[b1 b0 b3 b2 b5 b4 b7 b6] -/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, -/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) -static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); - - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); - SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); - SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); - return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); -} - -/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the -/// bit-count for each 16-bit element from the operand. 
We need slightly -/// different sequencing for v4i16 and v8i16 to stay within NEON's available -/// 64/128-bit registers. -/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) -/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] -/// v4i16:Extracted = [k0 k1 k2 k3 ] -static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { +static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc DL(N); - SDValue BitCounts = getCTPOP16BitCounts(N, DAG); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, - BitCounts, DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); - } -} - -/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the -/// bit-count for each 32-bit element from the operand. The idea here is -/// to split the vector into 16-bit elements, leverage the 16-bit count -/// routine, and then combine the results. 
-/// -/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): -/// input = [v0 v1 ] (vi: 32-bit elements) -/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) -/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) -/// vrev: N0 = [k1 k0 k3 k2 ] -/// [k0 k1 k2 k3 ] -/// N1 =+[k1 k0 k3 k2 ] -/// [k0 k2 k1 k3 ] -/// N2 =+[k1 k3 k0 k2 ] -/// [k0 k2 k1 k3 ] -/// Extended =+[k1 k3 k0 k2 ] -/// [k0 k2 ] -/// Extracted=+[k1 k3 ] -/// -static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); + assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); + assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || + VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); + Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); - SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); - SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); - SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); - SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); - SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); + // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. + unsigned EltSize = 8; + unsigned NumElts = VT.is64BitVector() ? 
8 : 16; + while (EltSize != VT.getScalarSizeInBits()) { + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Res); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); + EltSize *= 2; + NumElts /= 2; + MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); } -} -static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - - assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); - assert((VT == MVT::v2i32 || VT == MVT::v4i32 || - VT == MVT::v4i16 || VT == MVT::v8i16) && - "Unexpected type for custom ctpop lowering"); - - if (VT.getVectorElementType() == MVT::i32) - return lowerCTPOP32BitElements(N, DAG); - else - return lowerCTPOP16BitElements(N, DAG); + return Res; } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, |

