summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
author: Simon Pilgrim <llvm-dev@redking.me.uk> 2018-10-15 13:20:41 +0000
committer: Simon Pilgrim <llvm-dev@redking.me.uk> 2018-10-15 13:20:41 +0000
commit: 5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5 (patch)
tree: c8cf8ce3c97fc6f05ef11f87d067c19502dd1946 /llvm/lib
parent: 10ec5c8c285174826e4124e2dacf5d7a324bc8be (diff)
download: bcm5719-llvm-5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5.tar.gz
bcm5719-llvm-5abb607ebe1f44cfe78e62a782d6c7451e4fd2d5.zip
[ARM][NEON] Improve vector popcnt lowering with PADDL (PR39281)
As I suggested on PR39281, this patch uses PADDL (NEON's vpaddl widening pairwise addition) to widen the vXi8 CTPOP result step by step to the target vector type. This is a blocker for moving more x86 code to the generic vector CTPOP expansion (P32655 + D53258): ARM's vXi64 CTPOP currently expands, which would generate a vXi64 MUL, but ARM's custom lowering expands the general MUL case and vectors aren't well handled in LegalizeDAG. Improving the CTPOP lowering was a lot easier than fixing the MUL lowering for this one case. Differential Revision: https://reviews.llvm.org/D53257 llvm-svn: 344512
Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp 156
1 files changed, 26 insertions, 130 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index bfff368a8fe..3527d049f50 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -669,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v1i64, Expand);
- setOperationAction(ISD::CTPOP, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
@@ -5409,10 +5409,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
// Compute with: cttz(x) = ctpop(lsb - 1)
- // Since we can only compute the number of bits in a byte with vcnt.8, we
- // have to gather the result with pairwise addition (vpaddl) for i16, i32,
- // and i64.
-
// Compute LSB - 1.
SDValue Bits;
if (ElemTy == MVT::i64) {
@@ -5425,32 +5421,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
DAG.getTargetConstant(1, dl, ElemTy));
Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
}
-
- // Count #bits with vcnt.8.
- EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
- SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
- SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
-
- // Gather the #bits with vpaddl (pairwise add.)
- EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
- SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt8);
- if (ElemTy == MVT::i16)
- return Cnt16;
-
- EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
- SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt16);
- if (ElemTy == MVT::i32)
- return Cnt32;
-
- assert(ElemTy == MVT::i64);
- SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
- Cnt32);
- return Cnt64;
+ return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
}
if (!ST->hasV6T2Ops())
@@ -5460,112 +5431,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
-/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
-/// for each 16-bit element from operand, repeated. The basic idea is to
-/// leverage vcnt to get the 8-bit counts, gather and add the results.
-///
-/// Trace for v4i16:
-/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
-/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
-/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
-/// [b0 b1 b2 b3 b4 b5 b6 b7]
-/// +[b1 b0 b3 b2 b5 b4 b7 b6]
-/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
-/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits)
-static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
-
- EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
- SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
- SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
- SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
- return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
-}
-
-/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
-/// bit-count for each 16-bit element from the operand. We need slightly
-/// different sequencing for v4i16 and v8i16 to stay within NEON's available
-/// 64/128-bit registers.
-///
-/// Trace for v4i16:
-/// input = [v0 v1 v2 v3 ] (vi 16-bit element)
-/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
-/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ]
-/// v4i16:Extracted = [k0 k1 k2 k3 ]
-static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
- SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
- if (VT.is64BitVector()) {
- SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
- DAG.getIntPtrConstant(0, DL));
- } else {
- SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
- BitCounts, DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
- }
-}
-
-/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
-/// bit-count for each 32-bit element from the operand. The idea here is
-/// to split the vector into 16-bit elements, leverage the 16-bit count
-/// routine, and then combine the results.
-///
-/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
-/// input = [v0 v1 ] (vi: 32-bit elements)
-/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
-/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ]
-/// [k0 k1 k2 k3 ]
-/// N1 =+[k1 k0 k3 k2 ]
-/// [k0 k2 k1 k3 ]
-/// N2 =+[k1 k3 k0 k2 ]
-/// [k0 k2 k1 k3 ]
-/// Extended =+[k1 k3 k0 k2 ]
-/// [k0 k2 ]
-/// Extracted=+[k1 k3 ]
-///
-static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- SDLoc DL(N);
+ assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+ assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+ VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+ "Unexpected type for custom ctpop lowering");
- EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
+ Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
- SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
- SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
- SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
- SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
- SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+ // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
+ unsigned EltSize = 8;
+ unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+ while (EltSize != VT.getScalarSizeInBits()) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Ops.push_back(Res);
- if (VT.is64BitVector()) {
- SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
- DAG.getIntPtrConstant(0, DL));
- } else {
- SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+ EltSize *= 2;
+ NumElts /= 2;
+ MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+ Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
}
-}
-static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *ST) {
- EVT VT = N->getValueType(0);
-
- assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
- assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
- VT == MVT::v4i16 || VT == MVT::v8i16) &&
- "Unexpected type for custom ctpop lowering");
-
- if (VT.getVectorElementType() == MVT::i32)
- return lowerCTPOP32BitElements(N, DAG);
- else
- return lowerCTPOP16BitElements(N, DAG);
+ return Res;
}
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
OpenPOWER on IntegriCloud