diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-11-23 13:50:27 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-11-23 13:50:27 +0000 |
| commit | 90accbc5d98cd7f475dc8034cf52dedffdcecd64 (patch) | |
| tree | 95267f83d02cc2959d16504332d3624bf6960b5f /llvm/lib | |
| parent | a3251bf24c8bc0ebe8fb09e0d59dbc70fa706859 (diff) | |
| download | bcm5719-llvm-90accbc5d98cd7f475dc8034cf52dedffdcecd64.tar.gz bcm5719-llvm-90accbc5d98cd7f475dc8034cf52dedffdcecd64.zip | |
[X86][SSE] Use (V)PHMINPOSUW for vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions (PR32841)
(V)PHMINPOSUW determines the UMIN element in a v8i16 input; with suitable bit flipping of the input and the result, it can also be used for the SMAX/SMIN/UMAX cases.
This patch matches vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions and reduces the input down to a v8i16 vector before calling (V)PHMINPOSUW.
A later patch will use this for v16i8 reductions as well (PR32841).
Differential Revision: https://reviews.llvm.org/D39729
llvm-svn: 318917
Diffstat (limited to 'llvm/lib')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 65 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 3 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 12 |
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 1 |
5 files changed, 77 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 658302b7de6..6251ef64a37 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25073,6 +25073,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; + case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -30326,6 +30327,66 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } +// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW. +static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Bail without SSE41. + if (!Subtarget.hasSSE41()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + if (ExtractVT != MVT::i16) + return SDValue(); + + // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. + unsigned BinOp; + SDValue Src = matchBinOpReduction( + Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); + if (!Src) + return SDValue(); + + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getScalarType(); + if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0) + return SDValue(); + + SDLoc DL(Extract); + SDValue MinPos = Src; + + // First, reduce the source down to 128-bit, applying BinOp to lo/hi. 
+ while (SrcVT.getSizeInBits() > 128) { + unsigned NumElts = SrcVT.getVectorNumElements(); + unsigned NumSubElts = NumElts / 2; + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); + unsigned SubSizeInBits = SrcVT.getSizeInBits(); + SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); + SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); + MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); + } + assert(SrcVT == MVT::v8i16 && "Unexpected value type"); + + // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask + // to flip the value accordingly. + SDValue Mask; + if (BinOp == ISD::SMAX) + Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT); + else if (BinOp == ISD::SMIN) + Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT); + else if (BinOp == ISD::UMAX) + Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT); + + if (Mask) + MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + + MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos); + + if (Mask) + MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, + DAG.getIntPtrConstant(0, DL)); +} + // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, @@ -30633,6 +30694,10 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; + // Attempt to replace min/max v8i16 reductions with PHMINPOSUW. + if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) + return MinMax; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. 
if (SrcVT != MVT::v4i32) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6576bceffd2..3c831001e9a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -335,6 +335,9 @@ namespace llvm { // Vector integer comparisons, the result is in a mask vector. PCMPEQM, PCMPGTM, + // v8i16 Horizontal minimum and position. + PHMINPOS, + MULTISHIFT, /// Vector comparison generating mask bits for fp and diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index cb7c4306209..e29c6b19bfd 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -182,6 +182,9 @@ def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; +def X86phminpos: SDNode<"X86ISD::PHMINPOS", + SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>; + def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVec<2>, SDTCisInt<0>, SDTCisInt<1>]>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index daab153bb2c..dc52f867dd5 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6188,22 +6188,20 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { Sched<[WriteFAddLd]>, XS; } - - // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, PatFrag ld_frag, + SDNode OpNode, PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, + [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, Sched<[Sched]>; def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId128 (bitconvert (ld_frag addr:$src))))]>, + (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, Sched<[Sched.Folded]>; } @@ -6211,10 +6209,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", - int_x86_sse41_phminposuw, loadv2i64, + X86phminpos, loadv2i64, WriteVecIMul>, VEX, VEX_WIG; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw, memopv2i64, + X86phminpos, memopv2i64, WriteVecIMul>; /// SS48I_binop_rm - Simple SSE41 binary operator. diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 6f39568a808..d611f02a249 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1679,6 +1679,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), |

