diff options
author | Jun Bum Lim <junbuml@codeaurora.org> | 2015-09-14 16:19:52 +0000 |
---|---|---|
committer | Jun Bum Lim <junbuml@codeaurora.org> | 2015-09-14 16:19:52 +0000 |
commit | 34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec (patch) | |
tree | 5151aba4e563c5cd02743ea5c39d743d184edd63 /llvm/lib/Target | |
parent | 11282dcac637499e15ae1e58208d895515f3e8bf (diff) | |
download | bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.tar.gz bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.zip |
Improve ISel using across lane min/max reduction
In vectorized integer min/max reduction code, the final "reduce" step
is sub-optimal. In AArch64, this change wll combine :
%svn0 = vector_shuffle %0, undef<2,3,u,u>
%smax0 = smax %0, svn0
%svn3 = vector_shuffle %smax0, undef<1,u,u,u>
%sc = setcc %smax0, %svn3, gt
%n0 = extract_vector_elt %sc, #0
%n1 = extract_vector_elt %smax0, #0
%n2 = extract_vector_elt $smax0, #1
%result = select %n0, %n1, n2
becomes :
%1 = smaxv %0
%result = extract_vector_elt %1, 0
This change extends r246790.
llvm-svn: 247575
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 243 |
1 files changed, 190 insertions, 53 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3717297ae87..3c1251e2695 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } -/// Target-specific DAG combine for the across vector reduction. -/// This function specifically handles the final clean-up step of a vector -/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern, -/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s) -/// elements are reduced, where s is an induction variable from 0 -/// to log2(NumVectorElements). -/// For example, -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %5 = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %1 = extract_vector_elt %0, 0 -/// -/// FIXME: Currently this function is implemented and tested specifically -/// for the add reduction. We could also support other types of across lane -/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV, -/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV. -static SDValue -performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) +/// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - // Check if the input vector is fed by the operator we want to handle. - // We specifically check only ADD for now. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue()) - return SDValue(); - - EVT EltTy = N0.getValueType().getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - int NumVecElts = N0.getValueType().getVectorNumElements(); + int NumVecElts = VTy.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) return SDValue(); int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = N0; + SDValue PreOp = OpV; // Iterate over each step of the across vector reduction. for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - // We specifically check ADD for now. - if (PreOp.getOpcode() != ISD::ADD) - return SDValue(); SDValue CurOp = PreOp.getOperand(0); SDValue Shuffle = PreOp.getOperand(1); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add is commutative. + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. CurOp = PreOp.getOperand(1); Shuffle = PreOp.getOperand(0); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + // Check if it forms one step of the across vector reduction. // E.g., // %cur = add %1, %0 @@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG, PreOp = CurOp; } + unsigned Opcode; + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + } SDLoc DL(N); - return DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV. +/// We could also support other types of across lane reduction available +/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV. +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isa<ConstantSDNode>(N0.getOperand(1)) || + cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) || + cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) || + cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); } /// Target-specific DAG combine function for NEON load/store intrinsics @@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::STORE: @@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::EXTRACT_VECTOR_ELT: - return performAcrossLaneReductionCombine(N, DAG, Subtarget); + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { |