summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorJun Bum Lim <junbuml@codeaurora.org>2015-09-14 16:19:52 +0000
committerJun Bum Lim <junbuml@codeaurora.org>2015-09-14 16:19:52 +0000
commit34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec (patch)
tree5151aba4e563c5cd02743ea5c39d743d184edd63 /llvm/lib/Target
parent11282dcac637499e15ae1e58208d895515f3e8bf (diff)
downloadbcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.tar.gz
bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.zip
Improve ISel using across lane min/max reduction
In vectorized integer min/max reduction code, the final "reduce" step is sub-optimal. In AArch64, this change wll combine : %svn0 = vector_shuffle %0, undef<2,3,u,u> %smax0 = smax %0, svn0 %svn3 = vector_shuffle %smax0, undef<1,u,u,u> %sc = setcc %smax0, %svn3, gt %n0 = extract_vector_elt %sc, #0 %n1 = extract_vector_elt %smax0, #0 %n2 = extract_vector_elt $smax0, #1 %result = select %n0, %n1, n2 becomes : %1 = smaxv %0 %result = extract_vector_elt %1, 0 This change extends r246790. llvm-svn: 247575
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp243
1 files changed, 190 insertions, 53 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3717297ae87..3c1251e2695 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}
-/// Target-specific DAG combine for the across vector reduction.
-/// This function specifically handles the final clean-up step of a vector
-/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
-/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
-/// elements are reduced, where s is an induction variable from 0
-/// to log2(NumVectorElements).
-/// For example,
-/// %1 = vector_shuffle %0, <2,3,u,u>
-/// %2 = add %0, %1
-/// %3 = vector_shuffle %2, <1,u,u,u>
-/// %4 = add %2, %3
-/// %5 = extract_vector_elt %4, 0
-/// becomes :
-/// %0 = uaddv %0
-/// %1 = extract_vector_elt %0, 0
-///
-/// FIXME: Currently this function is implemented and tested specifically
-/// for the add reduction. We could also support other types of across lane
-/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
-/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
-static SDValue
-performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+ unsigned Op,
+ SelectionDAG &DAG) {
+ EVT VTy = OpV->getOperand(0).getValueType();
+ if (!VTy.isVector())
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- // Check if the input vector is fed by the operator we want to handle.
- // We specifically check only ADD for now.
- if (N0->getOpcode() != ISD::ADD)
- return SDValue();
-
- // The vector extract idx must constant zero because we only expect the final
- // result of the reduction is placed in lane 0.
- if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
- return SDValue();
-
- EVT EltTy = N0.getValueType().getVectorElementType();
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
-
- int NumVecElts = N0.getValueType().getVectorNumElements();
+ int NumVecElts = VTy.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
return SDValue();
int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
- SDValue PreOp = N0;
+ SDValue PreOp = OpV;
// Iterate over each step of the across vector reduction.
for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
- // We specifically check ADD for now.
- if (PreOp.getOpcode() != ISD::ADD)
- return SDValue();
SDValue CurOp = PreOp.getOperand(0);
SDValue Shuffle = PreOp.getOperand(1);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
- // Try to swap the 1st and 2nd operand as add is commutative.
+ // Try to swap the 1st and 2nd operand as add and min/max instructions
+ // are commutative.
CurOp = PreOp.getOperand(1);
Shuffle = PreOp.getOperand(0);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
}
+
+ // Check if the input vector is fed by the operator we want to handle,
+ // except the last step; the very first input vector is not necessarily
+ // the same operator we are handling.
+ if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+ return SDValue();
+
// Check if it forms one step of the across vector reduction.
// E.g.,
// %cur = add %1, %0
@@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
PreOp = CurOp;
}
+ unsigned Opcode;
+ switch (Op) {
+ default:
+ llvm_unreachable("Unexpected operator for across vector reduction");
+ case ISD::ADD:
+ Opcode = AArch64ISD::UADDV;
+ break;
+ case ISD::SMAX:
+ Opcode = AArch64ISD::SMAXV;
+ break;
+ case ISD::UMAX:
+ Opcode = AArch64ISD::UMAXV;
+ break;
+ case ISD::SMIN:
+ Opcode = AArch64ISD::SMINV;
+ break;
+ case ISD::UMIN:
+ Opcode = AArch64ISD::UMINV;
+ break;
+ }
SDLoc DL(N);
- return DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
- DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
- DAG.getConstant(0, DL, MVT::i64));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+ DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+/// %smax0 = smax %arr, svn0
+/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax1 = smax %smax0, %svn1
+/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax2 = smax %smax1, svn2
+/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %sc = setcc %smax2, %svn3, gt
+/// %n0 = extract_vector_elt %sc, #0
+/// %n1 = extract_vector_elt %smax2, #0
+/// %n2 = extract_vector_elt $smax2, #1
+/// %result = select %n0, %n1, n2
+/// becomes :
+/// %1 = smaxv %0
+/// %result = extract_vector_elt %1, 0
+/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV.
+/// We could also support other types of across lane reduction available
+/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue IfTrue = N->getOperand(1);
+ SDValue IfFalse = N->getOperand(2);
+
+ // Check if the SELECT merges up the final result of the min/max
+ // from a vector.
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Expect N0 is fed by SETCC.
+ SDValue SetCC = N0.getOperand(0);
+ EVT SetCCVT = SetCC.getValueType();
+ if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+ SetCCVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue VectorOp = SetCC.getOperand(0);
+ unsigned Op = VectorOp->getOpcode();
+ // Check if the input vector is fed by the operator we want to handle.
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
+ return SDValue();
+
+ EVT VTy = VectorOp.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+
+ // Check if extracting from the same vector.
+ // For example,
+ // %sc = setcc %vector, %svn1, gt
+ // %n0 = extract_vector_elt %sc, #0
+ // %n1 = extract_vector_elt %vector, #0
+ // %n2 = extract_vector_elt $vector, #1
+ if (!(VectorOp == IfTrue->getOperand(0) &&
+ VectorOp == IfFalse->getOperand(0)))
+ return SDValue();
+
+ // Check if the condition code is matched with the operator type.
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+ (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+ (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+ (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
+ return SDValue();
+
+ // Expect to check only lane 0 from the vector SETCC.
+ if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
+ cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0)
+ return SDValue();
+
+ // Expect to extract the true value from lane 0.
+ if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) ||
+ cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0)
+ return SDValue();
+
+ // Expect to extract the false value from lane 1.
+ if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) ||
+ cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1)
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+/// %1 = vector_shuffle %0, <2,3,u,u>
+/// %2 = add %0, %1
+/// %3 = vector_shuffle %2, <1,u,u,u>
+/// %4 = add %2, %3
+/// %result = extract_vector_elt %4, 0
+/// becomes :
+/// %0 = uaddv %0
+/// %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check if the input vector is fed by the ADD.
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // The vector extract idx must constant zero because we only expect the final
+ // result of the reduction is placed in lane 0.
+ if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0)
+ return SDValue();
+
+ EVT VTy = N0.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
}
/// Target-specific DAG combine function for NEON load/store intrinsics
@@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT:
- return performSelectCombine(N, DCI);
+ case ISD::SELECT: {
+ SDValue RV = performSelectCombine(N, DCI);
+ if (!RV.getNode())
+ RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+ return RV;
+ }
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::STORE:
@@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::EXTRACT_VECTOR_ELT:
- return performAcrossLaneReductionCombine(N, DAG, Subtarget);
+ return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
OpenPOWER on IntegriCloud