Improve ISel using across lane min/max reduction

In vectorized integer min/max reduction code, the final "reduce" step is sub-optimal. In AArch64, this change wll combine : %svn0 = vector_shuffle %0, undef<2,3,u,u> %smax0 = smax %0, svn0 %svn3 = vector_shuffle %smax0, undef<1,u,u,u> %sc = setcc %smax0, %svn3, gt %n0 = extract_vector_elt %sc, #0 %n1 = extract_vector_elt %smax0, #0 %n2 = extract_vector_elt $smax0, #1 %result = select %n0, %n1, n2 becomes : %1 = smaxv %0 %result = extract_vector_elt %1, 0 This change extends r246790. llvm-svn: 247575
author: Jun Bum Lim <junbuml@codeaurora.org> 2015-09-14 16:19:52 +0000
committer: Jun Bum Lim <junbuml@codeaurora.org> 2015-09-14 16:19:52 +0000
commit: 34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec (patch)
tree: 5151aba4e563c5cd02743ea5c39d743d184edd63 /llvm/lib/Target
parent: 11282dcac637499e15ae1e58208d895515f3e8bf (diff)
download: bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.tar.gz
bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.zip
1 files changed, 190 insertions, 53 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3717297ae87..3c1251e2695 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
-/// Target-specific DAG combine for the across vector reduction.
-/// This function specifically handles the final clean-up step of a vector
-/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
-/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
-/// elements are reduced, where s is an induction variable from 0
-/// to log2(NumVectorElements).
-/// For example,
-///   %1 = vector_shuffle %0, <2,3,u,u>
-///   %2 = add %0, %1
-///   %3 = vector_shuffle %2, <1,u,u,u>
-///   %4 = add %2, %3
-///   %5 = extract_vector_elt %4, 0
-/// becomes :
-///   %0 = uaddv %0
-///   %1 = extract_vector_elt %0, 0
-///
-/// FIXME: Currently this function is implemented and tested specifically
-/// for the add reduction. We could also support other types of across lane
-/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
-/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
-static SDValue
-performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
-                                  const AArch64Subtarget *Subtarget) {
-  if (!Subtarget->hasNEON())
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+                                                     unsigned Op,
+                                                     SelectionDAG &DAG) {
+  EVT VTy = OpV->getOperand(0).getValueType();
+  if (!VTy.isVector())
     return SDValue();
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
 
-  // Check if the input vector is fed by the operator we want to handle.
-  // We specifically check only ADD for now.
-  if (N0->getOpcode() != ISD::ADD)
-    return SDValue();
-
-  // The vector extract idx must constant zero because we only expect the final
-  // result of the reduction is placed in lane 0.
-  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
-    return SDValue();
-
-  EVT EltTy = N0.getValueType().getVectorElementType();
-  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
-    return SDValue();
-
-  int NumVecElts = N0.getValueType().getVectorNumElements();
+  int NumVecElts = VTy.getVectorNumElements();
   if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
     return SDValue();
 
   int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
-  SDValue PreOp = N0;
+  SDValue PreOp = OpV;
   // Iterate over each step of the across vector reduction.
   for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
-    // We specifically check ADD for now.
-    if (PreOp.getOpcode() != ISD::ADD)
-      return SDValue();
     SDValue CurOp = PreOp.getOperand(0);
     SDValue Shuffle = PreOp.getOperand(1);
     if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
-      // Try to swap the 1st and 2nd operand as add is commutative.
+      // Try to swap the 1st and 2nd operand as add and min/max instructions
+      // are commutative.
       CurOp = PreOp.getOperand(1);
       Shuffle = PreOp.getOperand(0);
       if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
         return SDValue();
     }
+
+    // Check if the input vector is fed by the operator we want to handle,
+    // except the last step; the very first input vector is not necessarily
+    // the same operator we are handling.
+    if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+      return SDValue();
+
     // Check if it forms one step of the across vector reduction.
     // E.g.,
     //   %cur = add %1, %0
@@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
 
     PreOp = CurOp;
   }
+  unsigned Opcode;
+  switch (Op) {
+  default:
+    llvm_unreachable("Unexpected operator for across vector reduction");
+  case ISD::ADD:
+    Opcode = AArch64ISD::UADDV;
+    break;
+  case ISD::SMAX:
+    Opcode = AArch64ISD::SMAXV;
+    break;
+  case ISD::UMAX:
+    Opcode = AArch64ISD::UMAXV;
+    break;
+  case ISD::SMIN:
+    Opcode = AArch64ISD::SMINV;
+    break;
+  case ISD::UMIN:
+    Opcode = AArch64ISD::UMINV;
+    break;
+  }
   SDLoc DL(N);
-  return DAG.getNode(
-      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
-      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
-      DAG.getConstant(0, DL, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                     DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+///   svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+///   %smax0 = smax %arr, svn0
+///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax1 = smax %smax0, %svn1
+///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax2 = smax %smax1, svn2
+///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %sc = setcc %smax2, %svn3, gt
+///   %n0 = extract_vector_elt %sc, #0
+///   %n1 = extract_vector_elt %smax2, #0
+///   %n2 = extract_vector_elt $smax2, #1
+///   %result = select %n0, %n1, n2
+///     becomes :
+///   %1 = smaxv %0
+///   %result = extract_vector_elt %1, 0
+/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV.
+/// We could also support other types of across lane reduction available
+/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+
+  // Check if the SELECT merges up the final result of the min/max
+  // from a vector.
+  if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Expect N0 is fed by SETCC.
+  SDValue SetCC = N0.getOperand(0);
+  EVT SetCCVT = SetCC.getValueType();
+  if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+      SetCCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue VectorOp = SetCC.getOperand(0);
+  unsigned Op = VectorOp->getOpcode();
+  // Check if the input vector is fed by the operator we want to handle.
+  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
+    return SDValue();
+
+  EVT VTy = VectorOp.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  // Check if extracting from the same vector.
+  // For example,
+  //   %sc = setcc %vector, %svn1, gt
+  //   %n0 = extract_vector_elt %sc, #0
+  //   %n1 = extract_vector_elt %vector, #0
+  //   %n2 = extract_vector_elt $vector, #1
+  if (!(VectorOp == IfTrue->getOperand(0) &&
+        VectorOp == IfFalse->getOperand(0)))
+    return SDValue();
+
+  // Check if the condition code is matched with the operator type.
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+      (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+      (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
+    return SDValue();
+
+  // Expect to check only lane 0 from the vector SETCC.
+  if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
+      cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the true value from lane 0.
+  if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) ||
+      cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0)
+    return SDValue();
+
+  // Expect to extract the false value from lane 1.
+  if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) ||
+      cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %result = extract_vector_elt %4, 0
+/// becomes :
+///   %0 = uaddv %0
+///   %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the ADD.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract idx must constant zero because we only expect the final
+  // result of the reduction is placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0)
+    return SDValue();
+
+  EVT VTy = N0.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
 }
 
 /// Target-specific DAG combine function for NEON load/store intrinsics
@@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::SELECT:
-    return performSelectCombine(N, DCI);
+  case ISD::SELECT: {
+    SDValue RV = performSelectCombine(N, DCI);
+    if (!RV.getNode())
+      RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+    return RV;
+  }
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::STORE:
@@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
-    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
+    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
author	Jun Bum Lim <junbuml@codeaurora.org>	2015-09-14 16:19:52 +0000
committer	Jun Bum Lim <junbuml@codeaurora.org>	2015-09-14 16:19:52 +0000
commit	34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec (patch)
tree	5151aba4e563c5cd02743ea5c39d743d184edd63 /llvm/lib/Target
parent	11282dcac637499e15ae1e58208d895515f3e8bf (diff)
download	bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.tar.gz bcm5719-llvm-34b9bd0435a1e8f97bf97e08a2da37d8c01e80ec.zip