[SystemZ] Improve handling and cost estimates of vector integer div/rem

Enable the DAG optimization that converts vector div/rem with constants into multiply+shift sequences by expanding them early. This is needed since ISD::SMUL_LOHI is 'Custom' lowered on SystemZ, and will therefore not be available to BuildSDIV after legalization. Also provide better cost values for these instructions based on how they will be implemented (a constant divisor is cheaper). Review: Ulrich Weigand https://reviews.llvm.org/D53196 llvm-svn: 345321
author: Jonas Paulsson <paulsson@linux.vnet.ibm.com> 2018-10-25 21:47:22 +0000
committer: Jonas Paulsson <paulsson@linux.vnet.ibm.com> 2018-10-25 21:47:22 +0000
commit: 4645711a8d54eb34d0040dde1748376d0992068b (patch)
tree: 86e3a0cb26d46634422198e718d9dc2cdddd3e65 /llvm/lib
parent: 2f9c42c99453b9f33d5f4311075c357134e84120 (diff)
download: bcm5719-llvm-4645711a8d54eb34d0040dde1748376d0992068b.tar.gz
bcm5719-llvm-4645711a8d54eb34d0040dde1748376d0992068b.zip
3 files changed, 69 insertions, 39 deletions
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 53cd21c4236..d86737e2192 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -527,6 +527,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
   setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::SDIV);
+  setTargetDAGCombine(ISD::UDIV);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -5664,6 +5668,23 @@ SDValue SystemZTargetLowering::combineGET_CCMASK(
   return Select->getOperand(4);
 }
 
+SDValue SystemZTargetLowering::combineIntDIVREM(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // In the case where the divisor is a vector of constants a cheaper
+  // sequence of instructions can replace the divide. BuildSDIV is called to
+  // do this during DAG combining, but it only succeeds when it can build a
+  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
+  // since it is not Legal but Custom it can only happen before
+  // legalization. Therefore we must scalarize this early before Combine
+  // 1. For widened vectors, this is already the result of type legalization.
+  if (VT.isVector() && isTypeLegal(VT) &&
+      DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
+    return DAG.UnrollVectorOp(N);
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   switch(N->getOpcode()) {
@@ -5681,6 +5702,10 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
   case SystemZISD::GET_CCMASK:  return combineGET_CCMASK(N, DCI);
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:               return combineIntDIVREM(N, DCI);
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 267e31a8521..4b6be9bff0a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -605,6 +605,7 @@ private:
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
 
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 1eaeb9699bf..f52c9ca6e49 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -362,27 +362,33 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 
   unsigned ScalarBits = Ty->getScalarSizeInBits();
 
-  // Div with a constant which is a power of 2 will be converted by
-  // DAGCombiner to use shifts. With vector shift-element instructions, a
-  // vector sdiv costs about as much as a scalar one.
-  const unsigned SDivCostEstimate = 4;
-  bool SDivPow2 = false;
-  bool UDivPow2 = false;
-  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
-      Args.size() == 2) {
-    const ConstantInt *CI = nullptr;
+  // There are three cases of division and remainder: Dividing with a register
+  // needs a divide instruction. A divisor which is a power of two constant
+  // can be implemented with a sequence of shifts. Any other constant needs a
+  // multiply and shifts.
+  const unsigned DivInstrCost = 20;
+  const unsigned DivMulSeqCost = 10;
+  const unsigned SDivPow2Cost = 4;
+
+  bool SignedDivRem =
+      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+  bool UnsignedDivRem =
+      Opcode == Instruction::UDiv || Opcode == Instruction::URem;
+
+  // Check for a constant divisor.
+  bool DivRemConst = false;
+  bool DivRemConstPow2 = false;
+  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
-      if (C->getType()->isVectorTy())
-        CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+      const ConstantInt *CVal =
+          (C->getType()->isVectorTy()
+               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
+               : dyn_cast<const ConstantInt>(C));
+      if (CVal != nullptr &&
+          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+        DivRemConstPow2 = true;
       else
-        CI = dyn_cast<const ConstantInt>(C);
-    }
-    if (CI != nullptr &&
-        (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
-      if (Opcode == Instruction::SDiv)
-        SDivPow2 = true;
-      else
-        UDivPow2 = true;
+        DivRemConst = true;
     }
   }
 
@@ -394,18 +400,19 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     // These vector operations are custom handled, but are still supported
     // with one instruction per vector, regardless of element size.
     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
-        Opcode == Instruction::AShr || UDivPow2) {
+        Opcode == Instruction::AShr) {
       return NumVectors;
     }
 
-    if (SDivPow2)
-      return (NumVectors * SDivCostEstimate);
-
-    // Temporary hack: disable high vectorization factors with integer
-    // division/remainder, which will get scalarized and handled with GR128
-    // registers. The mischeduler is not clever enough to avoid spilling yet.
-    if ((Opcode == Instruction::UDiv || Opcode == Instruction::SDiv ||
-         Opcode == Instruction::URem || Opcode == Instruction::SRem) && VF > 4)
+    if (DivRemConstPow2)
+      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
+    if (DivRemConst)
+      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
+      // Temporary hack: disable high vectorization factors with integer
+      // division/remainder, which will get scalarized and handled with
+      // GR128 registers. The mischeduler is not clever enough to avoid
+      // spilling yet.
       return 1000;
 
     // These FP operations are supported with a single vector instruction for
@@ -471,19 +478,16 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return 7; // 2 * ipm sequences ; xor ; shift ; compare
     }
 
-    if (UDivPow2)
-      return 1;
-    if (SDivPow2)
-      return SDivCostEstimate;
-
-    // An extra extension for narrow types is needed.
-    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+    if (DivRemConstPow2)
+      return (SignedDivRem ? SDivPow2Cost : 1);
+    if (DivRemConst)
+      return DivMulSeqCost;
+    if (SignedDivRem)
       // sext of op(s) for narrow types
-      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
-
-    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+      return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
+    if (UnsignedDivRem)
       // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
-      return (ScalarBits < 32 ? 4 : 2);
+      return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
   }
 
   // Fallback to the default implementation.
author	Jonas Paulsson <paulsson@linux.vnet.ibm.com>	2018-10-25 21:47:22 +0000
committer	Jonas Paulsson <paulsson@linux.vnet.ibm.com>	2018-10-25 21:47:22 +0000
commit	4645711a8d54eb34d0040dde1748376d0992068b (patch)
tree	86e3a0cb26d46634422198e718d9dc2cdddd3e65 /llvm/lib
parent	2f9c42c99453b9f33d5f4311075c357134e84120 (diff)
download	bcm5719-llvm-4645711a8d54eb34d0040dde1748376d0992068b.tar.gz bcm5719-llvm-4645711a8d54eb34d0040dde1748376d0992068b.zip