[SystemZ] Add CodeGen support for v4f32

The architecture doesn't really have any native v4f32 operations except v4f32->v2f64 and v2f64->v4f32 conversions, with only half of the v4f32 elements being used. Even so, using vector registers for <4 x float> and scalarising individual operations is much better than generating completely scalar code, since there's much less register pressure. It's also more efficient to do v4f32 comparisons by extending to 2 v2f64s, comparing those, then packing the result. This particularly helps with llvmpipe. Based on a patch by Richard Sandiford. llvm-svn: 236523
author: Ulrich Weigand <ulrich.weigand@de.ibm.com> 2015-05-05 19:27:45 +0000
committer: Ulrich Weigand <ulrich.weigand@de.ibm.com> 2015-05-05 19:27:45 +0000
commit: 80b3af7ab3f8e76507cc4491be1460f1b1d8adb2 (patch)
tree: b9f2252bf5fb13308c25e7f05f918dfdae76aa77 /llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
parent: cd808237b24c7d6d0bb7ddf577dba37c31a06a50 (diff)
download: bcm5719-llvm-80b3af7ab3f8e76507cc4491be1460f1b1d8adb2.tar.gz
bcm5719-llvm-80b3af7ab3f8e76507cc4491be1460f1b1d8adb2.zip
1 files changed, 133 insertions, 13 deletions
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5f547439c9a..391cb8c6fc9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -101,6 +101,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
   }
 
@@ -275,7 +276,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
     if (isTypeLegal(VT)) {
       // These operations are legal for anything that can be stored in a
       // vector register, even if there is no native support for the format
-      // as such.
+      // as such.  In particular, we can do these for v4f32 even though there
+      // are no specific instructions for that format.
       setOperationAction(ISD::LOAD, VT, Legal);
       setOperationAction(ISD::STORE, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
@@ -365,11 +367,14 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
   // Handle floating-point vector types.
   if (Subtarget.hasVector()) {
     // Scalar-to-vector conversion is just a subreg.
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
 
     // Some insertions and extractions can be done directly but others
     // need to go via integers.
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 
     // These operations have direct equivalents.
@@ -407,8 +412,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
 
   // We have 64-bit FPR<->GPR moves, but need special handling for
   // 32-bit forms.
-  setOperationAction(ISD::BITCAST, MVT::i32, Custom);
-  setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  if (!Subtarget.hasVector()) {
+    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  }
 
   // VASTART and VACOPY need to deal with the SystemZ-specific varargs
   // structure, but VAEND is a no-op.
@@ -420,6 +427,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::FP_ROUND);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -855,6 +863,7 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
       case MVT::v8i16:
       case MVT::v4i32:
       case MVT::v2i64:
+      case MVT::v4f32:
       case MVT::v2f64:
         RC = &SystemZ::VR128BitRegClass;
         break;
@@ -1977,6 +1986,33 @@ static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
   return 0;
 }
 
+// Return a v2f64 that contains the extended form of elements Start and Start+1
+// of v4f32 value Op.
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL,
+                                  SDValue Op) {
+  int Mask[] = { Start, -1, Start + 1, -1 };
+  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
+  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
+}
+
+// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
+// producing a result of type VT.
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL,
+                            EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
+  // There is no hardware support for v4f32, so extend the vector into
+  // two v2f64s and compare those.
+  if (CmpOp0.getValueType() == MVT::v4f32) {
+    SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
+    SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
+    SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
+    SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
+    SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
+    SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
+    return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
+  }
+  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+}
+
 // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
 // an integer mask of type VT.
 static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
@@ -1991,8 +2027,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
     Invert = true;
   case ISD::SETO: {
     assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GE = DAG.getNode(SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
     break;
   }
@@ -2002,8 +2038,8 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
     Invert = true;
   case ISD::SETONE: {
     assert(IsFP && "Unexpected integer comparison");
-    SDValue LT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
-    SDValue GT = DAG.getNode(SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
     break;
   }
@@ -2013,11 +2049,11 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT,
     // there are no cases where both work.
   default:
     if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-      Cmp = DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
     else {
       CC = ISD::getSetCCSwappedOperands(CC);
       if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
-        Cmp = DAG.getNode(Opcode, DL, VT, CmpOp1, CmpOp0);
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
       else
         llvm_unreachable("Unhandled comparison");
     }
@@ -3621,6 +3657,31 @@ static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT,
   if (VT == MVT::v2f64)
     return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
 
+  // Build v4f32 values directly from the FPRs:
+  //
+  //   <Axxx> <Bxxx> <Cxxxx> <Dxxx>
+  //         V              V         VMRHF
+  //      <ABxx>         <CDxx>
+  //                V                 VMRHG
+  //              <ABCD>
+  if (VT == MVT::v4f32) {
+    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+    // Avoid unnecessary undefs by reusing the other operand.
+    if (Op01.getOpcode() == ISD::UNDEF)
+      Op01 = Op23;
+    else if (Op23.getOpcode() == ISD::UNDEF)
+      Op23 = Op01;
+    // Merging identical replications is a no-op.
+    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+      return Op01;
+    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                             DL, MVT::v2i64, Op01, Op23);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
   // Collect the constant terms.
   SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
   SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
@@ -3796,10 +3857,11 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
   SDValue Op2 = Op.getOperand(2);
   EVT VT = Op.getValueType();
 
-  // Insertions into constant indices can be done using VPDI.  However,
-  // if the inserted value is a bitcast or a constant then it's better
-  // to use GPRs, as below.
-  if (Op1.getOpcode() != ISD::BITCAST &&
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
       Op1.getOpcode() != ISD::ConstantFP &&
       Op2.getOpcode() == ISD::Constant) {
     uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
@@ -4065,6 +4127,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(VFCMPE);
     OPCODE(VFCMPH);
     OPCODE(VFCMPHE);
+    OPCODE(VEXTEND);
+    OPCODE(VROUND);
     OPCODE(ATOMIC_SWAPW);
     OPCODE(ATOMIC_LOADW_ADD);
     OPCODE(ATOMIC_LOADW_SUB);
@@ -4265,6 +4329,19 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
   }
+  // (z_merge_high 0, 0) -> 0.  This is mostly useful for using VLLEZF
+  // for v4f32.
+  if (Opcode == SystemZISD::MERGE_HIGH) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    if (Op0 == Op1) {
+      if (Op0.getOpcode() == ISD::BITCAST)
+        Op0 = Op0.getOperand(0);
+      if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+          cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0)
+        return Op1;
+    }
+  }
   // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
   // for the extraction to be done on a vMiN value, so that we can use VSTE.
   // If X has wider elements then convert it to:
@@ -4299,6 +4376,49 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
       N->getOperand(0) == N->getOperand(1))
     return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
                        N->getOperand(0));
+  // (fround (extract_vector_elt X 0))
+  // (fround (extract_vector_elt X 1)) ->
+  // (extract_vector_elt (VROUND X) 0)
+  // (extract_vector_elt (VROUND X) 1)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  if (Opcode == ISD::FP_ROUND) {
+    SDValue Op0 = N->getOperand(0);
+    if (N->getValueType(0) == MVT::f32 &&
+        Op0.hasOneUse() &&
+        Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+        Op0.getOperand(1).getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+      SDValue Vec = Op0.getOperand(0);
+      for (auto *U : Vec->uses()) {
+        if (U != Op0.getNode() &&
+            U->hasOneUse() &&
+            U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+            U->getOperand(0) == Vec &&
+            U->getOperand(1).getOpcode() == ISD::Constant &&
+            cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+          SDValue OtherRound = SDValue(*U->use_begin(), 0);
+          if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+              OtherRound.getOperand(0) == SDValue(U, 0) &&
+              OtherRound.getValueType() == MVT::f32) {
+            SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+                                         MVT::v4f32, Vec);
+            DCI.AddToWorklist(VRound.getNode());
+            SDValue Extract1 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+                          VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+            DCI.AddToWorklist(Extract1.getNode());
+            DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+            SDValue Extract0 =
+              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+                          VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+            return Extract0;
+          }
+        }
+      }
+    }
+  }
   return SDValue();
 }
author	Ulrich Weigand <ulrich.weigand@de.ibm.com>	2015-05-05 19:27:45 +0000
committer	Ulrich Weigand <ulrich.weigand@de.ibm.com>	2015-05-05 19:27:45 +0000
commit	80b3af7ab3f8e76507cc4491be1460f1b1d8adb2 (patch)
tree	b9f2252bf5fb13308c25e7f05f918dfdae76aa77 /llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
parent	cd808237b24c7d6d0bb7ddf577dba37c31a06a50 (diff)
download	bcm5719-llvm-80b3af7ab3f8e76507cc4491be1460f1b1d8adb2.tar.gz bcm5719-llvm-80b3af7ab3f8e76507cc4491be1460f1b1d8adb2.zip