diff options
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 17 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUISelLowering.h | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUInstrInfo.td | 11 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/AMDGPUIntrinsics.td | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/SIISelLowering.cpp | 118 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/SIISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/R600/SIInstructions.td | 16 | 
7 files changed, 170 insertions, 5 deletions
| diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 0f1a77a7d64..92ab174695e 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -786,6 +786,18 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,        return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,                           Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); +    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: +      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + +    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: +      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + +    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: +      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + +    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: +      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); +      case AMDGPUIntrinsic::AMDGPU_bfe_i32:        return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,                           Op.getOperand(1), @@ -1256,7 +1268,6 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,    FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,                          DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32    return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); -  }  SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, @@ -1582,6 +1593,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(SAMPLEB)    NODE_NAME_CASE(SAMPLED)    NODE_NAME_CASE(SAMPLEL) +  NODE_NAME_CASE(CVT_F32_UBYTE0) +  NODE_NAME_CASE(CVT_F32_UBYTE1) +  NODE_NAME_CASE(CVT_F32_UBYTE2) +  NODE_NAME_CASE(CVT_F32_UBYTE3)    NODE_NAME_CASE(STORE_MSKOR)    NODE_NAME_CASE(TBUFFER_STORE_FORMAT)    } diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index ee98b3bfd33..f50afababc0 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -205,6 +205,12 @@ enum {    SAMPLEB,    SAMPLED,    SAMPLEL, + +  // These cvt_f32_ubyte* nodes need to remain consecutive and in order. +  CVT_F32_UBYTE0, +  CVT_F32_UBYTE1, +  CVT_F32_UBYTE2, +  CVT_F32_UBYTE3,    FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,    STORE_MSKOR,    LOAD_CONSTANT, diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td index f96dbb4d8a1..03137800d65 100644 --- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td @@ -59,6 +59,17 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,    [SDNPCommutative, SDNPAssociative]  >; + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", +  SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", +  SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", +  SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", +  SDTIntToFPOp, []>; + +  // urecip - This operation is a helper for integer division, it returns the  // result of 1 / a as a fractional unsigned integer.  // out = (2^32 / a) + e diff --git a/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/llvm/lib/Target/R600/AMDGPUIntrinsics.td index 9ad5e72d3f0..5790357bf1d 100644 --- a/llvm/lib/Target/R600/AMDGPUIntrinsics.td +++ b/llvm/lib/Target/R600/AMDGPUIntrinsics.td @@ -53,6 +53,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {    def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;    def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;    def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +  def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; +  def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; +  def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; +  def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;    def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;    def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;    def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp index 608aad2d3ca..846aeb63093 100644 --- a/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/llvm/lib/Target/R600/SIISelLowering.cpp @@ -24,6 +24,7 @@  #include "llvm/CodeGen/MachineRegisterInfo.h"  #include "llvm/CodeGen/SelectionDAG.h"  #include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h"  using namespace llvm; @@ -214,6 +215,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :    setTargetDAGCombine(ISD::SELECT_CC);    setTargetDAGCombine(ISD::SETCC); +  setTargetDAGCombine(ISD::UINT_TO_FP); +    setSchedulingPreference(Sched::RegPressure);  } @@ -979,6 +982,96 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {  // Custom DAG optimizations  //===----------------------------------------------------------------------===// +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, +                                                     DAGCombinerInfo &DCI) { +  EVT VT = N->getValueType(0); +  EVT ScalarVT = VT.getScalarType(); +  if (ScalarVT != MVT::f32) +    return SDValue(); + +  SelectionDAG &DAG = DCI.DAG; +  SDLoc DL(N); + +  SDValue Src = N->getOperand(0); +  EVT SrcVT = Src.getValueType(); + +  // TODO: We could try to match extracting the higher bytes, which would be +  // easier if i8 vectors weren't promoted to i32 vectors, particularly after +  // types are legalized. v4i8 -> v4f32 is probably the only case to worry +  // about in practice. +  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { +    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { +      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); +      DCI.AddToWorklist(Cvt.getNode()); +      return Cvt; +    } +  } + +  // We are primarily trying to catch operations on illegal vector types +  // before they are expanded. +  // For scalars, we can use the more flexible method of checking masked bits +  // after legalization. +  if (!DCI.isBeforeLegalize() || +      !SrcVT.isVector() || +      SrcVT.getVectorElementType() != MVT::i8) { +    return SDValue(); +  } + +  assert(DCI.isBeforeLegalize() && "Unexpected legal type"); + +  // Weird sized vectors are a pain to handle, but we know 3 is really the same +  // size as 4. +  unsigned NElts = SrcVT.getVectorNumElements(); +  if (!SrcVT.isSimple() && NElts != 3) +    return SDValue(); + +  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to +  // prevent a mess from expanding to v4i32 and repacking. +  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { +    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); +    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); +    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + +    LoadSDNode *Load = cast<LoadSDNode>(Src); +    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, +                                     Load->getChain(), +                                     Load->getBasePtr(), +                                     LoadVT, +                                     Load->getMemOperand()); + +    // Make sure successors of the original load stay after it by updating +    // them to use the new Chain. +    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + +    SmallVector<SDValue, 4> Elts; +    if (RegVT.isVector()) +      DAG.ExtractVectorElements(NewLoad, Elts); +    else +      Elts.push_back(NewLoad); + +    SmallVector<SDValue, 4> Ops; + +    unsigned EltIdx = 0; +    for (SDValue Elt : Elts) { +      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); +      for (unsigned I = 0; I < ComponentsInElt; ++I) { +        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; +        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); +        DCI.AddToWorklist(Cvt.getNode()); +        Ops.push_back(Cvt); +      } + +      ++EltIdx; +    } + +    assert(Ops.size() == NElts); + +    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); +  } + +  return SDValue(); +} +  SDValue SITargetLowering::PerformDAGCombine(SDNode *N,                                              DAGCombinerInfo &DCI) const {    SelectionDAG &DAG = DCI.DAG; @@ -1020,6 +1113,31 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,        }        break;      } + +  case AMDGPUISD::CVT_F32_UBYTE0: +  case AMDGPUISD::CVT_F32_UBYTE1: +  case AMDGPUISD::CVT_F32_UBYTE2: +  case AMDGPUISD::CVT_F32_UBYTE3: { +    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + +    SDValue Src = N->getOperand(0); +    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + +    APInt KnownZero, KnownOne; +    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), +                                          !DCI.isBeforeLegalizeOps()); +    const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +    if (TLO.ShrinkDemandedConstant(Src, Demanded) || +        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { +      DCI.CommitTargetLoweringOpt(TLO); +    } + +    break; +  } + +  case ISD::UINT_TO_FP: { +    return performUCharToFloatCombine(N, DCI); +  }    }    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); diff --git a/llvm/lib/Target/R600/SIISelLowering.h b/llvm/lib/Target/R600/SIISelLowering.h index d9bad4e4acd..2f97a9ada8f 100644 --- a/llvm/lib/Target/R600/SIISelLowering.h +++ b/llvm/lib/Target/R600/SIISelLowering.h @@ -43,6 +43,9 @@ class SITargetLowering : public AMDGPUTargetLowering {    void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;    MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; +  static SDValue performUCharToFloatCombine(SDNode *N, +                                            DAGCombinerInfo &DCI); +  public:    SITargetLowering(TargetMachine &tm);    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index 7d37d105672..9af1a706175 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -978,10 +978,18 @@ defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",  defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",    [(set f64:$dst, (fextend f32:$src0))]  >; -//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; -//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; -//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; -//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; +defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", +  [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))] +>; +defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", +  [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))] +>; +defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", +  [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))] +>; +defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", +  [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))] +>;  defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",    [(set i32:$dst, (fp_to_uint f64:$src0))]  >; | 

