diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-09-20 21:01:24 +0000 | 
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-09-20 21:01:24 +0000 | 
| commit | 8cbb4884a550f71346f65cfee176e8af0577b184 (patch) | |
| tree | e12596b30994ed9d80849ab0c413c4016abdc4af /llvm/lib | |
| parent | aff96d907b562fb2a9b0934553dadb21b58f261c (diff) | |
| download | bcm5719-llvm-8cbb4884a550f71346f65cfee176e8af0577b184.tar.gz bcm5719-llvm-8cbb4884a550f71346f65cfee176e8af0577b184.zip | |
AMDGPU: Start selecting v_mad_mixhi_f16
llvm-svn: 313814
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 46 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 51 | 
4 files changed, 97 insertions, 2 deletions
| diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1a30a163e6d..9fc38aeefaa 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -775,6 +775,7 @@ static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {    return true;  } +// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.  bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {    const MachineOperand *ClampSrc = isClamp(MI);    if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 99f7badde71..6a751d71db2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -502,6 +502,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);    setTargetDAGCombine(ISD::ZERO_EXTEND);    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); +  setTargetDAGCombine(ISD::BUILD_VECTOR);    // All memory operations. Some folding on the pointer operand is done to help    // matching the constant offsets in the addressing modes. @@ -5853,7 +5854,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(    SDNode *N, DAGCombinerInfo &DCI) const {    SDValue Vec = N->getOperand(0); -  SelectionDAG &DAG= DCI.DAG; +  SelectionDAG &DAG = DCI.DAG;    if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {      SDLoc SL(N);      EVT EltVT = N->getValueType(0); @@ -5866,6 +5867,47 @@ SDValue SITargetLowering::performExtractVectorEltCombine(    return SDValue();  } +static bool convertBuildVectorCastElt(SelectionDAG &DAG, +                                      SDValue &Lo, SDValue &Hi) { +  if (Hi.getOpcode() == ISD::BITCAST && +      Hi.getOperand(0).getValueType() == MVT::f16 && +      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) { +    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo); +    Hi = Hi.getOperand(0); +    return true; +  } + +  return false; +} + +SDValue SITargetLowering::performBuildVectorCombine( +  SDNode *N, DAGCombinerInfo &DCI) const { +  SDLoc SL(N); + +  if (!isTypeLegal(MVT::v2i16)) +    return SDValue(); +  SelectionDAG &DAG = DCI.DAG; +  EVT VT = N->getValueType(0); + +  if (VT == MVT::v2i16) { +    SDValue Lo = N->getOperand(0); +    SDValue Hi = N->getOperand(1); + +    // v2i16 build_vector (const|undef), (bitcast f16:$x) +    // -> bitcast (v2f16 build_vector const|undef, $x +    if (convertBuildVectorCastElt(DAG, Lo, Hi)) { +      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  }); +      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); +    } + +    if (convertBuildVectorCastElt(DAG, Hi, Lo)) { +      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  }); +      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); +    } +  } + +  return SDValue(); +}  unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,                                            const SDNode *N0, @@ -6287,6 +6329,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,    }    case ISD::EXTRACT_VECTOR_ELT:      return performExtractVectorEltCombine(N, DCI); +  case ISD::BUILD_VECTOR: +    return performBuildVectorCombine(N, DCI);    }    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);  } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index ad38eb62c50..91380f8c588 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -111,6 +111,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {    SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; +  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;    unsigned getFusedOpcode(const SelectionDAG &DAG,                            const SDNode *N0, const SDNode *N1) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index b7aa2a9e9f2..313792f3704 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -76,8 +76,11 @@ def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16  // Clamp modifier is applied after conversion to f16.  def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; + +let ClampLo = 0, ClampHi = 1 in {  def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;  } +}  let Predicates = [HasMadMix] in { @@ -88,10 +91,56 @@ def : Pat <    (V_MAD_MIXLO_F16 $src0_modifiers, $src0,                     $src1_modifiers, $src1,                     $src2_modifiers, $src2, -                   0, +                   DSTCLAMP.NONE,                     (i32 (IMPLICIT_DEF)))  >; +// FIXME: Special case handling for maxhi (especially for clamp) +// because dealing with the write to high half of the register is +// difficult. +def : Pat < +  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), +                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), +                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), +  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, +                          $src1_modifiers, $src1, +                          $src2_modifiers, $src2, +                          DSTCLAMP.NONE, +                          $elt0)) +>; + +def : Pat < +  (build_vector +    f16:$elt0, +    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), +                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), +                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), +  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, +                          $src1_modifiers, $src1, +                          $src2_modifiers, $src2, +                          DSTCLAMP.ENABLE, +                          $elt0)) +>; + +def : Pat < +  (AMDGPUclamp (build_vector +    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), +                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), +                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))), +    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), +                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), +                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))), +  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0, +                          $hi_src1_modifiers, $hi_src1, +                          $hi_src2_modifiers, $hi_src2, +                          DSTCLAMP.ENABLE, +                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0, +                                           $lo_src1_modifiers, $lo_src1, +                                           $lo_src2_modifiers, $lo_src2, +                                           DSTCLAMP.ENABLE, +                                           (i32 (IMPLICIT_DEF))))) +>; +  } // End Predicates = [HasMadMix]  multiclass VOP3P_Real_vi<bits<10> op> { | 

