diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 46 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 51 |
4 files changed, 97 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1a30a163e6d..9fc38aeefaa 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -775,6 +775,7 @@ static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { return true; } +// FIXME: Clamp for v_mad_mixhi_f16 handled during isel. bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { const MachineOperand *ClampSrc = isClamp(MI); if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 99f7badde71..6a751d71db2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -502,6 +502,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BUILD_VECTOR); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. 
@@ -5853,7 +5854,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG= DCI.DAG; + SelectionDAG &DAG = DCI.DAG; if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); @@ -5866,6 +5867,47 @@ SDValue SITargetLowering::performExtractVectorEltCombine( return SDValue(); } +static bool convertBuildVectorCastElt(SelectionDAG &DAG, + SDValue &Lo, SDValue &Hi) { + if (Hi.getOpcode() == ISD::BITCAST && + Hi.getOperand(0).getValueType() == MVT::f16 && + (isa<ConstantSDNode>(Lo) || Lo.isUndef())) { + Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo); + Hi = Hi.getOperand(0); + return true; + } + + return false; +} + +SDValue SITargetLowering::performBuildVectorCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDLoc SL(N); + + if (!isTypeLegal(MVT::v2i16)) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT == MVT::v2i16) { + SDValue Lo = N->getOperand(0); + SDValue Hi = N->getOperand(1); + + // v2i16 build_vector (const|undef), (bitcast f16:$x) + // -> bitcast (v2f16 build_vector const|undef, $x) + if (convertBuildVectorCastElt(DAG, Lo, Hi)) { + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); + } + // Mirrored case, (bitcast f16:$x), (const|undef): the helper rewrites Lo/Hi in place, so keep element order. + if (convertBuildVectorCastElt(DAG, Hi, Lo)) { + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); + } + } + + return SDValue(); +} unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, @@ -6287,6 +6329,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI); + case ISD::BUILD_VECTOR: + return performBuildVectorCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git 
a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index ad38eb62c50..91380f8c588 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -111,6 +111,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index b7aa2a9e9f2..313792f3704 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -76,8 +76,11 @@ def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16 // Clamp modifier is applied after conversion to f16. def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; + +let ClampLo = 0, ClampHi = 1 in { def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; } +} let Predicates = [HasMadMix] in { @@ -88,10 +91,56 @@ def : Pat < (V_MAD_MIXLO_F16 $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, - 0, + DSTCLAMP.NONE, (i32 (IMPLICIT_DEF))) >; +// FIXME: Special case handling for mixhi (especially for clamp) +// because dealing with the write to high half of the register is +// difficult. 
+def : Pat < + (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE, + $elt0)) +>; + +def : Pat < + (build_vector + f16:$elt0, + (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), + (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.ENABLE, + $elt0)) +>; + +def : Pat < + (AMDGPUclamp (build_vector + (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))), + (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))), + (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0, + $hi_src1_modifiers, $hi_src1, + $hi_src2_modifiers, $hi_src2, + DSTCLAMP.ENABLE, + (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0, + $lo_src1_modifiers, $lo_src1, + $lo_src2_modifiers, $lo_src2, + DSTCLAMP.ENABLE, + (i32 (IMPLICIT_DEF))))) +>; + } // End Predicates = [HasMadMix] multiclass VOP3P_Real_vi<bits<10> op> { |

