diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 33 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 21 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3Instructions.td | 16 |
7 files changed, 52 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index ba2aed68fb8..c4ac3180453 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -664,9 +664,10 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < class IntMed3Pat<Instruction med3Inst, SDPatternOperator max, SDPatternOperator max_oneuse, - SDPatternOperator min_oneuse> : Pat< - (max (min_oneuse i32:$src0, i32:$src1), - (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + SDPatternOperator min_oneuse, + ValueType vt = i32> : Pat< + (max (min_oneuse vt:$src0, vt:$src1), + (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) >; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 34ae4538428..b99c36ab225 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -276,6 +276,10 @@ public: return (getGeneration() >= EVERGREEN); } + bool hasMed3_16() const { + return getGeneration() >= GFX9; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0959707ac9a..143a538c87d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4069,8 +4069,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) { +SDValue SITargetLowering::performIntMed3ImmCombine( + SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const { ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); if (!K1) return SDValue(); @@ -4088,23 +4089,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); + unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; + if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { + return DAG.getNode(Med3Opc, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + } + // If there isn't a 16-bit med3 operation, convert to 32-bit. MVT NVT = MVT::i32; unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Tmp1, Tmp2, Tmp3; - Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - if (VT == MVT::i16) { - Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, - Tmp1, Tmp2, Tmp3); - - return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); - } else - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { @@ -4141,9 +4141,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); } - // No med3 for f16, but clamp is possible. - // TODO: gfx9 has med3 f16 - if (VT == MVT::f16 || VT == MVT::f64) + // med3 for f16 is only available on gfx9+. + if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16())) return SDValue(); // This isn't safe with signaling NaNs because in IEEE mode, min/max on a diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5bf39064033..984640dfdb0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -86,6 +86,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1) const; + SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 1fc3fa81f30..f47b11f9f46 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1321,7 +1321,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; +def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 5ec3cc2102a..e74dbacbfb5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1216,6 +1216,14 @@ def : Pat < // Miscellaneous Optimization Patterns //============================================================================// +// Undo sub x, c -> add x, -c canonicalization since c is more likely +// an inline immediate than -c. +// TODO: Also do for 64-bit. +def : Pat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (S_SUB_I32 $src0, NegSubInlineConst32:$src1) +>; + def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; @@ -1235,14 +1243,11 @@ class FPMed3Pat<ValueType vt, def : FPMed3Pat<f32, V_MED3_F32>; - -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// TODO: Also do for 64-bit. -def : Pat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), - (S_SUB_I32 $src0, NegSubInlineConst32:$src1) ->; +let Predicates = [isGFX9] in { +def : FPMed3Pat<f16, V_MED3_F16>; +def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; +def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; +} // End Predicates = [isGFX9] //============================================================================// // Assembler aliases diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 3ba748839ed..42ccd6d5e19 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -258,8 +258,8 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; let Predicates = [isVI] in { -multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, - Instruction inst, SDPatternOperator op3> { +multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, + Instruction inst, SDPatternOperator op3> { def : Pat< (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst i16:$src0, i16:$src1, i16:$src2) @@ -278,8 +278,8 @@ def : Pat< >; } -defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>; -defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; +defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>; } // End Predicates = [isVI] @@ -291,6 +291,10 @@ def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; + +def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>; +def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>; +def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>; } @@ -487,3 +491,7 @@ defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>; defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; defm V_OR3_B32 : VOP3_Real_vi <0x202>; defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; + +defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; +defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; +defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; |