Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AMDGPU/AMDGPU.td                     |   6
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp        |  56
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td         |   1
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp           |   2
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h             |   4
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp |  35
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp            |  16
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp            | 116
 llvm/lib/Target/AMDGPU/SIISelLowering.h              |  16
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp               |  29
 llvm/lib/Target/AMDGPU/SIInstrInfo.td                |  13
 llvm/lib/Target/AMDGPU/SIInstructions.td             |  61
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td             |  12
 llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp      |   1
 llvm/lib/Target/AMDGPU/VOP1Instructions.td           |  36
 llvm/lib/Target/AMDGPU/VOP2Instructions.td           |  59
 llvm/lib/Target/AMDGPU/VOP3Instructions.td           |  22
 llvm/lib/Target/AMDGPU/VOPCInstructions.td           | 370
18 files changed, 617 insertions, 238 deletions
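The common thread in this commit: VI gains native f16 arithmetic, but the 64-bit integer conversions still have no direct f16 path, so LowerUINT_TO_FP/LowerSINT_TO_FP below legalize an f16 destination by converting at f32 and then FP_ROUNDing the result. A minimal standalone sketch of that shape (plain C++, no LLVM types; the toy f32_to_f16_bits converter is ours, handles finite normal results only, and is not the compiler's code):

```cpp
// Sketch of the i64 -> f16 legalization shape used in LowerUINT_TO_FP /
// LowerSINT_TO_FP: convert to f32 first, then round the f32 result to f16.
#include <cstdint>
#include <cstdio>
#include <cstring>

// Round-to-nearest-even f32 -> f16 for normal results (illustrative only;
// no denormal or NaN handling).
static uint16_t f32_to_f16_bits(float f) {
  uint32_t x; std::memcpy(&x, &f, sizeof x);
  uint16_t sign = (x >> 16) & 0x8000;
  int32_t exp = ((x >> 23) & 0xff) - 127 + 15;  // rebias 8-bit -> 5-bit
  uint32_t mant = x & 0x7fffff;
  if (exp >= 0x1f) return sign | 0x7c00;        // overflow -> inf
  if (exp <= 0) return sign;                    // tiny -> 0 (no denormals here)
  uint32_t half = sign | (uint32_t(exp) << 10) | (mant >> 13);
  uint32_t rem = mant & 0x1fff;                 // the 13 dropped bits
  if (rem > 0x1000 || (rem == 0x1000 && (half & 1)))
    ++half;                                     // round to nearest even
  return (uint16_t)half;
}

int main() {
  uint64_t v = 1000;
  float as_f32 = (float)v;                    // step 1: UINT_TO_FP at f32
  uint16_t as_f16 = f32_to_f16_bits(as_f32);  // step 2: FP_ROUND to f16
  std::printf("%llu -> f32 %g -> f16 bits 0x%04x\n",
              (unsigned long long)v, as_f32, as_f16); // 0x63d0 == 1000.0h
}
```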
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7a208d7c09a..fe67f59feb6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -187,6 +187,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores", // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", + "FP16Denormals", + "true", + "Enable half precision denormal handling" +>; + // Some instructions do not support denormals despite this flag. Using // fp32 denormals also causes instructions to run at the double // precision rate for the device. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index addc8062b29..56a0540a070 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -563,7 +563,8 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; + return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && + VT == MVT::f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, @@ -1927,7 +1928,20 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerSINT_TO_FP. + EVT DestVT = Op.getValueType(); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, false); @@ -1941,7 +1955,21 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, assert(Op.getOperand(0).getValueType() == MVT::i64 && "operation should be legal"); + // TODO: Factor out code common with LowerUINT_TO_FP. + EVT DestVT = Op.getValueType(); + if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); + SDValue FPRound = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + + return FPRound; + } + if (DestVT == MVT::f32) return LowerINT_TO_FP32(Op, DAG, true); @@ -2077,6 +2105,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_UINT. + + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, true); @@ -2087,6 +2128,19 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); + // TODO: Factor out code common with LowerFP_TO_SINT. 
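LowerFP_TO_SINT/LowerFP_TO_UINT take the reverse route: FP_EXTEND the f16 source to f32, then do the 64-bit conversion there. The extend step is lossless, since every f16 value is exactly representable as an f32, so nothing is lost by converting through the wider type. A standalone check of that fact (toy f16 decoder, not LLVM code):

```cpp
// The f16 -> i64 lowering extends to f32 first. f32's exponent and mantissa
// strictly contain f16's, so the extension is exact and the conversion
// through f32 yields the same integer a direct conversion would.
#include <cstdint>
#include <cstdio>
#include <cmath>

// Decode IEEE half bits to float (finite values, denormals included).
static float f16_bits_to_f32(uint16_t h) {
  int sign = (h >> 15) & 1;
  int exp  = (h >> 10) & 0x1f;
  int mant = h & 0x3ff;
  float mag;
  if (exp == 0)       mag = std::ldexp((float)mant, -24);  // denormal
  else if (exp == 31) mag = mant ? NAN : INFINITY;
  else                mag = std::ldexp(1.0f + mant / 1024.0f, exp - 15);
  return sign ? -mag : mag;
}

int main() {
  uint16_t h = 0x63d0;              // 1000.0 in half precision
  float f = f16_bits_to_f32(h);     // step 1: FP_EXTEND, exact
  int64_t i = (int64_t)f;           // step 2: FP_TO_SINT at i64
  std::printf("f16 0x%04x -> f32 %g -> i64 %lld\n", h, f, (long long)i);
}
```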
+ + EVT SrcVT = Src.getValueType(); + if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { + SDLoc DL(Op); + + SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + SDValue FpToInt32 = + DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); + + return FpToInt32; + } + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) return LowerFP64_TO_INT(Op, DAG, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index c2544c295e3..a7a995156e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,7 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "", field bits<32> Inst = 0xffffffff; } +def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index dc3c64d0174..d0dd7a94f20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -56,6 +56,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP16Denormals = false; FP32Denormals = false; FP64Denormals = false; } @@ -81,6 +82,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FastFMAF32(false), HalfRate64Ops(false), + FP16Denormals(false), FP32Denormals(false), FP64Denormals(false), FPExceptions(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 3eed8a125ea..842711b0dd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. + bool FP16Denormals; bool FP32Denormals; bool FP64Denormals; bool FPExceptions; @@ -270,6 +271,9 @@ public: /// the given LDS memory size is the only constraint. 
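The FP16Denormals plumbing above decides whether f16 denormal results survive or are flushed to zero; for half precision that covers everything below 2^-14. For a sense of the magnitudes involved (plain arithmetic on the IEEE half parameters, not backend code):

```cpp
// The f16 denormal range that FeatureFP16Denormals gates: from the smallest
// normal half (2^-14) down to the smallest denormal (2^-10 * 2^-14 = 2^-24).
#include <cmath>
#include <cstdio>

int main() {
  double min_normal_f16   = std::ldexp(1.0, -14);
  double min_denormal_f16 = std::ldexp(1.0, -24);
  std::printf("smallest normal f16:   %.10g\n", min_normal_f16);   // ~6.104e-05
  std::printf("smallest denormal f16: %.10g\n", min_denormal_f16); // ~5.960e-08
  // With denormals flushed, every nonzero result below 2^-14 collapses to 0.
}
```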
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + bool hasFP16Denormals() const { + return FP16Denormals; + } bool hasFP32Denormals() const { return FP32Denormals; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0e6c1608785..9df89448059 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1368,10 +1368,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { getForcedEncodingSize() != 64) return Match_PreferE32; - if (Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa || - Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) { // v_mac_f32/16 allow only dst_sel == DWORD; - auto OpNum = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel); + auto OpNum = + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel); const auto &Op = Inst.getOperand(OpNum); if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) { return Match_InvalidOperand; @@ -2714,14 +2715,20 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); - // special case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi) { + Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || + Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(AMDGPU::V_MAC_F32_e64, AMDGPU::OpName::src2_modifiers)); + std::advance( + it, + AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ? 
+ AMDGPU::V_MAC_F16_e64 : + AMDGPU::V_MAC_F32_e64, + AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 ++it; Inst.insert(it, Inst.getOperand(0)); // src2 = dst @@ -2896,11 +2903,13 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); - // special case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp || + Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } } @@ -3040,11 +3049,13 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } - // special case v_mac_f32: + // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) { + if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa || + Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) { auto it = Inst.begin(); - std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + std::advance( + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d61893a43bd..6bd982be206 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -156,13 +156,15 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, const SIInstrInfo *TII) { if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { - // Special case for v_mac_f32_e64 if we are trying to fold into src2 + // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_MAC_F32_e64 && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - // Check if changing this to a v_mad_f32 instruction will allow us to - // fold the operand. - MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + + // Check if changing this to a v_mad_{f16, f32} instruction will allow us + // to fold the operand. + MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); @@ -239,10 +241,10 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, // make sense. e.g. 
don't fold: // // %vreg1 = COPY %vreg0:sub1 - // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg1<tied0> + // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0> // // into - // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1<tied0> + // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0> if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6957510c054..d8ed325d098 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -78,8 +78,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - if (Subtarget->has16BitInsts()) + if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + } computeRegisterProperties(STI.getRegisterInfo()); @@ -263,20 +265,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32); - setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32); setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - AddPromotedToType(ISD::FP_TO_SINT, MVT::i16, MVT::i32); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); + + // F16 - Constant Actions. + setOperationAction(ISD::ConstantFP, MVT::f16, Custom); + + // F16 - Load/Store Actions. + setOperationAction(ISD::LOAD, MVT::f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); + setOperationAction(ISD::STORE, MVT::f16, Promote); + AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); - AddPromotedToType(ISD::FP_TO_UINT, MVT::i16, MVT::i32); + // F16 - VOP1 Actions. + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + + // F16 - VOP2 Actions. + setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + + // F16 - VOP3 Actions. 
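In the VOP3 actions that follow, FMA is always legal for f16 but FMAD only when f16 denormals are off, since v_mad/v_mac flush denormals; FMAD also rounds the product before the add, where FMA rounds once. A plain C++ contrast at f32 (std::fmaf is the C library's fused op, standing in for v_fma; the inputs are chosen so the two roundings visibly disagree):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  float a = 1.0f + std::ldexp(1.0f, -12);    // 1 + 2^-12
  float b = a;
  float c = -(1.0f + std::ldexp(1.0f, -11)); // -(1 + 2^-11)
  // mad-style: the product 1 + 2^-11 + 2^-24 rounds to 1 + 2^-11 (tie to
  // even), then the add rounds again, giving exactly 0.
  float prod = a * b;
  float mad  = prod + c;
  // fused: one rounding of the exact a*b + c keeps the 2^-24 term.
  float fused = std::fmaf(a, b, c);
  std::printf("mad = %g\nfma = %g\n", mad, fused); // 0 vs ~5.96e-08
}
```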
+ setOperationAction(ISD::FMA, MVT::f16, Legal); + if (!Subtarget->hasFP16Denormals()) + setOperationAction(ISD::FMAD, MVT::f16, Legal); } setTargetDAGCombine(ISD::FADD); @@ -641,6 +661,7 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } + SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -659,7 +680,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDValue Val; if (MemVT.isFloatingPoint()) - Val = DAG.getNode(ISD::FP_EXTEND, SL, VT, Load); + Val = getFPExtOrFPTrunc(DAG, Load, SL, VT); else if (Signed) Val = DAG.getSExtOrTrunc(Load, SL, VT); else @@ -1802,6 +1823,15 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); case ISD::TRAP: return lowerTRAP(Op, DAG); + + case ISD::ConstantFP: + return lowerConstantFP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return lowerFpToInt(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return lowerIntToFp(Op, DAG); } return SDValue(); } @@ -1995,6 +2025,66 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const { + return Op.getValueType().bitsLE(VT) ? + DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : + DAG.getNode(ISD::FTRUNC, DL, VT, Op); +} + +SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const { + if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) { + return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(), + SDLoc(Op), MVT::i32); + } + + return SDValue(); +} + +SDValue SITargetLowering::lowerFpToInt(SDValue Op, SelectionDAG &DAG) const { + EVT DstVT = Op.getValueType(); + EVT SrcVT = Op.getOperand(0).getValueType(); + if (DstVT == MVT::i64) { + return Op.getOpcode() == ISD::FP_TO_SINT ? + AMDGPUTargetLowering::LowerFP_TO_SINT(Op, DAG) : + AMDGPUTargetLowering::LowerFP_TO_UINT(Op, DAG); + } + + if (SrcVT == MVT::f16) + return Op; + + SDLoc DL(Op); + SDValue OrigSrc = Op.getOperand(0); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, DL); + SDValue FPRoundSrc = + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, OrigSrc, FPRoundFlag); + + return DAG.getNode(Op.getOpcode(), DL, DstVT, FPRoundSrc); +} + +SDValue SITargetLowering::lowerIntToFp(SDValue Op, SelectionDAG &DAG) const { + EVT DstVT = Op.getValueType(); + EVT SrcVT = Op.getOperand(0).getValueType(); + if (SrcVT == MVT::i64) { + return Op.getOpcode() == ISD::SINT_TO_FP ? + AMDGPUTargetLowering::LowerSINT_TO_FP(Op, DAG) : + AMDGPUTargetLowering::LowerUINT_TO_FP(Op, DAG); + } + + if (DstVT == MVT::f16) + return Op; + + SDLoc DL(Op); + SDValue OrigSrc = Op.getOperand(0); + SDValue SExtOrZExtOrTruncSrc = Op.getOpcode() == ISD::SINT_TO_FP ? 
+ DAG.getSExtOrTrunc(OrigSrc, DL, MVT::i32) : + DAG.getZExtOrTrunc(OrigSrc, DL, MVT::i32); + + return DAG.getNode(Op.getOpcode(), DL, DstVT, SExtOrZExtOrTruncSrc); +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, SelectionDAG &DAG) const { SDLoc SL; @@ -3562,7 +3652,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, SDValue RHS = N->getOperand(1); EVT VT = LHS.getValueType(); - if (VT != MVT::f32 && VT != MVT::f64) + if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && + VT != MVT::f16)) return SDValue(); // Match isinf pattern @@ -3706,8 +3797,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, // // Only do this if we are not trying to support denormals. v_mad_f32 does // not support denormals ever. - if (VT == MVT::f32 && - !Subtarget->hasFP32Denormals()) { + if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (LHS.getOpcode() == ISD::FADD) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 05b98c9f903..32a3267f3c1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -46,6 +46,22 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + /// \brief Converts \p Op, which must be of floating point type, to the + /// floating point type \p VT, by either extending or truncating it. + SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, + SDValue Op, + const SDLoc &DL, + EVT VT) const; + + /// \brief Custom lowering for ISD::ConstantFP. + SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Custom lowering for ISD::FP_TO_SINT, ISD::FP_TO_UINT. + SDValue lowerFpToInt(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Custom lowering for ISD::SINT_TO_FP, ISD::UINT_TO_FP. + SDValue lowerIntToFp(SDValue Op, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 02cbc882bf8..8550d12e6ad 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1386,7 +1386,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } - if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || @@ -1407,7 +1410,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); - // Multiplied part is the constant: Use v_madmk_f32 + // Multiplied part is the constant: Use v_madmk_{f16, f32}. // We should only expect these to be on src0 due to canonicalizations. 
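The fold here rewrites a v_mac/v_mad whose operand is a known literal K into v_madmk (d = s0 * K + s1, constant multiplier) or v_madak (d = s0 * s1 + K, constant addend), depending on which source the constant reaches. A toy model of just that choice (hypothetical names, none of the backend's data structures):

```cpp
#include <cstdio>

// Toy FoldImmediate decision: a mad d = s0*s1 + s2 where one source is a
// known literal K becomes madmk when K multiplies, madak when K is added.
enum class Fold { MadMk, MadAk, None };

// 'slot' is which source holds the literal: src0/src1 multiply, src2 adds.
static Fold pickFold(int slot) {
  switch (slot) {
  case 0: case 1: return Fold::MadMk; // multiplied part is the constant
  case 2:         return Fold::MadAk; // added part is the constant
  default:        return Fold::None;
  }
}

int main() {
  const char *names[] = {"v_madmk", "v_madak", "none"};
  for (int slot = 0; slot <= 2; ++slot)
    std::printf("literal in src%d -> %s\n", slot, names[(int)pickFold(slot)]);
}
```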
if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) @@ -1435,15 +1438,15 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); + UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1452,7 +1455,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } - // Added part is the constant: Use v_madak_f32 + // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. @@ -1474,17 +1477,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); + UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1593,12 +1596,17 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + bool IsF16 = false; switch (MI.getOpcode()) { default: return nullptr; + case AMDGPU::V_MAC_F16_e64: + IsF16 = true; case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F16_e32: + IsF16 = true; case AMDGPU::V_MAC_F32_e32: { const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0->isImm() && !isInlineConstant(*Src0, 4)) @@ -1612,7 +1620,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .addOperand(*Dst) .addImm(0) // Src0 mods .addOperand(*Src0) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d770bd425c4..0a86305aee6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -939,14 +939,13 @@ class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; } -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. 
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; +def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; @@ -960,10 +959,12 @@ def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F16 : VOPProfile <[i32, f16, untyped, untyped]>; def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; +def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; @@ -976,6 +977,8 @@ def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>; +def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b758a576047..0905df9cd43 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -413,6 +413,46 @@ def : Pat < } // End Predicates = [UnsafeFPMath] +def : Pat < + (f32 (fpextend f16:$src)), + (V_CVT_F32_F16_e32 $src) +>; + +def : Pat < + (f64 (fpextend f16:$src)), + (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) +>; + +def : Pat < + (f16 (fpround f32:$src)), + (V_CVT_F16_F32_e32 $src) +>; + +def : Pat < + (f16 (fpround f64:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src)) +>; + +def : Pat < + (i32 (fp_to_sint f16:$src)), + (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) +>; + +def : Pat < + (i32 (fp_to_uint f16:$src)), + (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) +>; + +def : Pat < + (f16 (sint_to_fp i32:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) +>; + +def : Pat < + (f16 (uint_to_fp i32:$src)), + (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) +>; + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// @@ -427,11 +467,20 @@ def : Pat < (V_CNDMASK_B32_e64 $src2, $src1, $src0) >; +// Pattern for V_MAC_F16 +def : Pat < + (f16 (fmad (VOP3NoMods0 f16:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f16:$src1, i32:$src1_modifiers), + (VOP3NoMods f16:$src2, i32:$src2_modifiers))), + (V_MAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + $src2_modifiers, $src2, $clamp, 
$omod) +>; + // Pattern for V_MAC_F32 def : Pat < - (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods f32:$src1, i32:$src1_modifiers), - (VOP3NoMods f32:$src2, i32:$src2_modifiers)), + (f32 (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers), + (VOP3NoMods f32:$src2, i32:$src2_modifiers))), (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, $clamp, $omod) >; @@ -506,6 +555,12 @@ foreach Index = 0-15 in { // FIXME: Why do only some of these type combinations for SReg and // VReg? +// 16-bit bitcast +def : BitConvert <i16, f16, VGPR_32>; +def : BitConvert <f16, i16, VGPR_32>; +def : BitConvert <i16, f16, SReg_32>; +def : BitConvert <f16, i16, SReg_32>; + // 32-bit bitcast def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, VGPR_32>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index a5ba0ef7e0e..d1907d16aba 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { // TODO: Do we need to set DwarfRegAlias on register tuples? // SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "SGPR%u", 0, 103))> { let AllocationPriority = 1; } @@ -190,8 +190,7 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -// i16 only on VI+ -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -252,14 +251,14 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. 
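The BitConvert patterns and the widened register classes above rest on i16 and f16 sharing the same 32-bit VGPR/SGPR, so converting between them is a free reinterpretation of the stored bits rather than an instruction. The C++ analogue, shown at 32 bits (memcpy is the portable spelling; std::bit_cast formalizes it in C++20):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// A bitcast reinterprets the register contents; no conversion happens.
static uint32_t bits_of(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof u); // bit copy, not a value conversion
  return u;
}

int main() {
  std::printf("bitcast f32 1.0 -> 0x%08x\n", bits_of(1.0f)); // 0x3f800000
  std::printf("(i32)1.0f       -> %d\n", (int32_t)1.0f);     // 1: a real convert
  // The i16 <-> f16 BitConvert patterns are the 16-bit version of the
  // first line: same storage, different type label.
}
```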
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { let AllocationPriority = 1; } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { let AllocationPriority = 1; } @@ -347,7 +346,8 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> { +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32, + (add VGPR_32, SReg_32)> { let isAllocatable = 0; } diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 7f9e9cded63..9ee2ededbb0 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -91,6 +91,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, default: return false; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b2840982462..b79b6b70cb2 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -280,24 +280,24 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; let SubtargetPredicate = isVI in { -defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16>; -defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16>; -defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>; +defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>; +defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; +defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; +defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; +defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; +defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; +defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, 
int_amdgcn_frexp_mant>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I32_F16, int_amdgcn_frexp_exp>; +defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; +defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; +defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; +defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 570ca05587b..8e86aa0796e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -133,19 +133,25 @@ multiclass VOP2eInst <string opName, } } -def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> { +class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm); field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; } -def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> { +def VOP_MADAK_F16 : VOP_MADAK <f16>; +def VOP_MADAK_F32 : VOP_MADAK <f32>; + +class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1); field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; } -def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { +def VOP_MADMK_F16 : VOP_MADMK <f16>; +def VOP_MADMK_F32 : VOP_MADMK <f32>; + +class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; @@ -159,15 +165,26 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let Asm32 = getAsm32<1, 2, f32>.ret; - let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; - let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret; + let Asm32 = getAsm32<1, 2, vt>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; let HasExt = 1; } +def VOP_MAC_F16 : VOP_MAC <f16> { + // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives + // 'not a string initializer' error. + let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret; +} + +def VOP_MAC_F32 : VOP_MAC <f32> { + // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives + // 'not a string initializer' error. + let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; +} + // Write out to vcc or arbitrary SGPR. 
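VOP_MAC's src2 is an accumulator: it is read as the addend and written as the result, which is why the pseudo carries the "$vdst = $src2" constraint and why the assembler paths earlier re-insert the dst register as src2. As a scalar read-modify-write (sketch, one VGPR lane):

```cpp
#include <cstdio>

// v_mac dst, src0, src1 computes dst = src0 * src1 + dst: the addend and
// the destination are the same register, hence the tied-operand constraint.
static void v_mac(float &dst, float src0, float src1) {
  dst = src0 * src1 + dst; // dst is both an input (src2) and the output
}

int main() {
  float acc = 1.0f;
  v_mac(acc, 2.0f, 3.0f);         // acc = 2*3 + 1
  std::printf("acc = %g\n", acc); // 7
}
```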
def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; @@ -233,7 +250,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let SubtargetPredicate = isGCN in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -260,10 +277,10 @@ defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { -defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC>; +defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK>; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. @@ -318,29 +335,33 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; let SubtargetPredicate = isVI in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK>; +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_B16 : VOP2Inst <"v_ashrrev_b16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16>; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; let isCommutable = 1 in { -defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16>; +defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; +defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; -defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_F16_F16_F16>; -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK>; +defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16>; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; + +let Constraints = "$vdst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; +} } // End isCommutable = 1 } // End SubtargetPredicate = isVI diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 73e331503ad..87a7c4044ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -215,10 +215,18 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", 
VOP3_Profile<VOP_I64_I32_I32_I64> let SubtargetPredicate = isVI in { let isCommutable = 1 in { - def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; - def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; - def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; -} + +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; +def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; +def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; +def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>; +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; + +def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; +def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; + +} // End isCommutable = 1 } // End SubtargetPredicate = isVI @@ -415,6 +423,12 @@ defm V_MAD_F16 : VOP3_Real_vi <0x1ea>; defm V_MAD_U16 : VOP3_Real_vi <0x1eb>; defm V_MAD_I16 : VOP3_Real_vi <0x1ec>; +defm V_FMA_F16 : VOP3_Real_vi <0x1ee>; +defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>; + +defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>; +defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>; +defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>; defm V_ADD_F64 : VOP3_Real_vi <0x280>; defm V_MUL_F64 : VOP3_Real_vi <0x281>; defm V_MIN_F64 : VOP3_Real_vi <0x282>; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 8014a2d67c2..2742587c9c3 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -144,11 +144,15 @@ multiclass VOPC_Pseudos <string opName, } } +def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>; def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>; def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>; def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>; +multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>; + multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>; @@ -161,6 +165,9 @@ multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opN multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>; +multiclass VOPCX_F16 <string opName, string revOp = opName> : + VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>; + multiclass VOPCX_F32 <string opName, string revOp = opName> : VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>; @@ -318,6 +325,44 @@ defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">; } // End SubtargetPredicate = isSICI +let SubtargetPredicate = isVI in { + +defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">; +defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">; +defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>; +defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">; +defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>; +defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>; +defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>; +defm V_CMP_O_F16 : VOPC_F16 
<"v_cmp_o_f16", COND_O>; +defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>; +defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">; +defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>; +defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">; +defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>; +defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>; +defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>; +defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">; + +defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">; +defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">; +defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">; +defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">; +defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">; +defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">; + +} // End SubtargetPredicate = isVI + defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">; defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">; @@ -429,9 +474,16 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { } } +def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>; def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>; +multiclass VOPC_CLASS_F16 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>; + +multiclass VOPCX_CLASS_F16 <string opName> : + VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>; + multiclass VOPC_CLASS_F32 <string opName> : VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>; @@ -448,6 +500,8 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">; +defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">; +defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">; //===----------------------------------------------------------------------===// // V_ICMPIntrinsic Pattern. 
@@ -810,147 +864,183 @@ multiclass VOPC_Real_vi <bits<10> op> { } } -defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; -defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; -defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; -defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; -defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; -defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; -defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; -defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; -defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; -defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; -defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; -defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; -defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; -defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; -defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; -defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; - -defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; -defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; -defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; -defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; -defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; -defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; -defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; -defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; -defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; -defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; -defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; -defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; -defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; -defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; -defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; -defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; - -defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; -defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; -defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; -defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; -defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; -defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; -defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; -defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; -defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; -defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; -defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; -defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; -defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; -defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; -defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; -defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; - -defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; -defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; -defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; -defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; -defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; -defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; -defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; -defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; -defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; -defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; -defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; -defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; -defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; -defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; -defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; -defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; - -defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; -defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; -defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; -defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; -defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; -defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; -defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; -defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; - -defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; -defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; -defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; -defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; -defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; -defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; -defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; -defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; - -defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; -defm V_CMP_LT_I64 : 
VOPC_Real_vi <0xe1>; -defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>; -defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>; -defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>; -defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>; -defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>; -defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>; - -defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>; -defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>; -defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>; -defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>; -defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>; -defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>; -defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>; -defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>; - -defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>; -defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>; -defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>; -defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>; -defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>; -defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>; -defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>; -defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>; - -defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>; -defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>; -defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>; -defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>; -defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>; -defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>; -defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>; -defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>; - -defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>; -defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>; -defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>; -defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>; -defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>; -defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>; -defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>; -defm V_CMP_T_U64 : VOPC_Real_vi <0xef>; - -defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>; -defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>; -defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>; -defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>; -defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>; -defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>; -defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>; -defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>; - defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>; defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>; defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>; defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>; +defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>; +defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>; + +defm V_CMP_F_F16 : VOPC_Real_vi <0x20>; +defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>; +defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>; +defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>; +defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>; +defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>; +defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>; +defm V_CMP_O_F16 : VOPC_Real_vi <0x27>; +defm V_CMP_U_F16 : VOPC_Real_vi <0x28>; +defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>; +defm V_CMP_NLG_F16 : VOPC_Real_vi <0x2a>; +defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>; +defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>; +defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>; +defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>; +defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>; + +defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>; +defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>; +defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>; +defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>; +defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>; +defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>; +defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>; +defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>; +defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>; +defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>; +defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>; +defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>; +defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>; +defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>; +defm V_CMPX_NLT_F16 : 
VOPC_Real_vi <0x3e>; +defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x3f>; + +defm V_CMP_F_F32 : VOPC_Real_vi <0x40>; +defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>; +defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>; +defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>; +defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>; +defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>; +defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>; +defm V_CMP_O_F32 : VOPC_Real_vi <0x47>; +defm V_CMP_U_F32 : VOPC_Real_vi <0x48>; +defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>; +defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>; +defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>; +defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>; +defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>; +defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>; +defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>; + +defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>; +defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>; +defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>; +defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>; +defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>; +defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>; +defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>; +defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>; +defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>; +defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>; +defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>; +defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>; +defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>; +defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>; +defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>; +defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>; + +defm V_CMP_F_F64 : VOPC_Real_vi <0x60>; +defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>; +defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>; +defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>; +defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>; +defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>; +defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>; +defm V_CMP_O_F64 : VOPC_Real_vi <0x67>; +defm V_CMP_U_F64 : VOPC_Real_vi <0x68>; +defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>; +defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>; +defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>; +defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>; +defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>; +defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>; +defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>; + +defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>; +defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>; +defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>; +defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>; +defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>; +defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>; +defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>; +defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>; +defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>; +defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>; +defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>; +defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>; +defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>; +defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>; +defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>; +defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>; + +defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>; +defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>; +defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>; +defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>; +defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>; +defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>; +defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>; +defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>; + +defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>; +defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>; +defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>; +defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>; +defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>; +defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>; +defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>; +defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>; + +defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>; +defm V_CMP_LT_I64 : 
VOPC_Real_vi <0xe1>;
+defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>;
+defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>;
+defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>;
+defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>;
+defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>;
+defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>;
+
+defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>;
+defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>;
+defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>;
+defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>;
+defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>;
+defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>;
+defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>;
+defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>;
+
+defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>;
+defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>;
+defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>;
+defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>;
+defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>;
+defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>;
+defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>;
+defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>;
+
+defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>;
+defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>;
+defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>;
+defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>;
+defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>;
+defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>;
+defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>;
+defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>;
+
+defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>;
+defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>;
+defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>;
+defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>;
+defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>;
+defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>;
+defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>;
+defm V_CMP_T_U64 : VOPC_Real_vi <0xef>;
+
+defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>;
+defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>;
+defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>;
+defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>;
+defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>;
+defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>;
+defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>;
+defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>;
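Finally, the class compares wired up above (v_cmp_class_f16 alongside the existing f32/f64 forms) test an operand's floating-point class against a bitmask instead of comparing two values. A sketch of the idea with std::fpclassify; the mask bits here are illustrative, not the hardware's encoding:

```cpp
#include <cmath>
#include <cstdio>

// Toy class test in the spirit of v_cmp_class: classify the input and
// check it against a caller-provided mask. Illustrative bit layout only.
enum : unsigned {
  ClassNaN = 1u << 0, ClassInf = 1u << 1, ClassNormal = 1u << 2,
  ClassDenormal = 1u << 3, ClassZero = 1u << 4,
};

static bool cmpClass(float x, unsigned mask) {
  unsigned cls = 0;
  switch (std::fpclassify(x)) {
  case FP_NAN:       cls = ClassNaN;      break;
  case FP_INFINITE:  cls = ClassInf;      break;
  case FP_NORMAL:    cls = ClassNormal;   break;
  case FP_SUBNORMAL: cls = ClassDenormal; break;
  case FP_ZERO:      cls = ClassZero;     break;
  }
  return (cls & mask) != 0;
}

int main() {
  std::printf("isnan-or-inf(NAN)  = %d\n", cmpClass(NAN, ClassNaN | ClassInf));
  std::printf("isnan-or-inf(1.0f) = %d\n", cmpClass(1.0f, ClassNaN | ClassInf));
}
```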