Diffstat (limited to 'llvm/lib')
19 files changed, 292 insertions, 90 deletions
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 642a89fe930..53664906847 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
     BCase(EF_AMDGPU_XNACK);
     break;
   case ELF::EM_X86_64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 37c593d951e..73490f6c421 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
   "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
 >;
 
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+  "HasFmaMixInsts",
+  "true",
+  "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
 // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
 // XNACK. The current default kernel driver setting is:
 // - graphics ring: XNACK disabled
@@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
   "Has unpacked d16 vmem instructions"
 >;
 
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+  "HasDLInsts",
+  "true",
+  "Has deep learning instructions"
+>;
+
 //===------------------------------------------------------------===//
 // Subtarget Features (options and debugging)
 //===------------------------------------------------------------===//
@@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
    FeatureXNACK
   ]>;
 
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+  [FeatureGFX9,
+   FeatureLDSBankCount32,
+   FeatureFmaMixInsts]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+  [FeatureGFX9,
+   HalfRate64Ops,
+   FeatureFmaMixInsts,
+   FeatureLDSBankCount32,
+   FeatureDLInsts]>;
+
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
 //===----------------------------------------------------------------------===//
@@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
 def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
   AssemblerPredicate<"FeatureMovrel">;
 
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+  AssemblerPredicate<"FeatureFmaMixInsts">;
+
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+  AssemblerPredicate<"FeatureDLInsts">;
+
+
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1b98edcf95b..47321a76e5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -215,7 +215,7 @@ private:
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
   void SelectBRCOND(SDNode *N);
-  void SelectFMAD(SDNode *N);
+  void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
 
 protected:
@@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectBRCOND(N);
     return;
   case ISD::FMAD:
-    SelectFMAD(N);
+  case ISD::FMA:
+    SelectFMAD_FMA(N);
     return;
   case AMDGPUISD::ATOMIC_CMP_SWAP:
     SelectATOMIC_CMP_SWAP(N);
@@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
                        VCC.getValue(0));
 }
 
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
   MVT VT = N->getSimpleValueType(0);
-  if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+  bool IsFMA = N->getOpcode() == ISD::FMA;
+  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+                         !Subtarget->hasFmaMixInsts()) ||
+      ((IsFMA && Subtarget->hasMadMixInsts()) ||
+       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
     SelectCode(N);
     return;
   }
@@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
   SDValue Src2 = N->getOperand(2);
   unsigned Src0Mods, Src1Mods, Src2Mods;
 
-  // Avoid using v_mad_mix_f32 unless there is actually an operand using the
-  // conversion from f16.
+  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+  // using the conversion from f16.
   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
 
-  assert(!Subtarget->hasFP32Denormals() &&
+  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
          "fmad selected with denormals enabled");
   // TODO: We can select this with f32 denormals enabled if all the sources are
   // converted from f16 (in which case fmad isn't legal).
@@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
       Zero, Zero };
 
-    CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+    CurDAG->SelectNodeTo(N,
+                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+                         MVT::f32, Ops);
   } else {
     SelectCode(N);
   }
 }
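Editor's note: the new early-out in SelectFMAD_FMA packs three checks into one condition. As a reading aid, here is a minimal standalone C++ sketch of the same decision, assuming (as the feature definitions in this patch imply) that a subtarget enables at most one of the mad-mix/fma-mix features. The MixSupport and shouldFallBack names are illustrative, not part of the patch.

#include <cassert>

enum class MixSupport { None, MadMix, FmaMix };

// Returns true when SelectFMAD_FMA should fall back to generic selection
// (the f32 type check is omitted; it is handled the same way either path).
bool shouldFallBack(bool IsFMA, MixSupport S) {
  if (S == MixSupport::None)
    return true;                   // neither v_mad_mix nor v_fma_mix exists
  if (IsFMA && S == MixSupport::MadMix)
    return true;                   // FMA node, but only mad_mix available
  if (!IsFMA && S == MixSupport::FmaMix)
    return true;                   // FMAD node, but only fma_mix available
  return false;                    // node opcode matches the available flavor
}

int main() {
  assert(!shouldFallBack(/*IsFMA=*/true,  MixSupport::FmaMix));  // gfx904/906
  assert(!shouldFallBack(/*IsFMA=*/false, MixSupport::MadMix));  // gfx900
  assert(shouldFallBack(/*IsFMA=*/true,  MixSupport::MadMix));
  assert(shouldFallBack(/*IsFMA=*/false, MixSupport::FmaMix));
  assert(shouldFallBack(/*IsFMA=*/true,  MixSupport::None));
}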
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0ede62ba4d4..c60e25390c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
 // where this is OK to use.
 bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
                                            EVT DestVT, EVT SrcVT) const {
-  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
+  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
          SrcVT.getScalarType() == MVT::f16;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 07ed04e41d7..b3b485e548b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasIntClamp(false),
     HasVOP3PInsts(false),
     HasMadMixInsts(false),
+    HasFmaMixInsts(false),
     HasMovrel(false),
     HasVGPRIndexMode(false),
     HasScalarStores(false),
@@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasSDWAMac(false),
     HasSDWAOutModsVOPC(false),
     HasDPP(false),
+    HasDLInsts(false),
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 3bcb701af15..996ae9c2f0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -72,7 +72,10 @@ public:
     ISAVersion8_0_3,
     ISAVersion8_1_0,
     ISAVersion9_0_0,
-    ISAVersion9_0_2
+    ISAVersion9_0_1,
+    ISAVersion9_0_2,
+    ISAVersion9_0_4,
+    ISAVersion9_0_6
   };
 
   enum TrapHandlerAbi {
@@ -150,6 +153,7 @@ protected:
   bool HasIntClamp;
   bool HasVOP3PInsts;
   bool HasMadMixInsts;
+  bool HasFmaMixInsts;
   bool HasMovrel;
   bool HasVGPRIndexMode;
   bool HasScalarStores;
@@ -162,6 +166,7 @@ protected:
   bool HasSDWAMac;
   bool HasSDWAOutModsVOPC;
   bool HasDPP;
+  bool HasDLInsts;
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
@@ -329,6 +334,10 @@ public:
     return HasMadMixInsts;
   }
 
+  bool hasFmaMixInsts() const {
+    return HasFmaMixInsts;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
@@ -534,6 +543,10 @@ public:
     return getGeneration() < SEA_ISLANDS;
   }
 
+  bool hasDLInsts() const {
+    return HasDLInsts;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   /// of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
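Editor's note: the isFPExtFoldable change above lets (fma (fpext a), (fpext b), (fpext c)) fold the extensions away, because the mix instructions convert their f16 sources to f32 for free and then do one fused multiply-add in f32. A scalar model of that semantics; halfToFloat is a hypothetical helper covering only zero and normal halfs, not the instruction's full conversion behavior.

#include <cassert>
#include <cmath>
#include <cstdint>

// Decode an IEEE-754 half to float. Zero and normal values only; denormals,
// infinities and NaNs are outside the scope of this illustration.
float halfToFloat(uint16_t h) {
  uint32_t sign = (h >> 15) & 1;
  uint32_t exp  = (h >> 10) & 0x1f;
  uint32_t man  =  h        & 0x3ff;
  if (exp == 0 && man == 0)
    return sign ? -0.0f : 0.0f;
  assert(exp != 0 && exp != 31 && "illustration covers normal halfs only");
  float m = 1.0f + man / 1024.0f;
  return (sign ? -1.0f : 1.0f) * std::ldexp(m, (int)exp - 15);
}

int main() {
  // Mix-instruction shape: f16 sources widened to f32, then a single fused
  // multiply-add produces the f32 result -- the fpext is free.
  uint16_t a = 0x3c00; // 1.0 as half
  uint16_t b = 0x4000; // 2.0 as half
  uint16_t c = 0x3800; // 0.5 as half
  float r = std::fma(halfToFloat(a), halfToFloat(b), halfToFloat(c));
  assert(r == 2.5f);
}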
+  if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+      Opc == AMDGPU::V_MAC_F32_e64_vi ||
+      Opc == AMDGPU::V_MAC_F16_e64_vi ||
+      Opc == AMDGPU::V_FMAC_F32_e64_vi) {
     auto it = Inst.begin();
     std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
     it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index b51c23fd688..7277a81f2e5 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -201,7 +201,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
     if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
       Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
-      if (Res) break;
+      if (Res)
+        break;
+    }
+
+    // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+    // v_mad_mixhi_f16 encodings for FMA variants. Try to decode using this
+    // special table first so we print the correct name.
+    if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+      Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+      if (Res)
+        break;
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index c4f585d5140..d76acfa24f9 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -148,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
 def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
   [FeatureISAVersion9_0_2]
 >;
+
+def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
+  [FeatureISAVersion9_0_4]
+>;
+
+def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
+  [FeatureISAVersion9_0_6]
+>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 728e5ab8bb2..caed0a16d42 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -95,6 +95,8 @@ unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
     // AMDGCN GFX9.
     .Case("gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900)
     .Case("gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902)
+    .Case("gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904)
+    .Case("gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906)
     // Not specified processor.
     .Default(ELF::EF_AMDGPU_MACH_NONE);
 }
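Editor's note: the disassembler change above works because v_fma_mix_* reuse the v_mad_mix_* encodings; only the order in which decoder tables are consulted decides which mnemonic gets printed. A toy model of that first-table-wins lookup (table contents and names are illustrative, not LLVM's generated tables):

#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <vector>

using DecoderTable = std::map<uint16_t, std::string>; // opcode -> mnemonic

// The first table that knows the opcode wins, so feature-specific tables
// must be tried before the generic one.
std::optional<std::string>
decode(uint16_t op, const std::vector<const DecoderTable *> &tables) {
  for (const DecoderTable *t : tables)
    if (auto it = t->find(op); it != t->end())
      return it->second;
  return std::nullopt;
}

int main() {
  DecoderTable gfx9    = {{0x3a0, "v_mad_mix_f32"}};
  DecoderTable gfx9_dl = {{0x3a0, "v_fma_mix_f32"}};

  // Without FeatureFmaMixInsts: only the generic table is consulted.
  auto plain = decode(0x3a0, {&gfx9});
  // With FeatureFmaMixInsts: GFX9_DL first -- same encoding, new name.
  auto dl = decode(0x3a0, {&gfx9_dl, &gfx9});
  return (*plain == "v_mad_mix_f32" && *dl == "v_fma_mix_f32") ? 0 : 1;
}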
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 481eb741af3..e4f121368a4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -127,14 +127,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   unsigned Opc = UseMI.getOpcode();
   switch (Opc) {
   case AMDGPU::V_MAC_F32_e64:
-  case AMDGPU::V_MAC_F16_e64: {
+  case AMDGPU::V_MAC_F16_e64:
+  case AMDGPU::V_FMAC_F32_e64: {
     // Special case for mac. Since this is replaced with mad when folded into
     // src2, we need to check the legality for the final instruction.
     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     if (static_cast<int>(OpNo) == Src2Idx) {
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
-      const MCInstrDesc &MadDesc
-        = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+
+      unsigned Opc = IsFMA ?
+        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+      const MCInstrDesc &MadDesc = TII->get(Opc);
       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     }
     return false;
@@ -224,13 +228,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
   // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
   unsigned Opc = MI->getOpcode();
-  if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+  if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+       Opc == AMDGPU::V_FMAC_F32_e64) &&
       (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+    bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
     bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+    unsigned NewOpc = IsFMA ?
+      AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
 
     // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     // to fold the operand.
-    MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+    MI->setDesc(TII->get(NewOpc));
     bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
     if (FoldAsMAD) {
       MI->untieRegOperand(OpNo);
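Editor's note: the IsFMA/IsF32 ternary now appears in both isInlineConstantIfFolded and tryAddToFoldList. Spelled out as a hypothetical helper (the enum stands in for the real AMDGPU opcode constants), the mapping it encodes is:

#include <cassert>

enum Opcode { V_MAC_F32_e64, V_MAC_F16_e64, V_FMAC_F32_e64,
              V_MAD_F32, V_MAD_F16, V_FMA_F32 };

// A v_mac/v_fmac can only fold an operand into src2 if the operand is legal
// for the three-address instruction it will be rewritten into.
Opcode threeAddressForm(Opcode MacOpc) {
  switch (MacOpc) {
  case V_MAC_F32_e64:  return V_MAD_F32;
  case V_MAC_F16_e64:  return V_MAD_F16;
  case V_FMAC_F32_e64: return V_FMA_F32; // new on gfx906: fused, not mad
  default:             return MacOpc;    // not a mac-like opcode
  }
}

int main() {
  assert(threeAddressForm(V_FMAC_F32_e64) == V_FMA_F32);
  assert(threeAddressForm(V_MAC_F16_e64) == V_MAD_F16);
}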
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 925fdce757a..ebc686ca83d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3272,12 +3272,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   VT = VT.getScalarType();
 
   switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::f32:
+  case MVT::f32: {
     // This is as fast on some subtargets. However, we always have full rate f32
     // mad available which returns the same result as the separate operations
     // which we should prefer over fma. We can't use this if we want to support
     // denormals, so only report this in these cases.
-    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+    if (Subtarget->hasFP32Denormals())
+      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+  }
   case MVT::f64:
     return true;
   case MVT::f16:
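Editor's note: the reworked f32 case is easiest to read as a function of three subtarget flags. A minimal sketch of the same decision, with assumed parameter names:

#include <cassert>

bool fmaFasterThanMulAddF32(bool HasFP32Denormals, bool HasFastFMAF32,
                            bool HasDLInsts) {
  if (HasFP32Denormals)
    return HasFastFMAF32 || HasDLInsts; // mad is unusable; fma/fmac win
  // No denormals: full-rate v_mac_f32 gives the same result, so only report
  // fma as faster when both fast FMA and v_fmac_f32 are available.
  return HasFastFMAF32 && HasDLInsts;
}

int main() {
  // A DL-instruction subtarget with fast FMA reports fma as fast either way.
  assert(fmaFasterThanMulAddF32(true, true, true));
  assert(fmaFasterThanMulAddF32(false, true, true));
  // Without v_fmac_f32 and without denormals, mad/mac is still preferred.
  assert(!fmaFasterThanMulAddF32(false, true, false));
}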
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f4ff718e42d..897ffa948e2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2161,20 +2161,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                  MachineInstr &MI,
                                                  LiveVariables *LV) const {
+  unsigned Opc = MI.getOpcode();
   bool IsF16 = false;
+  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
 
-  switch (MI.getOpcode()) {
+  switch (Opc) {
   default:
     return nullptr;
   case AMDGPU::V_MAC_F16_e64:
     IsF16 = true;
     LLVM_FALLTHROUGH;
   case AMDGPU::V_MAC_F32_e64:
+  case AMDGPU::V_FMAC_F32_e64:
     break;
   case AMDGPU::V_MAC_F16_e32:
     IsF16 = true;
     LLVM_FALLTHROUGH;
-  case AMDGPU::V_MAC_F32_e32: {
+  case AMDGPU::V_MAC_F32_e32:
+  case AMDGPU::V_FMAC_F32_e32: {
     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                              AMDGPU::OpName::src0);
     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -2199,7 +2203,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
 
-  if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
       // If we have an SGPR input, we will violate the constant bus restriction.
       (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
     if (auto Imm = getFoldableImm(Src2)) {
@@ -2230,8 +2234,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
     }
   }
 
-  return BuildMI(*MBB, MI, MI.getDebugLoc(),
-                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
+  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
+    (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
       .add(*Dst)
       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
       .add(*Src0)
@@ -4048,17 +4054,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
   legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
 
-  unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
-    .add(Src0)
-    .add(Src1);
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  if (ST.hasDLInsts()) {
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
+      .add(Src0)
+      .add(Src1);
+  } else {
+    unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+      .add(Src0)
+      .add(Src1);
 
-  unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
-    .addReg(Xor);
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
+      .addReg(Xor);
+  }
 
-  MRI.replaceRegWith(Dest.getReg(), Not);
-  addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
 }
 
 void SIInstrInfo::splitScalar64BitUnaryOp(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8797253f183..ef7fbfba416 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1934,6 +1934,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
 def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
 def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
 
+def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
+def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+
 class Commutable_REV <string revOp, bit isOrig> {
   string RevOp = revOp;
   bit IsOrig = isOrig;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 415806bcb20..8080151d6d9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1367,6 +1367,16 @@ def : GCNPat<
 >;
 }
 
+let OtherPredicates = [HasDLInsts] in {
+def : GCNPat <
+  (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+       (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+       (f32 (VOP3NoMods f32:$src2))),
+  (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+                  SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+} // End OtherPredicates = [HasDLInsts]
+
 // Allow integer inputs
 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index a346e409cb3..61cbba4c8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -100,6 +100,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
 
     case AMDGPU::V_MAC_F32_e64:
     case AMDGPU::V_MAC_F16_e64:
+    case AMDGPU::V_FMAC_F32_e64:
       if (!isVGPR(Src2, TRI, MRI) ||
           TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
         return false;
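Editor's note: lowerScalarXnor (in the SIInstrInfo.cpp hunk above) now emits a single v_xnor_b32 when the subtarget has it, instead of v_xor_b32 followed by v_not_b32. Both lowerings compute the same bitwise identity, xnor(a, b) == ~(a ^ b), which this scalar check confirms:

#include <cassert>
#include <cstdint>

uint32_t xnorSingle(uint32_t a, uint32_t b) { // one v_xnor_b32
  return ~(a ^ b);
}

uint32_t xnorExpanded(uint32_t a, uint32_t b) {
  uint32_t x = a ^ b; // v_xor_b32
  return ~x;          // v_not_b32
}

int main() {
  for (uint32_t a : {0u, 0xffffffffu, 0x12345678u})
    for (uint32_t b : {0u, 0xa5a5a5a5u})
      assert(xnorSingle(a, b) == xnorExpanded(a, b));
}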
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 706f0171dbd..8f687fdc60a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -221,6 +221,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
     return {9, 0, 0};
   if (Features.test(FeatureISAVersion9_0_2))
     return {9, 0, 2};
+  if (Features.test(FeatureISAVersion9_0_4))
+    return {9, 0, 4};
+  if (Features.test(FeatureISAVersion9_0_6))
+    return {9, 0, 6};
 
   if (Features.test(FeatureGFX9))
     return {9, 0, 0};
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index b012dd3506e..5ec1a15c5cd 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -491,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
 } // End SubtargetPredicate = Has16BitInsts
 
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2",
+    DisableEncoding="$src2",
+    isConvertibleToThreeAddress = 1,
+    isCommutable = 1 in {
+defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+}
+
+} // End SubtargetPredicate = HasDLInsts
+
 // Note: 16-bit instructions produce a 0 result in the high 16-bits.
 multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -944,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
 def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
 
 } // End SubtargetPredicate = isVI
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
+defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
+
+} // End SubtargetPredicate = HasDLInsts
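Editor's note: reference semantics for two of the new dot-product instructions defined in the VOP3PInstructions.td hunk below, as a scalar sketch. Plain float stands in for the packed f16 lanes, and the hardware's exact rounding and any clamping behavior are not modeled here.

#include <cassert>
#include <cstdint>

// v_dot2_f32_f16: two-element dot product plus an f32 accumulator.
float dot2(const float a[2], const float b[2], float c) {
  return a[0] * b[0] + a[1] * b[1] + c;
}

// v_dot4_i32_i8: four signed i8 lanes packed into an i32, plus accumulator.
int32_t dot4_i8(uint32_t a, uint32_t b, int32_t c) {
  int32_t acc = c;
  for (int i = 0; i < 4; ++i) {
    int8_t ai = int8_t(a >> (8 * i));
    int8_t bi = int8_t(b >> (8 * i));
    acc += int32_t(ai) * int32_t(bi);
  }
  return acc;
}

int main() {
  float a[2] = {1.0f, 2.0f}, b[2] = {3.0f, 4.0f};
  assert(dot2(a, b, 0.5f) == 11.5f);               // 1*3 + 2*4 + 0.5
  assert(dot4_i8(0x01010101u, 0x02020202u, 0) == 8); // four lanes of 1*2
}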
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d2530c45459..3127532a8e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
 
+multiclass MadFmaMixPats<SDPatternOperator fma_like,
+                         Instruction mix_inst,
+                         Instruction mixlo_inst,
+                         Instruction mixhi_inst> {
+  def : GCNPat <
+    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (mixlo_inst $src0_modifiers, $src0,
+                $src1_modifiers, $src1,
+                $src2_modifiers, $src2,
+                DSTCLAMP.NONE,
+                (i32 (IMPLICIT_DEF)))
+  >;
+
+  // FIXME: Special case handling for mixhi (especially for clamp)
+  // because dealing with the write to high half of the register is
+  // difficult.
+  def : GCNPat <
+    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.NONE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (build_vector
+      f16:$elt0,
+      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.ENABLE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (AMDGPUclamp (build_vector
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+                       $hi_src1_modifiers, $hi_src1,
+                       $hi_src2_modifiers, $hi_src2,
+                       DSTCLAMP.ENABLE,
+                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
+                                   $lo_src1_modifiers, $lo_src1,
+                                   $lo_src2_modifiers, $lo_src2,
+                                   DSTCLAMP.ENABLE,
+                                   (i32 (IMPLICIT_DEF)))))
+  >;
+}
+
 let SubtargetPredicate = HasMadMixInsts in {
 // These are VOP3a-like opcodes which accept no omod.
@@ -84,64 +145,37 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
 }
 }
 
-def : GCNPat <
-  (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
-                   $src1_modifiers, $src1,
-                   $src2_modifiers, $src2,
-                   DSTCLAMP.NONE,
-                   (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
 
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
-  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.NONE,
-                          $elt0))
->;
-
-def : GCNPat <
-  (build_vector
-    f16:$elt0,
-    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.ENABLE,
-                          $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
-def : GCNPat <
-  (AMDGPUclamp (build_vector
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
-                          $hi_src1_modifiers, $hi_src1,
-                          $hi_src2_modifiers, $hi_src2,
-                          DSTCLAMP.ENABLE,
-                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
-                                           $lo_src1_modifiers, $lo_src1,
-                                           $lo_src2_modifiers, $lo_src2,
-                                           DSTCLAMP.ENABLE,
-                                           (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
 
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3Inst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3Inst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3Inst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3Inst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
   def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
 defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
 defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
 
+
+let SubtargetPredicate = HasMadMixInsts in {
 defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
 defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
 defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
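Editor's note: V_FMAC_F32 is declared above with Constraints = "$vdst = $src2", i.e. it is a two-address instruction whose destination doubles as the accumulator. A scalar model of that semantics (std::fma gives the fused, single-rounding behavior), and why the folding code earlier in this commit must first switch to the three-address v_fma_f32 and untie src2 before folding anything into it:

#include <cassert>
#include <cmath>

// v_fmac_f32 dst, a, b  ==>  dst = fma(a, b, dst)
void fmac(float &dst, float a, float b) {
  dst = std::fma(a, b, dst);
}

int main() {
  float acc = 0.5f;
  fmac(acc, 2.0f, 3.0f); // acc = 2*3 + 0.5
  assert(acc == 6.5f);
  // Because dst and the addend are the same (tied) operand, an immediate
  // cannot be folded into src2 directly; the two-address form must first be
  // rewritten to v_fma_f32, which has an independent src2.
}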