Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/ObjectYAML/ELFYAML.cpp                               |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td                              |  31
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                 |  23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp                 |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp                    |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h                      |  15
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp          |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp    |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNProcessors.td                       |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp  |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp                     |  18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                     |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                        |  40
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td                         |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td                      |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp               |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp               |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td                    |  20
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td                   | 169
19 files changed, 292 insertions, 90 deletions
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 642a89fe930..53664906847 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
BCase(EF_AMDGPU_XNACK);
break;
case ELF::EM_X86_64:
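
For reference, a minimal C++ sketch (surrounding function hypothetical) of how
a consumer classifies an object by these flags; the mask and flag names are
the ones from llvm/BinaryFormat/ELF.h used above:

  // Extract the machine ID field that the BCaseMask entries above describe.
  unsigned getAMDGPUMach(unsigned EFlags) {
    return EFlags & ELF::EF_AMDGPU_MACH; // e.g. EF_AMDGPU_MACH_AMDGCN_GFX906
  }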
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 37c593d951e..73490f6c421 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts",
"Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions"
>;
+def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
+ "HasFmaMixInsts",
+ "true",
+ "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem",
"Has unpacked d16 vmem instructions"
>;
+def FeatureDLInsts : SubtargetFeature<"dl-insts",
+ "HasDLInsts",
+ "true",
+ "Has deep learning instructions"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
FeatureXNACK
]>;
+def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureFmaMixInsts]>;
+
+def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
+ [FeatureGFX9,
+ HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts]>;
+
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
@@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
AssemblerPredicate<"FeatureMovrel">;
+def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
+ AssemblerPredicate<"FeatureFmaMixInsts">;
+
+def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
+ AssemblerPredicate<"FeatureDLInsts">;
+
+
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1b98edcf95b..47321a76e5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -215,7 +215,7 @@ private:
void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
- void SelectFMAD(SDNode *N);
+ void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
protected:
@@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBRCOND(N);
return;
case ISD::FMAD:
- SelectFMAD(N);
+ case ISD::FMA:
+ SelectFMAD_FMA(N);
return;
case AMDGPUISD::ATOMIC_CMP_SWAP:
SelectATOMIC_CMP_SWAP(N);
@@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
-void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
+void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
MVT VT = N->getSimpleValueType(0);
- if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) {
+ bool IsFMA = N->getOpcode() == ISD::FMA;
+ if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
+ !Subtarget->hasFmaMixInsts()) ||
+ ((IsFMA && Subtarget->hasMadMixInsts()) ||
+ (!IsFMA && Subtarget->hasFmaMixInsts()))) {
SelectCode(N);
return;
}
@@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
SDValue Src2 = N->getOperand(2);
unsigned Src0Mods, Src1Mods, Src2Mods;
- // Avoid using v_mad_mix_f32 unless there is actually an operand using the
- // conversion from f16.
+ // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
+ // using the conversion from f16.
bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert(!Subtarget->hasFP32Denormals() &&
+ assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) {
Zero, Zero
};
- CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops);
+ CurDAG->SelectNodeTo(N,
+ IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
+ MVT::f32, Ops);
} else {
SelectCode(N);
}
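
The rewritten bail-out condition is compact; assuming a subtarget enables at
most one of the two mix features (as the ISA-version definitions in AMDGPU.td
do), it is equivalent to this more explicit sketch:

  // Select a mix instruction only when the node's flavor matches the
  // subtarget: FMAD -> v_mad_mix_f32, FMA -> v_fma_mix_f32.
  bool CanUseMadMix = !IsFMA && Subtarget->hasMadMixInsts();
  bool CanUseFmaMix = IsFMA && Subtarget->hasFmaMixInsts();
  if (VT != MVT::f32 || (!CanUseMadMix && !CanUseFmaMix)) {
    SelectCode(N);
    return;
  }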
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0ede62ba4d4..c60e25390c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// where this is OK to use.
bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
- return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
+ return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
SrcVT.getScalarType() == MVT::f16;
}
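
In effect this hook lets the DAG combiner keep extended f16 operands attached
to the FMA; a sketch of the shape it enables on fma-mix subtargets:

  // (fma (fpext f16:$a), (fpext f16:$b), (fpext f16:$c)) : f32
  //   ==> a single v_fma_mix_f32 using f16 source-selection modifiers,
  //       instead of three v_cvt_f32_f16 conversions plus a v_fma_f32.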
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 07ed04e41d7..b3b485e548b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasIntClamp(false),
HasVOP3PInsts(false),
HasMadMixInsts(false),
+ HasFmaMixInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
@@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAMac(false),
HasSDWAOutModsVOPC(false),
HasDPP(false),
+ HasDLInsts(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 3bcb701af15..996ae9c2f0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -72,7 +72,10 @@ public:
ISAVersion8_0_3,
ISAVersion8_1_0,
ISAVersion9_0_0,
- ISAVersion9_0_2
+ ISAVersion9_0_1,
+ ISAVersion9_0_2,
+ ISAVersion9_0_4,
+ ISAVersion9_0_6
};
enum TrapHandlerAbi {
@@ -150,6 +153,7 @@ protected:
bool HasIntClamp;
bool HasVOP3PInsts;
bool HasMadMixInsts;
+ bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
@@ -162,6 +166,7 @@ protected:
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
+ bool HasDLInsts;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
@@ -329,6 +334,10 @@ public:
return HasMadMixInsts;
}
+ bool hasFmaMixInsts() const {
+ return HasFmaMixInsts;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
@@ -534,6 +543,10 @@ public:
return getGeneration() < SEA_ISLANDS;
}
+ bool hasDLInsts() const {
+ return HasDLInsts;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index d22a0c90c0d..a249c99f7a7 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4601,12 +4601,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
}
- // special case v_mac_{f16, f32}:
+ // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906):
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
- // should be 0
- if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
- Opc == AMDGPU::V_MAC_F16_e64_vi) {
+ // should be 0.
+ if (Opc == AMDGPU::V_MAC_F32_e64_si ||
+ Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi ||
+ Opc == AMDGPU::V_FMAC_F32_e64_vi) {
auto it = Inst.begin();
std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index b51c23fd688..7277a81f2e5 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -201,7 +201,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
- if (Res) break;
+ if (Res)
+ break;
+ }
+
+  // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
+  // v_mad_mixhi_f16 encodings for FMA variants. Try to decode using this
+  // special table first so we print the correct name.
+ if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (Res)
+ break;
}
}
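
The effect of the extra lookup is an ordering guarantee; a sketch of the
outcome for the shared VOP3P opcodes (see the DecoderNamespace = "GFX9_DL"
block in VOP3PInstructions.td below):

  // On a subtarget with FeatureFmaMixInsts, opcode 0x3a0 matches the GFX9_DL
  // table first and disassembles as v_fma_mix_f32; on other GFX9 subtargets
  // only the generic table matches, so it disassembles as v_mad_mix_f32.
  // The encoding is shared; the table order picks the right mnemonic.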
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index c4f585d5140..d76acfa24f9 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -148,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
[FeatureISAVersion9_0_2]
>;
+
+def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_4]
+>;
+
+def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_6]
+>;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 728e5ab8bb2..caed0a16d42 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -95,6 +95,8 @@ unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const {
// AMDGCN GFX9.
.Case("gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900)
.Case("gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902)
+ .Case("gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904)
+ .Case("gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906)
// Not specified processor.
.Default(ELF::EF_AMDGPU_MACH_NONE);
}
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 481eb741af3..e4f121368a4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -127,14 +127,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
- case AMDGPU::V_MAC_F16_e64: {
+ case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
- const MCInstrDesc &MadDesc
- = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+
+ unsigned Opc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
return false;
@@ -224,13 +228,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
- if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+ if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ unsigned NewOpc = IsFMA ?
+ AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
- MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+ MI->setDesc(TII->get(NewOpc));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
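
Both call sites above recompute the same opcode mapping; a hypothetical helper
(not part of the patch) makes the correspondence explicit:

  // mac-style two-address opcode -> equivalent three-address opcode.
  static unsigned getThreeAddrOpc(unsigned MacOpc) {
    switch (MacOpc) {
    case AMDGPU::V_MAC_F32_e64:  return AMDGPU::V_MAD_F32;
    case AMDGPU::V_MAC_F16_e64:  return AMDGPU::V_MAD_F16;
    case AMDGPU::V_FMAC_F32_e64: return AMDGPU::V_FMA_F32;
    default: llvm_unreachable("not a mac opcode");
    }
  }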
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 925fdce757a..ebc686ca83d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3272,12 +3272,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
+ case MVT::f32: {
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+ if (Subtarget->hasFP32Denormals())
+ return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+ // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+ return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+ }
case MVT::f64:
return true;
case MVT::f16:
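
The decision the rewritten f32 case encodes, written out as a sketch:

  // denormals  fastFMA   hasDLInsts (v_fmac_f32)  -> FMA faster than mul+add?
  //   yes        yes       any                    -> yes (fma needed and fast)
  //   yes        no        yes                    -> yes (v_fmac_f32 exists)
  //   yes        no        no                     -> no
  //   no         yes       yes                    -> yes (fmac as good as mac)
  //   no         anything else                    -> no  (full-rate mad wins)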
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f4ff718e42d..897ffa948e2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2161,20 +2161,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) {
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
MachineInstr &MI,
LiveVariables *LV) const {
+ unsigned Opc = MI.getOpcode();
bool IsF16 = false;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
- switch (MI.getOpcode()) {
+ switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
- case AMDGPU::V_MAC_F32_e32: {
+ case AMDGPU::V_MAC_F32_e32:
+ case AMDGPU::V_FMAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
@@ -2199,7 +2203,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
@@ -2230,8 +2234,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
}
}
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+ assert((!IsFMA || !IsF16) && "fmac only expected with f32");
+ unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
+ (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
.add(*Src0)
@@ -4048,17 +4054,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
- .add(Src1);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (ST.hasDLInsts()) {
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
+ .add(Src0)
+ .add(Src1);
+ } else {
+ unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
+ .add(Src0)
+ .add(Src1);
- unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not)
- .addReg(Xor);
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
+ .addReg(Xor);
+ }
- MRI.replaceRegWith(Dest.getReg(), Not);
- addUsersToMoveToVALUWorklist(Not, MRI, Worklist);
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitUnaryOp(
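
The resulting VALU expansions of a moved s_xnor_b32, sketched as the sequences
the code above emits:

  // with dl-insts (gfx906):  v_xnor_b32_e64 %dst, %src0, %src1
  // without:                 v_xor_b32_e64  %tmp, %src0, %src1
  //                          v_not_b32_e64  %dst, %tmp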
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8797253f183..ef7fbfba416 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1934,6 +1934,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
+def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
+
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
bit IsOrig = isOrig;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 415806bcb20..8080151d6d9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1367,6 +1367,16 @@ def : GCNPat<
>;
}
+let OtherPredicates = [HasDLInsts] in {
+def : GCNPat <
+ (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
+} // End OtherPredicates = [HasDLInsts]
+
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index a346e409cb3..61cbba4c8ae 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -100,6 +100,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F32_e64:
if (!isVGPR(Src2, TRI, MRI) ||
TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 706f0171dbd..8f687fdc60a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -221,6 +221,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
return {9, 0, 0};
if (Features.test(FeatureISAVersion9_0_2))
return {9, 0, 2};
+ if (Features.test(FeatureISAVersion9_0_4))
+ return {9, 0, 4};
+ if (Features.test(FeatureISAVersion9_0_6))
+ return {9, 0, 6};
if (Features.test(FeatureGFX9))
return {9, 0, 0};
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index b012dd3506e..5ec1a15c5cd 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -491,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2",
+ DisableEncoding="$src2",
+ isConvertibleToThreeAddress = 1,
+ isCommutable = 1 in {
+defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
+}
+
+} // End SubtargetPredicate = HasDLInsts
+
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -944,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
} // End SubtargetPredicate = isVI
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>;
+defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d2530c45459..3127532a8e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+multiclass MadFmaMixPats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst> {
+ def : GCNPat <
+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (mixlo_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+ >;
+
+  // FIXME: Special case handling for mixhi (especially for clamp)
+ // because dealing with the write to high half of the register is
+ // difficult.
+ def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (build_vector
+ f16:$elt0,
+ (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE,
+ $elt0))
+ >;
+
+ def : GCNPat <
+ (AMDGPUclamp (build_vector
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+ (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE,
+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE,
+ (i32 (IMPLICIT_DEF)))))
+ >;
+}
let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
@@ -84,64 +145,37 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
}
}
-def : GCNPat <
- (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
- (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- $elt0))
->;
-def : GCNPat <
- (build_vector
- f16:$elt0,
- (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
-def : GCNPat <
- (AMDGPUclamp (build_vector
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
- (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
- (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
- $hi_src1_modifiers, $hi_src1,
- $hi_src2_modifiers, $hi_src2,
- DSTCLAMP.ENABLE,
- (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
- $lo_src1_modifiers, $lo_src1,
- $lo_src2_modifiers, $lo_src2,
- DSTCLAMP.ENABLE,
- (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3Inst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3Inst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3Inst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3Inst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
multiclass VOP3P_Real_vi<bits<10> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
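
The scalar semantics of the 2-element dot instruction, as a C++ sketch
(function name hypothetical; this mirrors what int_amdgcn_fdot2 computes):

  // v_dot2_f32_f16: multiply the packed f16 pairs and accumulate into f32.
  float dot2_f32_f16(const _Float16 a[2], const _Float16 b[2], float c) {
    return float(a[0]) * float(b[0]) + float(a[1]) * float(b[1]) + c;
  }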