diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-04-24 17:03:15 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-04-24 17:03:15 +0000 |
commit | cee607e4144e6391472e04235479969d9b0408a2 (patch) | |
tree | c8d40338463d9de3eeee008a50bb8e408cac584b /llvm/lib | |
parent | c60a4099a15ad83b19e5389284e1186a7dddd591 (diff) | |
download | bcm5719-llvm-cee607e4144e6391472e04235479969d9b0408a2.tar.gz bcm5719-llvm-cee607e4144e6391472e04235479969d9b0408a2.zip |
[AMDGPU] Add gfx1010 target definitions
Differential Revision: https://reviews.llvm.org/D61041
llvm-svn: 359113
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/ObjectYAML/ELFYAML.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Support/TargetParser.cpp | 38 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 192 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 36 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 100 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/GCNProcessors.td | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 79 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIDefines.h | 18 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 24 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SISchedule.td | 33 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 98 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 3 |
16 files changed, 537 insertions, 112 deletions
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 758be51469d..93d309cdc05 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -411,6 +411,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCase(EF_AMDGPU_XNACK); BCase(EF_AMDGPU_SRAM_ECC); break; diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index 59e096b4b2c..09eb845ddff 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -62,7 +62,7 @@ constexpr GPUInfo R600GPUs[26] = { // This table should be sorted by the value of GPUKind // Don't bother listing the implicitly true features -constexpr GPUInfo AMDGCNGPUs[33] = { +constexpr GPUInfo AMDGCNGPUs[34] = { // Name Canonical Kind Features // Name {{"gfx600"}, {"gfx600"}, GK_GFX600, FEATURE_FAST_FMA_F32}, @@ -98,6 +98,7 @@ constexpr GPUInfo AMDGCNGPUs[33] = { {{"gfx904"}, {"gfx904"}, GK_GFX904, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, {{"gfx906"}, {"gfx906"}, GK_GFX906, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, + {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, }; const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) { @@ -179,22 +180,23 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { } switch (AK) { - case GK_GFX600: return {6, 0, 0}; - case GK_GFX601: return {6, 0, 1}; - case GK_GFX700: return {7, 0, 0}; - case GK_GFX701: return {7, 0, 1}; - case GK_GFX702: return {7, 0, 2}; - case GK_GFX703: return {7, 0, 3}; - case GK_GFX704: return {7, 0, 4}; - case GK_GFX801: return {8, 0, 1}; - case GK_GFX802: return {8, 0, 2}; - case GK_GFX803: return {8, 0, 3}; - case GK_GFX810: return {8, 1, 0}; - case GK_GFX900: return {9, 0, 0}; - case GK_GFX902: return {9, 0, 2}; - case GK_GFX904: return {9, 0, 4}; - case GK_GFX906: return {9, 0, 6}; - case GK_GFX909: return {9, 0, 9}; - default: return {0, 0, 0}; + case GK_GFX600: return {6, 0, 0}; + case GK_GFX601: return {6, 0, 1}; + case GK_GFX700: return {7, 0, 0}; + case GK_GFX701: return {7, 0, 1}; + case GK_GFX702: return {7, 0, 2}; + case GK_GFX703: return {7, 0, 3}; + case GK_GFX704: return {7, 0, 4}; + case GK_GFX801: return {8, 0, 1}; + case GK_GFX802: return {8, 0, 2}; + case GK_GFX803: return {8, 0, 3}; + case GK_GFX810: return {8, 1, 0}; + case GK_GFX900: return {9, 0, 0}; + case GK_GFX902: return {9, 0, 2}; + case GK_GFX904: return {9, 0, 4}; + case GK_GFX906: return {9, 0, 6}; + case GK_GFX909: return {9, 0, 9}; + case GK_GFX1010: return {10, 1, 0}; + default: return {0, 0, 0}; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 4e3893d38b2..663569e67ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -60,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions" >; +def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", + "ScalarFlatScratchInsts", + "true", + "Have s_scratch_* flat memory instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -115,12 +121,72 @@ def FeatureXNACK : SubtargetFeature<"xnack", "Enable XNACK support" >; +def FeatureCuMode : SubtargetFeature<"cumode", + "EnableCuMode", + "true", + "Enable CU wavefront execution mode" +>; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", + "LDSMisalignedBug", + "true", + "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" +>; + +def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", + "HasVcmpxPermlaneHazard", + "true", + "TODO: describe me" +>; + +def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", + "HasVMEMtoScalarWriteHazard", + "true", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +>; + +def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", + "HasSMEMtoVectorWriteHazard", + "true", + "s_load_dword followed by v_cmp page faults" +>; + +def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", + "HasInstFwdPrefetchBug", + "true", + "S_INST_PREFETCH instruction causes shader to hang" +>; + +def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", + "HasVcmpxExecWARHazard", + "true", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +>; + +def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", + "HasLdsBranchVmemWARHazard", + "true", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +>; + +def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", + "HasNSAtoVMEMBug", + "true", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +>; + +def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", + "HasFlatSegmentOffsetBug", + "true", + "GFX10 bug, inst_offset ignored in flat segment" +>; + class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -155,6 +221,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "Additional instructions for GFX9+" >; +def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", + "GFX10Insts", + "true", + "Additional instructions for GFX10+" +>; + def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", "GFX7GFX8GFX9Insts", "true", @@ -257,6 +329,12 @@ def FeatureR128A16 : SubtargetFeature<"r128-a16", "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" >; +def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", + "HasNSAEncoding", + "true", + "Support NSA encoding for image instructions" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -299,6 +377,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc", "Enable SRAM ECC" >; +def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", + "HasNoSdstCMPX", + "true", + "V_CMPX does not write VCC/SGPR in addition to EXEC" +>; + +def FeatureVscnt : SubtargetFeature<"vscnt", + "HasVscnt", + "true", + "Has separate store vscnt counter" +>; + +def FeatureRegisterBanking : SubtargetFeature<"register-banking", + "HasRegisterBanking", + "true", + "Has register banking" +>; + +def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", + "HasVOP3Literal", + "true", + "Can use one literal in VOP3" +>; + +def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", + "HasNoDataDepHazard", + "true", + "Does not need SW waitstates" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -487,7 +595,24 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, - FeatureScalarAtomics, FeatureR128A16 + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 + ] +>; + +def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", + "gfx10", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, + FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, + FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureVOP3Literal, FeatureNoDataDepHazard, + FeatureDoesNotSupportSRAMECC ] >; @@ -601,6 +726,34 @@ def FeatureISAVersion9_0_9 : FeatureSet< FeatureXNACK, FeatureCodeObjectV3]>; +// TODO: Organize more features into groups. +def FeatureGroup { + // Bugs present on gfx10.1. + list<SubtargetFeature> GFX10_1_Bugs = [ + FeatureVcmpxPermlaneHazard, + FeatureVMEMtoScalarWriteHazard, + FeatureSMEMtoVectorWriteHazard, + FeatureInstFwdPrefetchBug, + FeatureVcmpxExecWARHazard, + FeatureLdsBranchVmemWARHazard, + FeatureNSAtoVMEMBug, + FeatureFlatSegmentOffsetBug + ]; +} + +def FeatureISAVersion10_1_0 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureNSAEncoding, + FeatureWavefrontSize64, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureCodeObjectV3])>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -687,10 +840,21 @@ def isGFX6 : def isGFX6GFX7 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + +def isGFX6GFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<"!FeatureGCN3Encoding">; def isGFX7Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + +def isGFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">; def isGFX7GFX8GFX9 : @@ -699,6 +863,13 @@ def isGFX7GFX8GFX9 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; +def isGFX6GFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"!FeatureGFX10Insts">; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; @@ -724,6 +895,10 @@ def isGFX8GFX9 : "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; +def isGFX10Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"FeatureGFX10Insts">; + def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<"FeatureFlatAddressSpace">; @@ -731,6 +906,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, + AssemblerPredicate<"FeatureScalarFlatScratchInsts">; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; @@ -766,6 +943,10 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; +def HasSDWA10 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; + def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; @@ -778,9 +959,18 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; +def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, + AssemblerPredicate<"FeatureScalarStores">; + def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, AssemblerPredicate<"FeatureScalarAtomics">; +def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"FeatureNoSdstCMPX">; + +def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"!FeatureNoSdstCMPX">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d24c22ca930..2643cb05742 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -181,6 +181,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasApertureRegs(false), EnableXNACK(false), + EnableCuMode(false), TrapHandler(false), EnableHugePrivateBuffer(false), @@ -196,6 +197,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, CIInsts(false), GFX8Insts(false), GFX9Insts(false), + GFX10Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -212,20 +214,37 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAOutModsVOPC(false), HasDPP(false), HasR128A16(false), + HasNSAEncoding(false), HasDLInsts(false), HasDot1Insts(false), HasDot2Insts(false), EnableSRAMECC(false), DoesNotSupportSRAMECC(false), + HasNoSdstCMPX(false), + HasVscnt(false), + HasRegisterBanking(false), + HasVOP3Literal(false), + HasNoDataDepHazard(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), + ScalarFlatScratchInsts(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), + LDSMisalignedBug(false), ScalarizeGlobal(false), + HasVcmpxPermlaneHazard(false), + HasVMEMtoScalarWriteHazard(false), + HasSMEMtoVectorWriteHazard(false), + HasInstFwdPrefetchBug(false), + HasVcmpxExecWARHazard(false), + HasLdsBranchVmemWARHazard(false), + HasNSAtoVMEMBug(false), + HasFlatSegmentOffsetBug(false), + FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), @@ -243,6 +262,8 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, return getLocalMemorySize(); unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } @@ -251,6 +272,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); @@ -271,7 +294,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); + return std::make_pair(getWavefrontSize() * 2, + std::max(getWavefrontSize() * 4, 256u)); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: @@ -496,7 +520,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; +} + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 10; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; @@ -543,6 +574,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. + if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index df091655afd..f600fdde677 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -55,7 +55,8 @@ public: SOUTHERN_ISLANDS = 4, SEA_ISLANDS = 5, VOLCANIC_ISLANDS = 6, - GFX9 = 7 + GFX9 = 7, + GFX10 = 8 }; private: @@ -293,6 +294,7 @@ protected: bool UnalignedBufferAccess; bool HasApertureRegs; bool EnableXNACK; + bool EnableCuMode; bool TrapHandler; // Used as options. @@ -313,6 +315,7 @@ protected: bool CIInsts; bool GFX8Insts; bool GFX9Insts; + bool GFX10Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; @@ -329,24 +332,41 @@ protected: bool HasSDWAOutModsVOPC; bool HasDPP; bool HasR128A16; + bool HasNSAEncoding; bool HasDLInsts; bool HasDot1Insts; bool HasDot2Insts; bool EnableSRAMECC; bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; + bool HasVscnt; + bool HasRegisterBanking; + bool HasVOP3Literal; + bool HasNoDataDepHazard; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; + bool ScalarFlatScratchInsts; bool AddNoCarryInsts; bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; + bool LDSMisalignedBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; + bool HasVcmpxPermlaneHazard; + bool HasVMEMtoScalarWriteHazard; + bool HasSMEMtoVectorWriteHazard; + bool HasInstFwdPrefetchBug; + bool HasVcmpxExecWARHazard; + bool HasLdsBranchVmemWARHazard; + bool HasNSAtoVMEMBug; + bool HasFlatSegmentOffsetBug; + // Dummy feature to use for assembler in tablegen. bool FeatureDisable; @@ -583,6 +603,10 @@ public: return EnableXNACK; } + bool isCuModeEnabled() const { + return EnableCuMode; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } @@ -599,6 +623,14 @@ public: return FlatScratchInsts; } + bool hasScalarFlatScratchInsts() const { + return ScalarFlatScratchInsts; + } + + bool hasFlatSegmentOffsetBug() const { + return HasFlatSegmentOffsetBug; + } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } @@ -654,10 +686,6 @@ public: return HasSDWAOutModsVOPC; } - bool vmemWriteNeedsExpWaitcnt() const { - return getGeneration() < SEA_ISLANDS; - } - bool hasDLInsts() const { return HasDLInsts; } @@ -674,6 +702,30 @@ public: return EnableSRAMECC; } + bool hasNoSdstCMPX() const { + return HasNoSdstCMPX; + } + + bool hasVscnt() const { + return HasVscnt; + } + + bool hasRegisterBanking() const { + return HasRegisterBanking; + } + + bool hasVOP3Literal() const { + return HasVOP3Literal; + } + + bool hasNoDataDepHazard() const { + return HasNoDataDepHazard; + } + + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; + } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspecive of an arbitrary workitem, this // is 4-byte aligned. @@ -782,6 +834,12 @@ public: return HasR128A16; } + bool hasNSAEncoding() const { + return HasNSAEncoding; + } + + bool hasMadF16() const; + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -816,6 +874,38 @@ public: getGeneration() <= AMDGPUSubtarget::GFX9; } + bool hasVcmpxPermlaneHazard() const { + return HasVcmpxPermlaneHazard; + } + + bool hasVMEMtoScalarWriteHazard() const { + return HasVMEMtoScalarWriteHazard; + } + + bool hasSMEMtoVectorWriteHazard() const { + return HasSMEMtoVectorWriteHazard; + } + + bool hasLDSMisalignedBug() const { + return LDSMisalignedBug && !EnableCuMode; + } + + bool hasInstFwdPrefetchBug() const { + return HasInstFwdPrefetchBug; + } + + bool hasVcmpxExecWARHazard() const { + return HasVcmpxExecWARHazard; + } + + bool hasLdsBranchVmemWARHazard() const { + return HasLdsBranchVmemWARHazard; + } + + bool hasNSAtoVMEMBug() const { + return HasNSAtoVMEMBug; + } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 19af9d3ce84..52e5ab5f387 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -999,6 +999,10 @@ public: return AMDGPU::isGFX9(getSTI()); } + bool isGFX10() const { + return AMDGPU::isGFX10(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -1407,7 +1411,7 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg32(); - else if (AsmParser->isGFX9()) + else if (AsmParser->isGFX9() || AsmParser->isGFX10()) return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type); else return false; @@ -2953,7 +2957,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (getParser().parseIdentifier(KernelName)) return true; - kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI()); StringSet<> Seen; diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 021db78d48b..0de8feeeb46 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -164,3 +164,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel, FeatureISAVersion9_0_9.Features >; +//===----------------------------------------------------------------------===// +// GCN GFX10. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1010", GFX10SpeedModel, + FeatureISAVersion10_1_0.Features +>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5daf4ac6141..b40bda94ae6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -60,39 +60,40 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { AMDGPU::GPUKind AK; switch (ElfMach) { - case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; - case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; - case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; - case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; - case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; - case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; - case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; - case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; - case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; - case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; - case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; - case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; - case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; - case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; - case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; - case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; - case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; + case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; + case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; + case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; + case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; + case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; + case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; + case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; + case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; + case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; + case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; + case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; + case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; + case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; + case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; + case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; + case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } StringRef GPUName = getArchNameAMDGCN(AK); @@ -139,6 +140,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -324,6 +326,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (IVersion.Major >= 10) { + PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); + PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); + PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index e645bd0d54a..3e66e6be48d 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -523,6 +523,15 @@ enum DppCtrl : unsigned { #define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) #define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) #define C_00B848_IEEE_MODE 0xFF7FFFFF +#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29) +#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1) +#define C_00B848_WGP_MODE 0xDFFFFFFF +#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30) +#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1) +#define C_00B848_MEM_ORDERED 0xBFFFFFFF +#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31) +#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1) +#define C_00B848_FWD_PROGRESS 0x7FFFFFFF // Helpers for setting FLOAT_MODE @@ -553,6 +562,15 @@ enum DppCtrl : unsigned { #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 +#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) +#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22) +#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23) +#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8 +#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15) +#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 +#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15) + #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8ce3fd2fd58..d9dc000827b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5591,7 +5591,9 @@ enum SIEncodingFamily { SDWA = 2, SDWA9 = 3, GFX80 = 4, - GFX9 = 5 + GFX9 = 5, + GFX10 = 6, + SDWA10 = 7 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -5604,6 +5606,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { case AMDGPUSubtarget::VOLCANIC_ISLANDS: case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; + case AMDGPUSubtarget::GFX10: + return SIEncodingFamily::GFX10; } llvm_unreachable("Unknown subtarget generation!"); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b5a683c1ee3..260fc6156bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -23,6 +23,8 @@ def SIEncodingFamily { int SDWA9 = 3; int GFX80 = 4; int GFX9 = 5; + int GFX10 = 6; + int SDWA10 = 7; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2f2dc4b41c9..948f870318e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -112,9 +112,9 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, } foreach Index = 0-15 in { - def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; - def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; - def TTMP#Index : SIReg<"", 0>; + def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; + def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; + def TTMP#Index : SIReg<"", 0>; } multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { @@ -311,8 +311,8 @@ class TmpRegTuples<string tgt, getSubRegs<size>.ret>; foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { - def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; - def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>; } foreach Index = {0, 4, 8, 12} in { @@ -321,7 +321,7 @@ foreach Index = {0, 4, 8, 12} in { _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>; def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; + _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>; } foreach Index = {0, 4, 8} in { @@ -338,7 +338,7 @@ foreach Index = {0, 4, 8} in { _TTMP#!add(Index,4)# _TTMP#!add(Index,5)# _TTMP#!add(Index,6)# - _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; + _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>; } def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : @@ -348,12 +348,12 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi, TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; -def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 : TmpRegTuplesBase<0, 16, - [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, - TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, - TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, - TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; + [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10, + TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10, + TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, + TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; // VGPR 32-bit registers diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 4c75a16ce3a..e3066df12d0 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -37,6 +37,9 @@ def WriteDouble : SchedWrite; // half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Conversion to or from f64 instruction +def WriteDoubleCvt : SchedWrite; + // Half rate 64-bit instructions. def Write64Bit : SchedWrite; @@ -61,6 +64,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? def HWBranch : ProcResource<1> { @@ -81,6 +85,9 @@ def HWVMEM : ProcResource<1> { def HWVALU : ProcResource<1> { let BufferSize = 1; } +def HWRC : ProcResource<1> { // Register destination cache + let BufferSize = 1; +} class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, int latency> : WriteRes<write, resources> { @@ -124,6 +131,7 @@ defm : SICommonWriteRes; def : HWVALUWriteRes<WriteFloatFMA, 1>; def : HWVALUWriteRes<WriteDouble, 4>; def : HWVALUWriteRes<WriteDoubleAdd, 2>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -136,7 +144,32 @@ defm : SICommonWriteRes; def : HWVALUWriteRes<WriteFloatFMA, 16>; def : HWVALUWriteRes<WriteDouble, 16>; def : HWVALUWriteRes<WriteDoubleAdd, 8>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = SIQuarterSpeedModel + +let SchedModel = GFX10SpeedModel in { + +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; +def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>; +def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>; +def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>; + +def : HWWriteRes<WriteBranch, [HWBranch], 32>; +def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>; +def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>; +def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>; +def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; +def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX10SpeedModel diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 819c06df158..7d34e4f737a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -435,11 +435,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.kernarg_segment_alignment = 4; Header.group_segment_alignment = 4; Header.private_segment_alignment = 4; + + if (Version.Major >= 10) { + Header.compute_pgm_resource_registers |= + S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | + S_00B848_MEM_ORDERED(1); + } } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); + amdhsa::kernel_descriptor_t KD; memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); @@ -449,6 +459,13 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + if (Version.Major >= 10) { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, + STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); + } return KD; } @@ -679,6 +696,10 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX10(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } @@ -704,46 +725,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_CI_VI(FLAT_SCR) \ CASE_CI_VI(FLAT_SCR_LO) \ CASE_CI_VI(FLAT_SCR_HI) \ - CASE_VI_GFX9(TTMP0) \ - CASE_VI_GFX9(TTMP1) \ - CASE_VI_GFX9(TTMP2) \ - CASE_VI_GFX9(TTMP3) \ - CASE_VI_GFX9(TTMP4) \ - CASE_VI_GFX9(TTMP5) \ - CASE_VI_GFX9(TTMP6) \ - CASE_VI_GFX9(TTMP7) \ - CASE_VI_GFX9(TTMP8) \ - CASE_VI_GFX9(TTMP9) \ - CASE_VI_GFX9(TTMP10) \ - CASE_VI_GFX9(TTMP11) \ - CASE_VI_GFX9(TTMP12) \ - CASE_VI_GFX9(TTMP13) \ - CASE_VI_GFX9(TTMP14) \ - CASE_VI_GFX9(TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1) \ - CASE_VI_GFX9(TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5) \ - CASE_VI_GFX9(TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9) \ - CASE_VI_GFX9(TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13) \ - CASE_VI_GFX9(TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0) \ + CASE_VI_GFX9_GFX10(TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2) \ + CASE_VI_GFX9_GFX10(TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4) \ + CASE_VI_GFX9_GFX10(TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6) \ + CASE_VI_GFX9_GFX10(TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8) \ + CASE_VI_GFX9_GFX10(TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10) \ + CASE_VI_GFX9_GFX10(TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12) \ + CASE_VI_GFX9_GFX10(TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14) \ + CASE_VI_GFX9_GFX10(TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ } #define CASE_CI_VI(node) \ assert(!isSI(STI)); \ case node: return isCI(STI) ? node##_ci : node##_vi; -#define CASE_VI_GFX9(node) \ - case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; +#define CASE_VI_GFX9_GFX10(node) \ + case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) @@ -752,17 +773,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; -#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; +#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node; unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -1030,5 +1051,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); bool isIntrinsicSourceOfDivergence(unsigned IntrID) { return lookupSourceOfDivergence(IntrID); } + } // namespace AMDGPU } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2943722963a..cad2d4f25da 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -244,7 +244,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI); bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); @@ -398,6 +399,7 @@ bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX10(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 8efe6f6741e..2d7857ed92b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -82,6 +82,9 @@ COMPPGM1(priv, compute_pgm_rsrc1_priv, PRIV COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP), COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE), COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE), +COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE), +COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED), +COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS), // TODO: bulky // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), |