diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 192 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 36 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 100 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNProcessors.td | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 79 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIDefines.h | 18 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 24 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SISchedule.td | 33 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 98 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h | 3 | 
14 files changed, 516 insertions, 94 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 4e3893d38b2..663569e67ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -60,6 +60,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",    "Have scratch_* flat memory instructions"  >; +def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", +  "ScalarFlatScratchInsts", +  "true", +  "Have s_scratch_* flat memory instructions" +>; +  def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",    "AddNoCarryInsts",    "true", @@ -115,12 +121,72 @@ def FeatureXNACK : SubtargetFeature<"xnack",    "Enable XNACK support"  >; +def FeatureCuMode : SubtargetFeature<"cumode", +  "EnableCuMode", +  "true", +  "Enable CU wavefront execution mode" +>; +  def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",    "SGPRInitBug",    "true",    "VI SGPR initialization bug requiring a fixed SGPR allocation size"  >; +def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", +  "LDSMisalignedBug", +  "true", +  "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" +>; + +def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", +  "HasVcmpxPermlaneHazard", +  "true", +  "TODO: describe me" +>; + +def FeatureVMEMtoScalarWriteHazard : SubtargetFeature<"vmem-to-scalar-write-hazard", +  "HasVMEMtoScalarWriteHazard", +  "true", +  "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +>; + +def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", +  "HasSMEMtoVectorWriteHazard", +  "true", +  "s_load_dword followed by v_cmp page faults" +>; + +def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", +  "HasInstFwdPrefetchBug", +  "true", +  "S_INST_PREFETCH instruction causes shader to hang" +>; + +def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", +  "HasVcmpxExecWARHazard", +  "true", +  "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +>; + +def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", +  "HasLdsBranchVmemWARHazard", +  "true", +  "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +>; + +def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", +  "HasNSAtoVMEMBug", +  "true", +  "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +>; + +def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", +  "HasFlatSegmentOffsetBug", +  "true", +  "GFX10 bug, inst_offset ignored in flat segment" +>; +  class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <    "ldsbankcount"#Value,    "LDSBankCount", @@ -155,6 +221,12 @@ def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",    "Additional instructions for GFX9+"  >; +def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", +  "GFX10Insts", +  "true", +  "Additional instructions for GFX10+" +>; +  def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",    "GFX7GFX8GFX9Insts",    "true", @@ -257,6 +329,12 @@ def FeatureR128A16 : SubtargetFeature<"r128-a16",    "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"  >; +def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", +  "HasNSAEncoding", +  "true", +  "Support NSA encoding for image instructions" +>; +  def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",    "HasIntClamp",    "true", @@ -299,6 +377,36 @@ def FeatureSRAMECC : SubtargetFeature<"sram-ecc",    "Enable SRAM ECC"  >; +def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", +  "HasNoSdstCMPX", +  "true", +  "V_CMPX does not write VCC/SGPR in addition to EXEC" +>; + +def FeatureVscnt : SubtargetFeature<"vscnt", +  "HasVscnt", +  "true", +  "Has separate store vscnt counter" +>; + +def FeatureRegisterBanking : SubtargetFeature<"register-banking", +  "HasRegisterBanking", +  "true", +  "Has register banking" +>; + +def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", +  "HasVOP3Literal", +  "true", +  "Can use one literal in VOP3" +>; + +def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", +  "HasNoDataDepHazard", +  "true", +  "Does not need SW waitstates" +>; +  //===------------------------------------------------------------===//  // Subtarget Features (options and debugging)  //===------------------------------------------------------------===// @@ -487,7 +595,24 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",     FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,     FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,     FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, -   FeatureScalarAtomics, FeatureR128A16 +   FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 +  ] +>; + +def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", +  "gfx10", +  [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, +   FeatureFlatAddressSpace, +   FeatureCIInsts, Feature16BitInsts, +   FeatureSMemRealTime, FeatureInv2PiInlineImm, +   FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, +   FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, +   FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, +   FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, +   FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, +   FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, +   FeatureVOP3Literal, FeatureNoDataDepHazard, +   FeatureDoesNotSupportSRAMECC    ]  >; @@ -601,6 +726,34 @@ def FeatureISAVersion9_0_9 : FeatureSet<     FeatureXNACK,     FeatureCodeObjectV3]>; +// TODO: Organize more features into groups. +def FeatureGroup { +  // Bugs present on gfx10.1. +  list<SubtargetFeature> GFX10_1_Bugs = [ +    FeatureVcmpxPermlaneHazard, +    FeatureVMEMtoScalarWriteHazard, +    FeatureSMEMtoVectorWriteHazard, +    FeatureInstFwdPrefetchBug, +    FeatureVcmpxExecWARHazard, +    FeatureLdsBranchVmemWARHazard, +    FeatureNSAtoVMEMBug, +    FeatureFlatSegmentOffsetBug +   ]; +} + +def FeatureISAVersion10_1_0 : FeatureSet< +  !listconcat(FeatureGroup.GFX10_1_Bugs, +    [FeatureGFX10, +     FeatureLDSBankCount32, +     FeatureDLInsts, +     FeatureNSAEncoding, +     FeatureWavefrontSize64, +     FeatureScalarStores, +     FeatureScalarAtomics, +     FeatureScalarFlatScratchInsts, +     FeatureLdsMisalignedBug, +     FeatureCodeObjectV3])>; +  //===----------------------------------------------------------------------===//  def AMDGPUInstrInfo : InstrInfo { @@ -687,10 +840,21 @@ def isGFX6 :  def isGFX6GFX7 :    Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"              "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, +  AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + +def isGFX6GFX7GFX10 : +  Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,    AssemblerPredicate<"!FeatureGCN3Encoding">;  def isGFX7Only :    Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, +  AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + +def isGFX7GFX10 : +  Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,    AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;  def isGFX7GFX8GFX9 : @@ -699,6 +863,13 @@ def isGFX7GFX8GFX9 :              "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,    AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; +def isGFX6GFX7GFX8GFX9 : +  Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" +            "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, +  AssemblerPredicate<"!FeatureGFX10Insts">; +  def isGFX7Plus :    Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,    AssemblerPredicate<"FeatureCIInsts">; @@ -724,6 +895,10 @@ def isGFX8GFX9 :              "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,    AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; +def isGFX10Plus : +  Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, +  AssemblerPredicate<"FeatureGFX10Insts">; +  def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,    AssemblerPredicate<"FeatureFlatAddressSpace">; @@ -731,6 +906,8 @@ def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,    AssemblerPredicate<"FeatureFlatGlobalInsts">;  def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,    AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, +  AssemblerPredicate<"FeatureScalarFlatScratchInsts">;  def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,    AssemblerPredicate<"FeatureGFX9Insts">; @@ -766,6 +943,10 @@ def HasSDWA9 :    Predicate<"Subtarget->hasSDWA()">,    AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; +def HasSDWA10 : +  Predicate<"Subtarget->hasSDWA()">, +  AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; +  def HasDPP : Predicate<"Subtarget->hasDPP()">,    AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; @@ -778,9 +959,18 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,  def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,    AssemblerPredicate<"FeatureMadMixInsts">; +def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, +  AssemblerPredicate<"FeatureScalarStores">; +  def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,    AssemblerPredicate<"FeatureScalarAtomics">; +def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, +  AssemblerPredicate<"FeatureNoSdstCMPX">; + +def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">, +  AssemblerPredicate<"!FeatureNoSdstCMPX">; +  def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;  def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;  def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d24c22ca930..2643cb05742 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -181,6 +181,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,      HasApertureRegs(false),      EnableXNACK(false), +    EnableCuMode(false),      TrapHandler(false),      EnableHugePrivateBuffer(false), @@ -196,6 +197,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,      CIInsts(false),      GFX8Insts(false),      GFX9Insts(false), +    GFX10Insts(false),      GFX7GFX8GFX9Insts(false),      SGPRInitBug(false),      HasSMemRealTime(false), @@ -212,20 +214,37 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,      HasSDWAOutModsVOPC(false),      HasDPP(false),      HasR128A16(false), +    HasNSAEncoding(false),      HasDLInsts(false),      HasDot1Insts(false),      HasDot2Insts(false),      EnableSRAMECC(false),      DoesNotSupportSRAMECC(false), +    HasNoSdstCMPX(false), +    HasVscnt(false), +    HasRegisterBanking(false), +    HasVOP3Literal(false), +    HasNoDataDepHazard(false),      FlatAddressSpace(false),      FlatInstOffsets(false),      FlatGlobalInsts(false),      FlatScratchInsts(false), +    ScalarFlatScratchInsts(false),      AddNoCarryInsts(false),      HasUnpackedD16VMem(false), +    LDSMisalignedBug(false),      ScalarizeGlobal(false), +    HasVcmpxPermlaneHazard(false), +    HasVMEMtoScalarWriteHazard(false), +    HasSMEMtoVectorWriteHazard(false), +    HasInstFwdPrefetchBug(false), +    HasVcmpxExecWARHazard(false), +    HasLdsBranchVmemWARHazard(false), +    HasNSAtoVMEMBug(false), +    HasFlatSegmentOffsetBug(false), +      FeatureDisable(false),      InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),      TLInfo(TM, *this), @@ -243,6 +262,8 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,      return getLocalMemorySize();    unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;    unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); +  if (!WorkGroupsPerCu) +    return 0;    unsigned MaxWaves = getMaxWavesPerEU();    return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;  } @@ -251,6 +272,8 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,    const Function &F) const {    unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;    unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); +  if (!WorkGroupsPerCu) +    return 0;    unsigned MaxWaves = getMaxWavesPerEU();    unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;    unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); @@ -271,7 +294,8 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {    case CallingConv::AMDGPU_CS:    case CallingConv::AMDGPU_KERNEL:    case CallingConv::SPIR_KERNEL: -    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); +    return std::make_pair(getWavefrontSize() * 2, +                          std::max(getWavefrontSize() * 4, 256u));    case CallingConv::AMDGPU_VS:    case CallingConv::AMDGPU_LS:    case CallingConv::AMDGPU_HS: @@ -496,7 +520,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,      Policy.ShouldTrackLaneMasks = true;  } +bool GCNSubtarget::hasMadF16() const { +  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; +} +  unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { +  if (getGeneration() >= AMDGPUSubtarget::GFX10) +    return 10; +    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {      if (SGPRs <= 80)        return 10; @@ -543,6 +574,9 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {  unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {    const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); +  if (getGeneration() >= AMDGPUSubtarget::GFX10) +    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. +    if (MFI.hasFlatScratchInit()) {      if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)        return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index df091655afd..f600fdde677 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -55,7 +55,8 @@ public:      SOUTHERN_ISLANDS = 4,      SEA_ISLANDS = 5,      VOLCANIC_ISLANDS = 6, -    GFX9 = 7 +    GFX9 = 7, +    GFX10 = 8    };  private: @@ -293,6 +294,7 @@ protected:    bool UnalignedBufferAccess;    bool HasApertureRegs;    bool EnableXNACK; +  bool EnableCuMode;    bool TrapHandler;    // Used as options. @@ -313,6 +315,7 @@ protected:    bool CIInsts;    bool GFX8Insts;    bool GFX9Insts; +  bool GFX10Insts;    bool GFX7GFX8GFX9Insts;    bool SGPRInitBug;    bool HasSMemRealTime; @@ -329,24 +332,41 @@ protected:    bool HasSDWAOutModsVOPC;    bool HasDPP;    bool HasR128A16; +  bool HasNSAEncoding;    bool HasDLInsts;    bool HasDot1Insts;    bool HasDot2Insts;    bool EnableSRAMECC;    bool DoesNotSupportSRAMECC; +  bool HasNoSdstCMPX; +  bool HasVscnt; +  bool HasRegisterBanking; +  bool HasVOP3Literal; +  bool HasNoDataDepHazard;    bool FlatAddressSpace;    bool FlatInstOffsets;    bool FlatGlobalInsts;    bool FlatScratchInsts; +  bool ScalarFlatScratchInsts;    bool AddNoCarryInsts;    bool HasUnpackedD16VMem;    bool R600ALUInst;    bool CaymanISA;    bool CFALUBug; +  bool LDSMisalignedBug;    bool HasVertexCache;    short TexVTXClauseSize;    bool ScalarizeGlobal; +  bool HasVcmpxPermlaneHazard; +  bool HasVMEMtoScalarWriteHazard; +  bool HasSMEMtoVectorWriteHazard; +  bool HasInstFwdPrefetchBug; +  bool HasVcmpxExecWARHazard; +  bool HasLdsBranchVmemWARHazard; +  bool HasNSAtoVMEMBug; +  bool HasFlatSegmentOffsetBug; +    // Dummy feature to use for assembler in tablegen.    bool FeatureDisable; @@ -583,6 +603,10 @@ public:      return EnableXNACK;    } +  bool isCuModeEnabled() const { +    return EnableCuMode; +  } +    bool hasFlatAddressSpace() const {      return FlatAddressSpace;    } @@ -599,6 +623,14 @@ public:      return FlatScratchInsts;    } +  bool hasScalarFlatScratchInsts() const { +    return ScalarFlatScratchInsts; +  } + +  bool hasFlatSegmentOffsetBug() const { +    return HasFlatSegmentOffsetBug; +  } +    bool hasFlatLgkmVMemCountInOrder() const {      return getGeneration() > GFX9;    } @@ -654,10 +686,6 @@ public:      return HasSDWAOutModsVOPC;    } -  bool vmemWriteNeedsExpWaitcnt() const { -    return getGeneration() < SEA_ISLANDS; -  } -    bool hasDLInsts() const {      return HasDLInsts;    } @@ -674,6 +702,30 @@ public:      return EnableSRAMECC;    } +  bool hasNoSdstCMPX() const { +    return HasNoSdstCMPX; +  } + +  bool hasVscnt() const { +    return HasVscnt; +  } + +  bool hasRegisterBanking() const { +    return HasRegisterBanking; +  } + +  bool hasVOP3Literal() const { +    return HasVOP3Literal; +  } + +  bool hasNoDataDepHazard() const { +    return HasNoDataDepHazard; +  } + +  bool vmemWriteNeedsExpWaitcnt() const { +    return getGeneration() < SEA_ISLANDS; +  } +    // Scratch is allocated in 256 dword per wave blocks for the entire    // wavefront. When viewed from the perspecive of an arbitrary workitem, this    // is 4-byte aligned. @@ -782,6 +834,12 @@ public:      return HasR128A16;    } +  bool hasNSAEncoding() const { +    return HasNSAEncoding; +  } + +  bool hasMadF16() const; +    bool enableSIScheduler() const {      return EnableSIScheduler;    } @@ -816,6 +874,38 @@ public:             getGeneration() <= AMDGPUSubtarget::GFX9;    } +  bool hasVcmpxPermlaneHazard() const { +    return HasVcmpxPermlaneHazard; +  } + +  bool hasVMEMtoScalarWriteHazard() const { +    return HasVMEMtoScalarWriteHazard; +  } + +  bool hasSMEMtoVectorWriteHazard() const { +    return HasSMEMtoVectorWriteHazard; +  } + +  bool hasLDSMisalignedBug() const { +    return LDSMisalignedBug && !EnableCuMode; +  } + +  bool hasInstFwdPrefetchBug() const { +    return HasInstFwdPrefetchBug; +  } + +  bool hasVcmpxExecWARHazard() const { +    return HasVcmpxExecWARHazard; +  } + +  bool hasLdsBranchVmemWARHazard() const { +    return HasLdsBranchVmemWARHazard; +  } + +  bool hasNSAtoVMEMBug() const { +    return HasNSAtoVMEMBug; +  } +    /// Return the maximum number of waves per SIMD for kernels using \p SGPRs    /// SGPRs    unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 19af9d3ce84..52e5ab5f387 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -999,6 +999,10 @@ public:      return AMDGPU::isGFX9(getSTI());    } +  bool isGFX10() const { +    return AMDGPU::isGFX10(getSTI()); +  } +    bool hasInv2PiInlineImm() const {      return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];    } @@ -1407,7 +1411,7 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {  bool AMDGPUOperand::isSDWAOperand(MVT type) const {    if (AsmParser->isVI())      return isVReg32(); -  else if (AsmParser->isGFX9()) +  else if (AsmParser->isGFX9() || AsmParser->isGFX10())      return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type);    else      return false; @@ -2953,7 +2957,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {    if (getParser().parseIdentifier(KernelName))      return true; -  kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); +  kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI());    StringSet<> Seen; diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 021db78d48b..0de8feeeb46 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -164,3 +164,10 @@ def : ProcessorModel<"gfx909", SIQuarterSpeedModel,    FeatureISAVersion9_0_9.Features  >; +//===----------------------------------------------------------------------===// +// GCN GFX10. +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1010", GFX10SpeedModel, +  FeatureISAVersion10_1_0.Features +>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 5daf4ac6141..b40bda94ae6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -60,39 +60,40 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {    AMDGPU::GPUKind AK;    switch (ElfMach) { -  case ELF::EF_AMDGPU_MACH_R600_R600:     AK = GK_R600;    break; -  case ELF::EF_AMDGPU_MACH_R600_R630:     AK = GK_R630;    break; -  case ELF::EF_AMDGPU_MACH_R600_RS880:    AK = GK_RS880;   break; -  case ELF::EF_AMDGPU_MACH_R600_RV670:    AK = GK_RV670;   break; -  case ELF::EF_AMDGPU_MACH_R600_RV710:    AK = GK_RV710;   break; -  case ELF::EF_AMDGPU_MACH_R600_RV730:    AK = GK_RV730;   break; -  case ELF::EF_AMDGPU_MACH_R600_RV770:    AK = GK_RV770;   break; -  case ELF::EF_AMDGPU_MACH_R600_CEDAR:    AK = GK_CEDAR;   break; -  case ELF::EF_AMDGPU_MACH_R600_CYPRESS:  AK = GK_CYPRESS; break; -  case ELF::EF_AMDGPU_MACH_R600_JUNIPER:  AK = GK_JUNIPER; break; -  case ELF::EF_AMDGPU_MACH_R600_REDWOOD:  AK = GK_REDWOOD; break; -  case ELF::EF_AMDGPU_MACH_R600_SUMO:     AK = GK_SUMO;    break; -  case ELF::EF_AMDGPU_MACH_R600_BARTS:    AK = GK_BARTS;   break; -  case ELF::EF_AMDGPU_MACH_R600_CAICOS:   AK = GK_CAICOS;  break; -  case ELF::EF_AMDGPU_MACH_R600_CAYMAN:   AK = GK_CAYMAN;  break; -  case ELF::EF_AMDGPU_MACH_R600_TURKS:    AK = GK_TURKS;   break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906;  break; -  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909;  break; -  case ELF::EF_AMDGPU_MACH_NONE:          AK = GK_NONE;    break; +  case ELF::EF_AMDGPU_MACH_R600_R600:      AK = GK_R600;    break; +  case ELF::EF_AMDGPU_MACH_R600_R630:      AK = GK_R630;    break; +  case ELF::EF_AMDGPU_MACH_R600_RS880:     AK = GK_RS880;   break; +  case ELF::EF_AMDGPU_MACH_R600_RV670:     AK = GK_RV670;   break; +  case ELF::EF_AMDGPU_MACH_R600_RV710:     AK = GK_RV710;   break; +  case ELF::EF_AMDGPU_MACH_R600_RV730:     AK = GK_RV730;   break; +  case ELF::EF_AMDGPU_MACH_R600_RV770:     AK = GK_RV770;   break; +  case ELF::EF_AMDGPU_MACH_R600_CEDAR:     AK = GK_CEDAR;   break; +  case ELF::EF_AMDGPU_MACH_R600_CYPRESS:   AK = GK_CYPRESS; break; +  case ELF::EF_AMDGPU_MACH_R600_JUNIPER:   AK = GK_JUNIPER; break; +  case ELF::EF_AMDGPU_MACH_R600_REDWOOD:   AK = GK_REDWOOD; break; +  case ELF::EF_AMDGPU_MACH_R600_SUMO:      AK = GK_SUMO;    break; +  case ELF::EF_AMDGPU_MACH_R600_BARTS:     AK = GK_BARTS;   break; +  case ELF::EF_AMDGPU_MACH_R600_CAICOS:    AK = GK_CAICOS;  break; +  case ELF::EF_AMDGPU_MACH_R600_CAYMAN:    AK = GK_CAYMAN;  break; +  case ELF::EF_AMDGPU_MACH_R600_TURKS:     AK = GK_TURKS;   break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600:  AK = GK_GFX600;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601:  AK = GK_GFX601;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700:  AK = GK_GFX700;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701:  AK = GK_GFX701;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702:  AK = GK_GFX702;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703:  AK = GK_GFX703;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704:  AK = GK_GFX704;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801:  AK = GK_GFX801;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802:  AK = GK_GFX802;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803:  AK = GK_GFX803;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810:  AK = GK_GFX810;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900:  AK = GK_GFX900;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902:  AK = GK_GFX902;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904:  AK = GK_GFX904;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906:  AK = GK_GFX906;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909:  AK = GK_GFX909;  break; +  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; +  case ELF::EF_AMDGPU_MACH_NONE:           AK = GK_NONE;    break;    }    StringRef GPUName = getArchNameAMDGCN(AK); @@ -139,6 +140,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {    case GK_GFX904:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;    case GK_GFX906:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;    case GK_GFX909:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; +  case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;    case GK_NONE:    return ELF::EF_AMDGPU_MACH_NONE;    } @@ -324,6 +326,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(      PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,                  compute_pgm_rsrc1,                  amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); +  if (IVersion.Major >= 10) { +    PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, +                compute_pgm_rsrc1, +                amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); +    PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, +                compute_pgm_rsrc1, +                amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); +    PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, +                compute_pgm_rsrc1, +                amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); +  }    PRINT_FIELD(        OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,        compute_pgm_rsrc2, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index e645bd0d54a..3e66e6be48d 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -523,6 +523,15 @@ enum DppCtrl : unsigned {  #define   S_00B848_IEEE_MODE(x)                                       (((x) & 0x1) << 23)  #define   G_00B848_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)  #define   C_00B848_IEEE_MODE                                          0xFF7FFFFF +#define   S_00B848_WGP_MODE(x)                                        (((x) & 0x1) << 29) +#define   G_00B848_WGP_MODE(x)                                        (((x) >> 29) & 0x1) +#define   C_00B848_WGP_MODE                                           0xDFFFFFFF +#define   S_00B848_MEM_ORDERED(x)                                     (((x) & 0x1) << 30) +#define   G_00B848_MEM_ORDERED(x)                                     (((x) >> 30) & 0x1) +#define   C_00B848_MEM_ORDERED                                        0xBFFFFFFF +#define   S_00B848_FWD_PROGRESS(x)                                    (((x) & 0x1) << 31) +#define   G_00B848_FWD_PROGRESS(x)                                    (((x) >> 31) & 0x1) +#define   C_00B848_FWD_PROGRESS                                       0x7FFFFFFF  // Helpers for setting FLOAT_MODE @@ -553,6 +562,15 @@ enum DppCtrl : unsigned {  #define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8  #define   S_0286E8_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12) +#define R_028B54_VGT_SHADER_STAGES_EN                                 0x028B54 +#define   S_028B54_HS_W32_EN(x)                                       (((x) & 0x1) << 21) +#define   S_028B54_GS_W32_EN(x)                                       (((x) & 0x1) << 22) +#define   S_028B54_VS_W32_EN(x)                                       (((x) & 0x1) << 23) +#define R_0286D8_SPI_PS_IN_CONTROL                                    0x0286D8 +#define   S_0286D8_PS_W32_EN(x)                                       (((x) & 0x1) << 15) +#define R_00B800_COMPUTE_DISPATCH_INITIATOR                           0x00B800 +#define   S_00B800_CS_W32_EN(x)                                       (((x) & 0x1) << 15) +  #define R_SPILLED_SGPRS         0x4  #define R_SPILLED_VGPRS         0x8  } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8ce3fd2fd58..d9dc000827b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5591,7 +5591,9 @@ enum SIEncodingFamily {    SDWA = 2,    SDWA9 = 3,    GFX80 = 4, -  GFX9 = 5 +  GFX9 = 5, +  GFX10 = 6, +  SDWA10 = 7  };  static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -5604,6 +5606,8 @@ static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {    case AMDGPUSubtarget::VOLCANIC_ISLANDS:    case AMDGPUSubtarget::GFX9:      return SIEncodingFamily::VI; +  case AMDGPUSubtarget::GFX10: +    return SIEncodingFamily::GFX10;    }    llvm_unreachable("Unknown subtarget generation!");  } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b5a683c1ee3..260fc6156bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -23,6 +23,8 @@ def SIEncodingFamily {    int SDWA9 = 3;    int GFX80 = 4;    int GFX9 = 5; +  int GFX10 = 6; +  int SDWA10 = 7;  }  //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2f2dc4b41c9..948f870318e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -112,9 +112,9 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,  }  foreach Index = 0-15 in { -  def TTMP#Index#_vi   : SIReg<"ttmp"#Index, !add(112, Index)>; -  def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; -  def TTMP#Index       : SIReg<"", 0>; +  def TTMP#Index#_vi         : SIReg<"ttmp"#Index, !add(112, Index)>; +  def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; +  def TTMP#Index             : SIReg<"", 0>;  }  multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { @@ -311,8 +311,8 @@ class TmpRegTuples<string tgt,                     getSubRegs<size>.ret>;  foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { -  def TTMP#Index#_TTMP#!add(Index,1)#_vi   : TmpRegTuples<"_vi",   2, Index>; -  def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; +  def TTMP#Index#_TTMP#!add(Index,1)#_vi         : TmpRegTuples<"_vi",   2, Index>; +  def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>;  }  foreach Index = {0, 4, 8, 12} in { @@ -321,7 +321,7 @@ foreach Index = {0, 4, 8, 12} in {                   _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi",   4, Index>;    def TTMP#Index#_TTMP#!add(Index,1)#                   _TTMP#!add(Index,2)# -                 _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; +                 _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>;  }  foreach Index = {0, 4, 8} in { @@ -338,7 +338,7 @@ foreach Index = {0, 4, 8} in {                   _TTMP#!add(Index,4)#                   _TTMP#!add(Index,5)#                   _TTMP#!add(Index,6)# -                 _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; +                 _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>;  }  def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : @@ -348,12 +348,12 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT                      TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi,                      TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; -def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 :    TmpRegTuplesBase<0, 16, -                   [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, -                    TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, -                    TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, -                    TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; +                   [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10, +                    TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10, +                    TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, +                    TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;  // VGPR 32-bit registers diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 4c75a16ce3a..e3066df12d0 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -37,6 +37,9 @@ def WriteDouble : SchedWrite;  // half rate f64 instruction (same as v_add_f64)  def WriteDoubleAdd  : SchedWrite; +// Conversion to or from f64 instruction +def WriteDoubleCvt  : SchedWrite; +  // Half rate 64-bit instructions.  def Write64Bit : SchedWrite; @@ -61,6 +64,7 @@ class SISchedMachineModel : SchedMachineModel {  def SIFullSpeedModel : SISchedMachineModel;  def SIQuarterSpeedModel : SISchedMachineModel; +def GFX10SpeedModel : SISchedMachineModel;  // XXX: Are the resource counts correct?  def HWBranch : ProcResource<1> { @@ -81,6 +85,9 @@ def HWVMEM   : ProcResource<1> {  def HWVALU   : ProcResource<1> {    let BufferSize = 1;  } +def HWRC   : ProcResource<1> { // Register destination cache +  let BufferSize = 1; +}  class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,                   int latency> : WriteRes<write, resources> { @@ -124,6 +131,7 @@ defm : SICommonWriteRes;  def : HWVALUWriteRes<WriteFloatFMA,   1>;  def : HWVALUWriteRes<WriteDouble,     4>;  def : HWVALUWriteRes<WriteDoubleAdd,  2>; +def : HWVALUWriteRes<WriteDoubleCvt,  4>;  def : InstRW<[WriteCopy], (instrs COPY)>; @@ -136,7 +144,32 @@ defm : SICommonWriteRes;  def : HWVALUWriteRes<WriteFloatFMA, 16>;  def : HWVALUWriteRes<WriteDouble,   16>;  def : HWVALUWriteRes<WriteDoubleAdd, 8>; +def : HWVALUWriteRes<WriteDoubleCvt, 4>;  def : InstRW<[WriteCopy], (instrs COPY)>;  }  // End SchedModel = SIQuarterSpeedModel + +let SchedModel = GFX10SpeedModel in { + +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +def : HWWriteRes<Write32Bit,         [HWVALU, HWRC],   5>; +def : HWWriteRes<Write64Bit,         [HWVALU, HWRC],   9>; +def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC],   17>; +def : HWWriteRes<WriteFloatFMA,      [HWVALU, HWRC],   5>; +def : HWWriteRes<WriteDouble,        [HWVALU, HWRC],   17>; +def : HWWriteRes<WriteDoubleAdd,     [HWVALU, HWRC],   17>; +def : HWWriteRes<WriteDoubleCvt,     [HWVALU, HWRC],   17>; + +def : HWWriteRes<WriteBranch,        [HWBranch],       32>; +def : HWWriteRes<WriteExport,        [HWExport, HWRC], 16>; +def : HWWriteRes<WriteLDS,           [HWLGKM,   HWRC], 20>; +def : HWWriteRes<WriteSALU,          [HWSALU,   HWRC], 5>; +def : HWWriteRes<WriteSMEM,          [HWLGKM,   HWRC], 20>; +def : HWWriteRes<WriteVMEM,          [HWVMEM,   HWRC], 320>; +def : HWWriteRes<WriteBarrier,       [HWBranch],       2000>; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +}  // End SchedModel = GFX10SpeedModel diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 819c06df158..7d34e4f737a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -435,11 +435,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,    Header.kernarg_segment_alignment = 4;    Header.group_segment_alignment = 4;    Header.private_segment_alignment = 4; + +  if (Version.Major >= 10) { +    Header.compute_pgm_resource_registers |= +      S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | +      S_00B848_MEM_ORDERED(1); +  }  } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( +    const MCSubtargetInfo *STI) { +  IsaVersion Version = getIsaVersion(STI->getCPU()); +    amdhsa::kernel_descriptor_t KD;    memset(&KD, 0, sizeof(KD)); +    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,                    amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,                    amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); @@ -449,6 +459,13 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {                    amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);    AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,                    amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); +  if (Version.Major >= 10) { +    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, +                    amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, +                    STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1); +    AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, +                    amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); +  }    return KD;  } @@ -679,6 +696,10 @@ bool isGFX9(const MCSubtargetInfo &STI) {    return STI.getFeatureBits()[AMDGPU::FeatureGFX9];  } +bool isGFX10(const MCSubtargetInfo &STI) { +  return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} +  bool isGCN3Encoding(const MCSubtargetInfo &STI) {    return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];  } @@ -704,46 +725,46 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {    CASE_CI_VI(FLAT_SCR) \    CASE_CI_VI(FLAT_SCR_LO) \    CASE_CI_VI(FLAT_SCR_HI) \ -  CASE_VI_GFX9(TTMP0) \ -  CASE_VI_GFX9(TTMP1) \ -  CASE_VI_GFX9(TTMP2) \ -  CASE_VI_GFX9(TTMP3) \ -  CASE_VI_GFX9(TTMP4) \ -  CASE_VI_GFX9(TTMP5) \ -  CASE_VI_GFX9(TTMP6) \ -  CASE_VI_GFX9(TTMP7) \ -  CASE_VI_GFX9(TTMP8) \ -  CASE_VI_GFX9(TTMP9) \ -  CASE_VI_GFX9(TTMP10) \ -  CASE_VI_GFX9(TTMP11) \ -  CASE_VI_GFX9(TTMP12) \ -  CASE_VI_GFX9(TTMP13) \ -  CASE_VI_GFX9(TTMP14) \ -  CASE_VI_GFX9(TTMP15) \ -  CASE_VI_GFX9(TTMP0_TTMP1) \ -  CASE_VI_GFX9(TTMP2_TTMP3) \ -  CASE_VI_GFX9(TTMP4_TTMP5) \ -  CASE_VI_GFX9(TTMP6_TTMP7) \ -  CASE_VI_GFX9(TTMP8_TTMP9) \ -  CASE_VI_GFX9(TTMP10_TTMP11) \ -  CASE_VI_GFX9(TTMP12_TTMP13) \ -  CASE_VI_GFX9(TTMP14_TTMP15) \ -  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ -  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ -  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ -  CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ -  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ -  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ -  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ -  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ +  CASE_VI_GFX9_GFX10(TTMP0) \ +  CASE_VI_GFX9_GFX10(TTMP1) \ +  CASE_VI_GFX9_GFX10(TTMP2) \ +  CASE_VI_GFX9_GFX10(TTMP3) \ +  CASE_VI_GFX9_GFX10(TTMP4) \ +  CASE_VI_GFX9_GFX10(TTMP5) \ +  CASE_VI_GFX9_GFX10(TTMP6) \ +  CASE_VI_GFX9_GFX10(TTMP7) \ +  CASE_VI_GFX9_GFX10(TTMP8) \ +  CASE_VI_GFX9_GFX10(TTMP9) \ +  CASE_VI_GFX9_GFX10(TTMP10) \ +  CASE_VI_GFX9_GFX10(TTMP11) \ +  CASE_VI_GFX9_GFX10(TTMP12) \ +  CASE_VI_GFX9_GFX10(TTMP13) \ +  CASE_VI_GFX9_GFX10(TTMP14) \ +  CASE_VI_GFX9_GFX10(TTMP15) \ +  CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \ +  CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \ +  CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \ +  CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \ +  CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \ +  CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \ +  CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \ +  CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \ +  CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \ +  CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \ +  CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \ +  CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \ +  CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ +  CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ +  CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ +  CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \    }  #define CASE_CI_VI(node) \    assert(!isSI(STI)); \    case node: return isCI(STI) ? node##_ci : node##_vi; -#define CASE_VI_GFX9(node) \ -  case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; +#define CASE_VI_GFX9_GFX10(node) \ +  case node: return (isGFX9(STI) || isGFX10(STI)) ? node##_gfx9_gfx10 : node##_vi;  unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {    if (STI.getTargetTriple().getArch() == Triple::r600) @@ -752,17 +773,17 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {  }  #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10  #define CASE_CI_VI(node)   case node##_ci: case node##_vi:   return node; -#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; +#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node;  unsigned mc2PseudoReg(unsigned Reg) {    MAP_REG2REG  }  #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10  #undef MAP_REG2REG  bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -1030,5 +1051,6 @@ const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);  bool isIntrinsicSourceOfDivergence(unsigned IntrID) {    return lookupSourceOfDivergence(IntrID);  } +  } // namespace AMDGPU  } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2943722963a..cad2d4f25da 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -244,7 +244,8 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen);  void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,                                 const MCSubtargetInfo *STI); -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( +    const MCSubtargetInfo *STI);  bool isGroupSegment(const GlobalValue *GV);  bool isGlobalSegment(const GlobalValue *GV); @@ -398,6 +399,7 @@ bool isSI(const MCSubtargetInfo &STI);  bool isCI(const MCSubtargetInfo &STI);  bool isVI(const MCSubtargetInfo &STI);  bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX10(const MCSubtargetInfo &STI);  /// Is Reg - scalar register  bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 8efe6f6741e..2d7857ed92b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -82,6 +82,9 @@ COMPPGM1(priv,                            compute_pgm_rsrc1_priv,           PRIV  COMPPGM1(enable_dx10_clamp,               compute_pgm_rsrc1_dx10_clamp,     DX10_CLAMP),  COMPPGM1(debug_mode,                      compute_pgm_rsrc1_debug_mode,     DEBUG_MODE),  COMPPGM1(enable_ieee_mode,                compute_pgm_rsrc1_ieee_mode,      IEEE_MODE), +COMPPGM1(enable_wgp_mode,                 compute_pgm_rsrc1_wgp_mode,       WGP_MODE), +COMPPGM1(enable_mem_ordered,              compute_pgm_rsrc1_mem_ordered,    MEM_ORDERED), +COMPPGM1(enable_fwd_progress,             compute_pgm_rsrc1_fwd_progress,   FWD_PROGRESS),  // TODO: bulky  // TODO: cdbg_user  COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),  | 

