Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 105
1 file changed, 97 insertions, 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92d719838b9..997b41e4b1a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -14,6 +14,7 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "GCNHazardRecognizer.h"
 #include "SIDefines.h"
@@ -63,6 +64,19 @@
 
 using namespace llvm;
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_D16ImageDimIntrinsics_IMPL
+#define GET_ImageDimIntrinsicTable_IMPL
+#define GET_RsrcIntrinsics_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
 // Must be at least 4 to be able to branch over minimum unconditional branch
 // code. This is only for making it possible to write reasonably small tests for
 // long branches.
@@ -71,7 +85,8 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                  cl::desc("Restrict range of branch instructions (DEBUG)"));
 
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
-  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+    RI(ST), ST(ST) {}
 
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
@@ -438,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
 }
 
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into 2 16 store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                          int64_t Offset0, int64_t Offset1,
+                                          unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 64
+  // bytes, then schedule together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, unsigned DestReg,
@@ -998,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
     unsigned FrameOffset, unsigned Size) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
   unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1134,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
-  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+  default: return TargetInstrInfo::expandPostRAPseudo(MI);
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -1900,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
   switch(Kind) {
   case PseudoSourceValue::Stack:
   case PseudoSourceValue::FixedStack:
-    return AMDGPUASI.PRIVATE_ADDRESS;
+    return ST.getAMDGPUAS().PRIVATE_ADDRESS;
   case PseudoSourceValue::ConstantPool:
   case PseudoSourceValue::GOT:
   case PseudoSourceValue::JumpTable:
   case PseudoSourceValue::GlobalValueCallEntry:
   case PseudoSourceValue::ExternalSymbolCallEntry:
   case PseudoSourceValue::TargetCustom:
-    return AMDGPUASI.CONSTANT_ADDRESS;
+    return ST.getAMDGPUAS().CONSTANT_ADDRESS;
   }
-  return AMDGPUASI.FLAT_ADDRESS;
+  return ST.getAMDGPUAS().FLAT_ADDRESS;
 }
 
 static void removeModOperands(MachineInstr &MI) {
@@ -4649,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
     return AMDGPU::NoRegister;
 
   assert(!MI.memoperands_empty() &&
-         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+         (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
 
   FrameIndex = Addr->getIndex();
   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4768,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
     return true;
 
   for (const MachineMemOperand *MMO : MI.memoperands()) {
-    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+    if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
       return true;
   }
   return false;
@@ -4948,3 +4985,55 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
   return RCID == AMDGPU::SReg_128RegClassID;
 }
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+  SI = 0,
+  VI = 1,
+  SDWA = 2,
+  SDWA9 = 3,
+  GFX80 = 4,
+  GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const SISubtarget &ST) {
+  switch (ST.getGeneration()) {
+  case SISubtarget::SOUTHERN_ISLANDS:
+  case SISubtarget::SEA_ISLANDS:
+    return SIEncodingFamily::SI;
+  case SISubtarget::VOLCANIC_ISLANDS:
+  case SISubtarget::GFX9:
+    return SIEncodingFamily::VI;
+  }
+  llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+  SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+      ST.getGeneration() >= SISubtarget::GFX9)
+    Gen = SIEncodingFamily::GFX9;
+
+  if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+    Gen = ST.getGeneration() == SISubtarget::GFX9 ? SIEncodingFamily::SDWA9
+                                                  : SIEncodingFamily::SDWA;
+
+  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+  // subtarget has UnpackedD16VMem feature.
+  // TODO: remove this when we discard GFX80 encoding.
+  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
+    Gen = SIEncodingFamily::GFX80;
+
+  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+  // -1 means that Opcode is already a native instruction.
+  if (MCOp == -1)
+    return Opcode;
+
+  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+  // no encoding in the given subtarget generation.
+  if (MCOp == (uint16_t)-1)
+    return -1;
+
+  return MCOp;
+}
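
The new shouldScheduleLoadsNear hook boils down to one heuristic: keep clustering loads while the batch holds at most 16 of them and the two offsets fall within one 64-byte cacheline. The snippet below is a minimal standalone model of that test, not code from the patch; the helper name wouldCluster and the sample offsets are made up for illustration.

#include <cstdint>
#include <cstdio>

// Mirrors the body of shouldScheduleLoadsNear: offsets are assumed to be
// ordered, the batch is capped at 16 loads, and the span must stay inside
// a 64-byte (global memory) cacheline.
static bool wouldCluster(int64_t Offset0, int64_t Offset1, unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  std::printf("%d\n", wouldCluster(0, 32, 4));  // 1: same cacheline, small batch
  std::printf("%d\n", wouldCluster(0, 96, 4));  // 0: offsets span more than 64 bytes
  std::printf("%d\n", wouldCluster(0, 8, 32));  // 0: more than 16 loads in the batch
  return 0;
}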
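
pseudoToMCOpcode distinguishes two sentinels coming out of the generated opcode table: a signed -1 means the opcode is already a native instruction and is returned unchanged, while (uint16_t)-1 marks a pseudo that has no encoding for the selected family, so the function reports failure with -1 (presumably 0xFFFF because the generated table holds 16-bit entries). The toy model below reproduces just that contract under assumed names (lookupMCOpcode, pseudoToMC); it is not the TableGen-generated AMDGPU::getMCOpcode.

#include <cstdint>
#include <cstdio>

// Stand-in for the generated lookup: a missing row means "already native",
// while a present row may hold 0xFFFF for "no encoding in this family".
static int lookupMCOpcode(bool HasRow, uint16_t Row) {
  if (!HasRow)
    return -1;
  return Row;  // 0..0xFFFE are real MC opcodes, 0xFFFF is the sentinel
}

static int pseudoToMC(int Opcode, bool HasRow, uint16_t Row) {
  int MCOp = lookupMCOpcode(HasRow, Row);
  if (MCOp == -1)
    return Opcode;            // already a native instruction
  if (MCOp == (uint16_t)-1)
    return -1;                // pseudo with no encoding on this subtarget
  return MCOp;
}

int main() {
  std::printf("%d\n", pseudoToMC(100, false, 0));      // 100: nothing to translate
  std::printf("%d\n", pseudoToMC(100, true, 0xFFFF));  // -1: no encoding available
  std::printf("%d\n", pseudoToMC(100, true, 42));      // 42: mapped MC opcode
  return 0;
}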

