diff options
| author | Tom Stellard <tstellar@redhat.com> | 2019-07-10 00:22:41 +0000 |
|---|---|---|
| committer | Tom Stellard <tstellar@redhat.com> | 2019-07-10 00:22:41 +0000 |
| commit | d0ba79fe7bfe32f8e0ddf42de002716d937431c4 (patch) | |
| tree | db9e82dab243ac02d4d79888a2434d90886a0fcc /llvm/lib | |
| parent | 693936ab8fe84488e9c888245890dc7936d857c3 (diff) | |
| download | bcm5719-llvm-d0ba79fe7bfe32f8e0ddf42de002716d937431c4.tar.gz bcm5719-llvm-d0ba79fe7bfe32f8e0ddf42de002716d937431c4.zip | |
AMDGPU/GlobalISel: Add support for wide loads >= 256-bits
Summary:
This adds support for the most commonly used wide load types:
<8xi32>, <16xi32>, <4xi64>, and <8xi64>
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: hiraditya, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, volkan, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D57399
llvm-svn: 365586
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 72 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 172 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 3 |
4 files changed, 219 insertions, 37 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 0ad60e7be0f..0a1f48231b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -161,5 +161,77 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID, return &ValMappingsSGPR64OnlyVGPR32[2]; } +const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] { + /* 256-bit load */ {0, 256, SGPRRegBank}, + /* 512-bit load */ {0, 512, SGPRRegBank}, + /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank}, + {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank}, + {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank}, + {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank}, + {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank}, + {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank}, + {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank}, + {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank}, + /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank}, + {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank}, + {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank}, + {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank}, + + /* FIXME: The generic register bank select does not support complex + * break downs where the number of vector elements does not equal the + * number of breakdowns. 
+ * FIXME: register bank select now tries to handle complex break downs, + * but it emits an illegal instruction: + * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128) + */ + /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank}, + {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank} +}; + +const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] { + /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1}, + /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1}, + /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8}, + /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16}, + /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4}, + /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8} +}; + +const RegisterBankInfo::ValueMapping * +getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) { + unsigned Size = SizeTy.getSizeInBits(); + if (Size < 256 || BankID == AMDGPU::SGPRRegBankID) + return getValueMapping(BankID, Size); + + assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID); + + // Default to using the non-split ValueMappings, we will use these if + // the register bank is SGPR or if we don't know how to handle the vector + // type. + unsigned Idx = Size == 256 ? 0 : 1; + + // We need to split this load if it has a vgpr pointer. + if (BankID == AMDGPU::VGPRRegBankID) { + if (SizeTy == LLT::vector(8, 32)) + Idx = 2; + else if (SizeTy == LLT::vector(16, 32)) + Idx = 3; + else if (SizeTy == LLT::vector(4, 64)) + Idx = 4; + else if (SizeTy == LLT::vector(8, 64)) + Idx = 5; + } + + return &ValMappingsLoadSGPROnly[Idx]; +} + + } // End AMDGPU namespace. } // End llvm namespace. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index f33fe2e128d..38e15171128 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -517,7 +517,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, case 256: case 512: - // TODO: constant loads + // TODO: Possibly support loads of i256 and i512. This will require + // adding i256 and i512 types to MVT in order to be able to use + // TableGen. + // TODO: Add support for other vector types, this will require + // defining more value mappings for the new types. + return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 || + Ty0.getScalarType().getSizeInBits() == 64); + default: return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 2654d51ea3a..78cafc7f997 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -37,22 +37,23 @@ using namespace llvm; namespace { // Observer to apply a register bank to new registers created by LegalizerHelper. -class ApplySALUMapping final : public GISelChangeObserver { +class ApplyRegBankMapping final : public GISelChangeObserver { private: MachineRegisterInfo &MRI; + const RegisterBank *NewBank; SmallVector<MachineInstr *, 4> NewInsts; public: - ApplySALUMapping(MachineRegisterInfo &MRI_) - : MRI(MRI_) {} + ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB) + : MRI(MRI_), NewBank(RB) {} - ~ApplySALUMapping() { + ~ApplyRegBankMapping() { for (MachineInstr *MI : NewInsts) - applySALUBank(*MI); + applyBank(*MI); } /// Set any registers that don't have a set register class or bank to SALU. 
- void applySALUBank(MachineInstr &MI) { + void applyBank(MachineInstr &MI) { for (MachineOperand &Op : MI.operands()) { if (!Op.isReg()) continue; @@ -61,10 +62,13 @@ public: if (MRI.getRegClassOrRegBank(Reg)) continue; + const RegisterBank *RB = NewBank; // FIXME: This might not be enough to detect when SCC should be used. - const RegisterBank &RB = MRI.getType(Reg) == LLT::scalar(1) ? - AMDGPU::SCCRegBank : AMDGPU::SGPRRegBank; - MRI.setRegBank(Reg, RB); + if (MRI.getType(Reg) == LLT::scalar(1)) + RB = (NewBank == &AMDGPU::SGPRRegBank ? + &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank); + + MRI.setRegBank(Reg, *RB); } } @@ -80,7 +84,6 @@ public: }; } - AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast<const SIRegisterInfo*>(&TRI)) { @@ -128,6 +131,12 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, unsigned AMDGPURegisterBankInfo::getBreakDownCost( const ValueMapping &ValMapping, const RegisterBank *CurBank) const { + // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to + // VGPR. + // FIXME: Is there a better way to do this? + if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64) + return 10; // This is expensive. 
+ assert(ValMapping.NumBreakDowns == 2 && ValMapping.BreakDown[0].Length == 32 && ValMapping.BreakDown[0].StartIdx == 0 && @@ -302,6 +311,14 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( } } +static bool isInstrUniform(const MachineInstr &MI) { + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand *MMO = *MI.memoperands_begin(); + return AMDGPUInstrInfo::isUniformMMO(MMO); +} + RegisterBankInfo::InstructionMappings AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineInstr &MI) const { @@ -356,29 +373,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( } case TargetOpcode::G_LOAD: { unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); // FIXME: Should we be hard coding the size for these mappings? - const InstructionMapping &SSMapping = getInstructionMapping( - 1, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), - 2); // Num Operands - AltMappings.push_back(&SSMapping); + if (isInstrUniform(MI)) { + const InstructionMapping &SSMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.push_back(&SSMapping); + } const InstructionMapping &VVMapping = getInstructionMapping( 2, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy), AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), 2); // Num Operands AltMappings.push_back(&VVMapping); - // FIXME: Should this be the pointer-size (64-bits) or the size of the - // register that will hold the bufffer resourc (128-bits). 
- const InstructionMapping &VSMapping = getInstructionMapping( - 3, 1, getOperandsMapping( - {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), - AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), - 2); // Num Operands - AltMappings.push_back(&VSMapping); + // It may be possible to have a vgpr = load sgpr mapping here, because + // the mubuf instructions support this kind of load, but probably for only + // gfx7 and older. However, the addressing mode matching in the instruction + // selector should be able to do a better job of detecting and selecting + // these kinds of loads from the vgpr = load vgpr mapping. return AltMappings; @@ -874,6 +891,91 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane( MI.getOperand(OpIdx).setReg(SGPR); } +// When regbankselect repairs registers, it will insert a repair instruction +// which defines the repaired register. Then it calls applyMapping and expects +// that the targets will either delete or rewrite the instruction that +// originally wrote to the repaired registers. Because of this, we end up in a +// situation where we have 2 instructions defining the same registers. +static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI, + Register Reg, + const MachineInstr &MI) { + // Is there some way we can assert that there are exactly 2 def instructions? + for (MachineInstr &Other : MRI.def_instructions(Reg)) { + if (&Other != &MI) + return &Other; + } + + return nullptr; +} + +bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const { + Register DstReg = MI.getOperand(0).getReg(); + const LLT LoadTy = MRI.getType(DstReg); + unsigned LoadSize = LoadTy.getSizeInBits(); + const unsigned MaxNonSmrdLoadSize = 128; + // 128-bit loads are supported for all instruction types. 
+ if (LoadSize <= MaxNonSmrdLoadSize) + return false; + + SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1)); + + // If the pointer is an SGPR, we have nothing to do. + if (SrcRegs.empty()) + return false; + + assert(LoadSize % MaxNonSmrdLoadSize == 0); + + // We want to get the repair instruction now, because it will help us + // determine which instruction the legalizer inserts that will also + // write to DstReg. + MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI); + + // RegBankSelect only emits scalar types, so we need to reset the pointer + // operand to a pointer type. + Register BasePtrReg = SrcRegs[0]; + LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); + MRI.setType(BasePtrReg, PtrTy); + + MachineIRBuilder B(MI); + + unsigned SplitElts = + MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); + const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank); + GISelObserverWrapper Observer(&O); + B.setChangeObserver(Observer); + LegalizerHelper Helper(B.getMF(), Observer, B); + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + + // At this point, the legalizer has split the original load into smaller + // loads. At the end of lowering, it inserts an instruction (LegalizedInst) + // that combines the outputs of the lower loads and writes it to DstReg. + // The register bank selector has also added the RepairInst which writes to + // DstReg as well. + + MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst); + + // Replace the output of the LegalizedInst with a temporary register, since + // RepairInst already defines DstReg. 
+ Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg)); + LegalizedInst->getOperand(0).setReg(TmpReg); + B.setInsertPt(*RepairInst->getParent(), RepairInst); + + for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) { + Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); + B.buildConstant(IdxReg, DefIdx); + MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID)); + B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg); + } + + MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID)); + return true; +} + // For cases where only a single copy is inserted for matching register banks. // Replace the register in the instruction operand static void substituteSimpleCopyRegs( @@ -1008,7 +1110,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. MachineFunction *MF = MI.getParent()->getParent(); MachineIRBuilder B(MI); - ApplySALUMapping ApplySALU(MRI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); @@ -1028,7 +1130,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MachineFunction *MF = MI.getParent()->getParent(); MachineIRBuilder B(MI); - ApplySALUMapping ApplySALU(MRI); + ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank); GISelObserverWrapper Observer(&ApplySALU); LegalizerHelper Helper(*MF, Observer, B); @@ -1212,6 +1314,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl( } break; } + case AMDGPU::G_LOAD: { + if (applyMappingWideLoad(MI, OpdMapper, MRI)) + return; + break; + } default: break; } @@ -1219,14 +1326,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return applyDefaultMapping(OpdMapper); } -static bool isInstrUniform(const MachineInstr &MI) { - if (!MI.hasOneMemOperand()) - return false; - - const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPUInstrInfo::isUniformMMO(MMO); -} - bool 
AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1322,6 +1421,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + LLT LoadTy = MRI.getType(MI.getOperand(0).getReg()); unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); const ValueMapping *ValMapping; @@ -1332,7 +1432,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); } else { - ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy); // FIXME: What would happen if we used SGPRRegBankID here? PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 05f7b1f29f0..f3a96e2a612 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -44,6 +44,9 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const; + bool applyMappingWideLoad(MachineInstr &MI, + const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, + MachineRegisterInfo &MRI) const; /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; |

