Diffstat (limited to 'llvm/lib')
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp    |   8
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp    |   1
 llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td |  22
 llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp    |   3
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp        |   8
 llvm/lib/Target/AMDGPU/GCNRegPressure.h          |   7
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp       |  28
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp        |  97
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp        |  34
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp      |   2
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           | 202
 llvm/lib/Target/AMDGPU/SIInstructions.td         |  10
 llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp |   4
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp        | 151
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h          |  31
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp       |   2
 16 files changed, 548 insertions(+), 62 deletions(-)
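The diff below adds accumulation registers (AGPRs: AGPR_32 and the AReg_* tuple classes used by the new MFMA intrinsics) and threads them through instruction selection, physical-register copies, operand folding and register-pressure tracking. As a minimal sketch of the query pattern the new SIRegisterInfo helpers enable (it mirrors the hasVectorOperands() change in SIFixSGPRCopies.cpp; the helper name hasAGPROperand is invented here for illustration, not part of the commit), a pass could detect AGPR operands roughly like this:

```cpp
#include "SIRegisterInfo.h"                   // provides hasAGPRs() after this patch
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

// Sketch only: walk the register operands of a MachineInstr and report
// whether any of them lives in an AGPR-containing register class, using
// the SIRegisterInfo::hasAGPRs() helper introduced by this patch.
static bool hasAGPROperand(const MachineInstr &MI, const SIRegisterInfo *TRI) {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  for (const MachineOperand &Op : MI.operands()) {
    if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      continue;
    if (TRI->hasAGPRs(MRI.getRegClass(Op.getReg())))
      return true;
  }
  return false;
}
```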
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b3589a02fcd..7fa23fd84e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -647,6 +647,8 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { return AMDGPU::SReg_256RegClassID; case 16: return AMDGPU::SReg_512RegClassID; + case 32: + return AMDGPU::SReg_1024RegClassID; } llvm_unreachable("invalid vector size"); @@ -665,12 +667,12 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { return; } - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " "supported yet"); - // 16 = Max Num Vector Elements + // 32 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 666d0225561..56922b05050 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -355,6 +355,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index ef930f017ae..f8703c36127 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -103,5 +103,27 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>; def : SourceOfDivergence<int_amdgcn_mov_dpp8>; def : SourceOfDivergence<int_amdgcn_update_dpp>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>; +def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>; +def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>; + foreach intr = 
AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp index c3d076e95af..f0d47eaa4ed 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -365,6 +365,9 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, continue; unsigned R = Op.getReg(); + if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) + continue; + unsigned ShiftedBank = Bank; if (Bank != -1 && R == Reg && Op.getSubReg()) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index be01988b6bc..39460fbd8a8 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -89,7 +89,9 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : - (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); + STI->hasAGPRs(RC) ? + (STI->getRegSizeInBits(*RC) == 32 ? AGPR32 : AGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, @@ -110,16 +112,18 @@ void GCNRegPressure::inc(unsigned Reg, switch (auto Kind = getRegKind(Reg, MRI)) { case SGPR32: case VGPR32: + case AGPR32: assert(PrevMask.none() && NewMask == MaxMask); Value[Kind] += Sign; break; case SGPR_TUPLE: case VGPR_TUPLE: + case AGPR_TUPLE: assert(NewMask < MaxMask || NewMask == MaxMask); assert(PrevMask < NewMask); - Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += + Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] += Sign * (~PrevMask & NewMask).getNumLanes(); if (PrevMask.none()) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index fe7b934f291..e4894418b94 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -31,6 +31,8 @@ struct GCNRegPressure { SGPR_TUPLE, VGPR32, VGPR_TUPLE, + AGPR32, + AGPR_TUPLE, TOTAL_KINDS }; @@ -43,9 +45,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } unsigned getSGPRNum() const { return Value[SGPR32]; } - unsigned getVGPRNum() const { return Value[VGPR32]; } + unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); } - unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } + unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE], + Value[AGPR_TUPLE]); } unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const GCNSubtarget &ST) const { diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index cc16d931dcf..18598d6cef4 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -143,14 +143,15 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass() { return new SIFixSGPRCopies(); } -static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { +static bool hasVectorOperands(const MachineInstr &MI, + const SIRegisterInfo *TRI) { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) continue; - if 
(TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg()))) return true; } return false; @@ -183,14 +184,14 @@ static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) && - TRI.hasVGPRs(SrcRC); + TRI.hasVectorRegisters(SrcRC); } static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) && - TRI.hasVGPRs(DstRC); + TRI.hasVectorRegisters(DstRC); } static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, @@ -277,6 +278,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, // VGPRz = REG_SEQUENCE VGPRx, sub0 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + bool IsAGPR = TRI->hasAGPRs(DstRC); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { unsigned SrcReg = MI.getOperand(I).getReg(); @@ -295,6 +297,17 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, TmpReg) .add(MI.getOperand(I)); + if (IsAGPR) { + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC); + unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC); + unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ? + AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc), + TmpAReg) + .addReg(TmpReg, RegState::Kill); + TmpReg = TmpAReg; + } + MI.getOperand(I).setReg(TmpReg); } @@ -682,8 +695,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } case AMDGPU::REG_SEQUENCE: - if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) { + if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || + !hasVectorOperands(MI, TRI)) { foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; } @@ -698,7 +711,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && - (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + (TRI->hasVectorRegisters(Src0RC) || + TRI->hasVectorRegisters(Src1RC))) { LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI, MDT); } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 74ed6f1fed1..bcc3478c67b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -187,6 +187,7 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && + !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold), ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is @@ -419,6 +420,71 @@ static bool isUseSafeToFold(const SIInstrInfo *TII, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } +static bool tryToFoldACImm(const SIInstrInfo *TII, + const MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList) { + const MCInstrDesc &Desc = UseMI->getDesc(); + const MCOperandInfo *OpInfo = Desc.OpInfo; + if (!OpInfo || UseOpIdx >= Desc.getNumOperands()) + return false; + + uint8_t OpTy = OpInfo[UseOpIdx].OperandType; + if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || + 
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return false; + + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); + return true; + } + + if (!OpToFold.isReg()) + return false; + + unsigned UseReg = OpToFold.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(UseReg)) + return false; + + if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) { + return FC.UseMI == UseMI; }) != FoldList.end()) + return false; + + MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo(); + const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg); + if (!Def || !Def->isRegSequence()) + return false; + + int64_t Imm; + MachineOperand *Op; + for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) { + const MachineOperand &Sub = Def->getOperand(I); + if (!Sub.isReg() || Sub.getSubReg()) + return false; + MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg()); + while (SubDef && !SubDef->isMoveImmediate() && + !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef)) + SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg()); + if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm()) + return false; + Op = &SubDef->getOperand(1); + auto SubImm = Op->getImm(); + if (I == 1) { + if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy)) + return false; + + Imm = SubImm; + continue; + } + if (Imm != SubImm) + return false; // Can only fold splat constants + } + + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); + return true; +} + void SIFoldOperands::foldOperand( MachineOperand &OpToFold, MachineInstr *UseMI, @@ -462,6 +528,11 @@ void SIFoldOperands::foldOperand( Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); + + if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI, + RSUse.getOperandNo(), FoldList)) + continue; + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; @@ -472,6 +543,9 @@ void SIFoldOperands::foldOperand( return; } + if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList)) + return; + if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) { // Sanity check that this is a stack access. // FIXME: Should probably use stack pseudos before frame lowering. @@ -505,7 +579,7 @@ void SIFoldOperands::foldOperand( if (TargetRegisterInfo::isVirtualRegister(DestReg) && TargetRegisterInfo::isVirtualRegister(SrcReg)) { const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); - if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) { + if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) { MachineRegisterInfo::use_iterator NextUse; SmallVector<FoldCandidate, 4> CopyUses; for (MachineRegisterInfo::use_iterator @@ -523,6 +597,14 @@ void SIFoldOperands::foldOperand( } } + if (DestRC == &AMDGPU::AGPR_32RegClass && + TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + CopiesToReplace.push_back(UseMI); + return; + } + // In order to fold immediates into copies, we need to change the // copy to a MOV. 
@@ -535,14 +617,23 @@ void SIFoldOperands::foldOperand( } else { if (UseMI->isCopy() && OpToFold.isReg() && TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && - TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) && !UseMI->getOperand(1).getSubReg()) { + unsigned Size = TII->getOpSize(*UseMI, 1); UseMI->getOperand(1).setReg(OpToFold.getReg()); UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); UseMI->getOperand(1).setIsKill(false); CopiesToReplace.push_back(UseMI); OpToFold.setIsKill(false); + if (Size != 4) + return; + if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32)); + else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32)); return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e8bc5c85f4e..69c6a3361de 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -151,6 +151,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } + if (Subtarget->hasMAIInsts()) { + addRegisterClass(MVT::v32i32, &AMDGPU::AReg_1024RegClass); + } + computeRegisterProperties(Subtarget->getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -10194,6 +10198,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); + + // Prefer VGPRs over AGPRs in mAI instructions where possible. + // This saves a chain-copy of registers and better ballance register + // use between vgpr and agpr as agpr tuples tend to be big. + if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + unsigned Opc = MI.getOpcode(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { + if (I == -1) + break; + MachineOperand &Op = MI.getOperand(I); + if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && + OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || + !TargetRegisterInfo::isVirtualRegister(Op.getReg()) || + !TRI->isAGPR(MRI, Op.getReg())) + continue; + auto *Src = MRI.getUniqueVRegDef(Op.getReg()); + if (!Src || !Src->isCopy() || + !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) + continue; + auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); + auto *NewRC = TRI->getEquivalentVGPRClass(RC); + // All uses of agpr64 and agpr32 can also accept vgpr except for + // v_accvgpr_read, but we do not produce agpr reads during selection, + // so no use checks are needed. 
+ MRI.setRegClass(Op.getReg(), NewRC); + } + } + return; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index efc80652631..c89d5b71ec5 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -463,7 +463,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned OpNo, bool Def) const { const MachineOperand &Op = MI->getOperand(OpNo); if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || - (Def && !Op.isDef())) + (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8605932330e..88d37992072 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -512,8 +512,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? + AMDGPU::V_ACCVGPR_READ_B32 : AMDGPU::V_MOV_B32_e32; + BuildMI(MBB, MI, DL, get(Opc), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } @@ -586,6 +589,78 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (RC == &AMDGPU::AGPR_32RegClass) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg) || + AMDGPU::AGPR_32RegClass.contains(SrcReg)); + if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) { + // First try to find defining accvgpr_write to avoid temporary registers. + for (auto Def = MI, E = MBB.begin(); Def != E; ) { + --Def; + if (!Def->definesRegister(SrcReg, &RI)) + continue; + if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32) + break; + + MachineOperand &DefOp = Def->getOperand(1); + assert(DefOp.isReg() || DefOp.isImm()); + + if (DefOp.isReg()) { + // Check that register source operand if not clobbered before MI. + // Immediate operands are always safe to propagate. + bool SafeToPropagate = true; + for (auto I = Def; I != MI && SafeToPropagate; ++I) + if (I->modifiesRegister(DefOp.getReg(), &RI)) + SafeToPropagate = false; + + if (!SafeToPropagate) + break; + + DefOp.setIsKill(false); + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .add(DefOp); + return; + } + + RegScavenger RS; + RS.enterBasicBlock(MBB); + RS.forward(MI); + + // Ideally we want to have three registers for a long reg_sequence copy + // to hide 2 waitstates between v_mov_b32 and accvgpr_write. + unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, + *MBB.getParent()); + + // Registers in the sequence are allocated contiguously so we can just + // use register number to pick one of three round-robin temps. + unsigned RegNo = DestReg % 3; + unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp) + report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); + RS.setRegUsed(Tmp); + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. 
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); + } + copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(Tmp, RegState::Kill); + return; + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { @@ -602,6 +677,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } + } else if (RI.hasAGPRs(RC)) { + Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? + AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY; + } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + Opcode = AMDGPU::V_ACCVGPR_READ_B32; } ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); @@ -614,6 +694,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + if (Opcode == TargetOpcode::COPY) { + copyPhysReg(MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), + RI.getSubReg(SrcReg, SubIdx), KillSrc); + continue; + } + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); @@ -862,6 +948,8 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + if (RI.hasAGPRs(DstRC)) + return AMDGPU::COPY; if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { @@ -1922,7 +2010,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? // Limit to equal cost for branch vs. N v_cndmask_b32s. - return !RI.isSGPRClass(RC) && NumInsts <= 6; + return RI.hasVGPRs(RC) && NumInsts <= 6; } case SCC_TRUE: case SCC_FALSE: { @@ -2056,6 +2144,8 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: + case AMDGPU::V_ACCVGPR_WRITE_B32: + case AMDGPU::V_ACCVGPR_READ_B32: return true; default: return false; @@ -2108,6 +2198,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: + case AMDGPU::V_ACCVGPR_WRITE_B32: break; } @@ -2121,6 +2212,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::COPY) { bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { + if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + return false; + NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; + } UseMI.setDesc(get(NewOpc)); UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); @@ -2628,7 +2724,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: { + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } @@ -2641,7 +2739,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -2657,7 +2757,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint32_t Trunc = static_cast<uint32_t>(Imm); return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); } @@ -3026,7 +3128,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; @@ -3475,9 +3581,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::WWM: return AMDGPU::WWM; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? + case AMDGPU::S_MOV_B32: { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + return MI.getOperand(1).isReg() || + RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + } case AMDGPU::S_ADD_I32: return ST.hasAddNoCarry() ? 
AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: @@ -3755,27 +3864,24 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 // we need to only have one constant bus use before GFX10. bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; - if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - if (Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || - isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) - legalizeOpWithMove(MI, Src0Idx); - } + if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && + Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || + isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) + legalizeOpWithMove(MI, Src0Idx); // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for // both the value to write (src0) and lane select (src1). Fix up non-SGPR // src0/src1 with V_READFIRSTLANE. if (Opc == AMDGPU::V_WRITELANE_B32) { - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); const DebugLoc &DL = MI.getDebugLoc(); if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3793,6 +3899,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } + // No VOP2 instructions support AGPRs. + if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) + legalizeOpWithMove(MI, Src0Idx); + + if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) + legalizeOpWithMove(MI, Src1Idx); + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3820,9 +3933,6 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. 
// @@ -3923,6 +4033,12 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, continue; } + if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && + !isOperandLegal(MI, Idx, &MO)) { + legalizeOpWithMove(MI, Idx); + continue; + } + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal @@ -3949,6 +4065,15 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (RI.hasAGPRs(VRC)) { + VRC = RI.getEquivalentVGPRClass(VRC); + unsigned NewSrcReg = MRI.createVirtualRegister(VRC); + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(TargetOpcode::COPY), NewSrcReg) + .addReg(SrcReg); + SrcReg = NewSrcReg; + } + if (SubRegs == 1) { BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), DstReg) @@ -4258,7 +4383,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { + if (RI.hasVectorRegisters(OpRC)) { VRC = OpRC; } else { SRC = OpRC; @@ -4271,7 +4396,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { @@ -4340,7 +4466,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize SI_INIT_M0 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { MachineOperand &Src = MI.getOperand(0); - if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) + if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); return; } @@ -5342,7 +5468,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( break; } - if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) { + if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { Worklist.insert(&UseMI); do { @@ -5449,14 +5575,26 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: - case AMDGPU::WWM: - if (RI.hasVGPRs(NewDstRC)) - return nullptr; + case AMDGPU::WWM: { + const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); + if (RI.hasAGPRs(SrcRC)) { + if (RI.hasAGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } else { + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + } - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - return nullptr; return NewDstRC; + } default: return NewDstRC; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f054dc8cd35..4831ede3d54 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -891,6 +891,16 @@ def : Pat < (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) >; +foreach Index = 0-31 in { + def Extract_Element_v32i32_#Index : Extract_Element < + i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Insert_Element_v32i32_#Index : Insert_Element < + i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + // FIXME: Why do only some of these type combinations for SReg and // VReg? 
// 16-bit bitcast diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index a23216a6121..3227bff2051 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -253,8 +253,8 @@ static MachineBasicBlock::reverse_iterator findExecCopy( } // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly -// repor tthe register as unavailable because a super-register with a lane mask -// as unavailable. +// report the register as unavailable because a super-register with a lane mask +// is unavailable. static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { for (MachineBasicBlock *Succ : MBB.successors()) { if (Succ->isLiveIn(Reg)) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 6160e587c14..9fde16edade 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -62,6 +62,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), + AGPRPressureSets(getNumRegPressureSets()), SpillSGPRToVGPR(false), SpillSGPRToSMEM(false), isWave32(ST.isWave32()) { @@ -74,10 +75,12 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = NumRegPressureSets; VGPRSetID = NumRegPressureSets; + AGPRSetID = NumRegPressureSets; for (unsigned i = 0; i < NumRegPressureSets; ++i) { classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); + classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets); } // Determine the number of reg units for each pressure set. @@ -89,7 +92,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : } } - unsigned VGPRMax = 0, SGPRMax = 0; + unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0; for (unsigned i = 0; i < NumRegPressureSets; ++i) { if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) { VGPRSetID = i; @@ -100,10 +103,16 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : SGPRSetID = i; SGPRMax = PressureSetRegUnits[i]; } + if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) { + AGPRSetID = i; + AGPRMax = PressureSetRegUnits[i]; + continue; + } } assert(SGPRSetID < NumRegPressureSets && - VGPRSetID < NumRegPressureSets); + VGPRSetID < NumRegPressureSets && + AGPRSetID < NumRegPressureSets); } unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( @@ -1327,18 +1336,25 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, + &AMDGPU::AGPR_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::AReg_64RegClass, &AMDGPU::VReg_96RegClass, &AMDGPU::SReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, + &AMDGPU::AReg_128RegClass, &AMDGPU::VReg_160RegClass, &AMDGPU::SReg_160RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, + &AMDGPU::AReg_512RegClass, + &AMDGPU::SReg_1024RegClass, + &AMDGPU::VReg_1024RegClass, + &AMDGPU::AReg_1024RegClass, &AMDGPU::SCC_CLASSRegClass, &AMDGPU::Pseudo_SReg_32RegClass, &AMDGPU::Pseudo_SReg_128RegClass, @@ -1373,6 +1389,33 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; case 512: 
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } +} + +bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const { + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: + return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr; + case 96: + return false; + case 128: + return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr; + case 160: + case 256: + return false; + case 512: + return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr; + case 1024: + return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); } @@ -1395,6 +1438,26 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( return &AMDGPU::VReg_256RegClass; case 512: return &AMDGPU::VReg_512RegClass; + case 1024: + return &AMDGPU::VReg_1024RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const { + switch (getRegSizeInBits(*SRC)) { + case 32: + return &AMDGPU::AGPR_32RegClass; + case 64: + return &AMDGPU::AReg_64RegClass; + case 128: + return &AMDGPU::AReg_128RegClass; + case 512: + return &AMDGPU::AReg_512RegClass; + case 1024: + return &AMDGPU::AReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1417,6 +1480,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( return &AMDGPU::SReg_256RegClass; case 512: return &AMDGPU::SReg_512RegClass; + case 1024: + return &AMDGPU::SReg_1024RegClass; default: llvm_unreachable("Invalid register class size"); } @@ -1443,7 +1508,23 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::SReg_160RegClass; case 8: return &AMDGPU::SReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::SReg_512RegClass; + case 32: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } + } else if (hasAGPRs(RC)) { + switch (Count) { + case 1: + return &AMDGPU::AGPR_32RegClass; + case 2: + return &AMDGPU::AReg_64RegClass; + case 4: + return &AMDGPU::AReg_128RegClass; + case 16: + return &AMDGPU::AReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1461,7 +1542,9 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VReg_160RegClass; case 8: return &AMDGPU::VReg_256RegClass; - case 16: /* fall-through */ + case 16: + return &AMDGPU::VReg_512RegClass; + case 32: /* fall-through */ default: llvm_unreachable("Invalid sub-register class size"); } @@ -1509,6 +1592,17 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { if (EltSize == 4) { + static const int16_t Sub0_31[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, 
AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31, + }; + static const int16_t Sub0_15[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, @@ -1552,12 +1646,25 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_7); case 512: return makeArrayRef(Sub0_15); + case 1024: + return makeArrayRef(Sub0_31); default: llvm_unreachable("unhandled register size"); } } if (EltSize == 8) { + static const int16_t Sub0_31_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, + AMDGPU::sub16_sub17, AMDGPU::sub18_sub19, + AMDGPU::sub20_sub21, AMDGPU::sub22_sub23, + AMDGPU::sub24_sub25, AMDGPU::sub26_sub27, + AMDGPU::sub28_sub29, AMDGPU::sub30_sub31 + }; + static const int16_t Sub0_15_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, @@ -1584,12 +1691,26 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_7_64); case 512: return makeArrayRef(Sub0_15_64); + case 1024: + return makeArrayRef(Sub0_31_64); default: llvm_unreachable("unhandled register size"); } } if (EltSize == 16) { + + static const int16_t Sub0_31_128[] = { + AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11, + AMDGPU::sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19, + AMDGPU::sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27, + AMDGPU::sub28_sub29_sub30_sub31 + }; + static const int16_t Sub0_15_128[] = { AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub4_sub5_sub6_sub7, @@ -1609,6 +1730,8 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return makeArrayRef(Sub0_7_128); case 512: return makeArrayRef(Sub0_15_128); + case 1024: + return makeArrayRef(Sub0_31_128); default: llvm_unreachable("unhandled register size"); } @@ -1616,6 +1739,13 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC assert(EltSize == 32 && "unhandled elt size"); + static const int16_t Sub0_31_256[] = { + AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, + AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, + AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23, + AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31 + }; + static const int16_t Sub0_15_256[] = { AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15 @@ -1626,6 +1756,8 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC return {}; case 512: return makeArrayRef(Sub0_15_256); + case 1024: + return makeArrayRef(Sub0_31_256); default: llvm_unreachable("unhandled register size"); } @@ -1647,6 +1779,13 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, return hasVGPRs(RC); } +bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); + assert(RC && "Register class for the reg not found"); + return hasAGPRs(RC); +} + bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, @@ -1688,7 +1827,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - if (Idx == 
getVGPRPressureSet()) + if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet()) return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, const_cast<MachineFunction &>(MF)); @@ -1739,7 +1878,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size, &AMDGPU::SReg_32_XM0RegClass; case 64: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : - &AMDGPU::SReg_64_XEXECRegClass; + &AMDGPU::SReg_64_XEXECRegClass; case 96: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : &AMDGPU::SReg_96RegClass; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 32de2f11d23..34487c96e72 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -29,8 +29,10 @@ class SIRegisterInfo final : public AMDGPURegisterInfo { private: unsigned SGPRSetID; unsigned VGPRSetID; + unsigned AGPRSetID; BitVector SGPRPressureSets; BitVector VGPRPressureSets; + BitVector AGPRPressureSets; bool SpillSGPRToVGPR; bool SpillSGPRToSMEM; bool isWave32; @@ -129,7 +131,7 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - return !hasVGPRs(RC); + return !hasVGPRs(RC) && !hasAGPRs(RC); } /// \returns true if this class ID contains only SGPR registers @@ -149,10 +151,22 @@ public: /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// \returns true if this class contains AGPR registers. + bool hasAGPRs(const TargetRegisterClass *RC) const; + + /// \returns true if this class contains any vector registers. + bool hasVectorRegisters(const TargetRegisterClass *RC) const { + return hasVGPRs(RC) || hasAGPRs(RC); + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; + /// \returns An AGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentAGPRClass( + const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentSGPRClass( const TargetRegisterClass *VRC) const; @@ -190,10 +204,15 @@ public: unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; + unsigned getAGPRPressureSet() const { return AGPRSetID; }; const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const { + return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); + } virtual bool isDivergentRegClass(const TargetRegisterClass *RC) const override { @@ -201,10 +220,16 @@ public: } bool isSGPRPressureSet(unsigned SetID) const { - return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID); + return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); } bool isVGPRPressureSet(unsigned SetID) const { - return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID); + return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !AGPRPressureSets.test(SetID); + } + bool isAGPRPressureSet(unsigned SetID) const { + return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) && + !VGPRPressureSets.test(SetID); } ArrayRef<int16_t> 
getRegSplitParts(const TargetRegisterClass *RC, diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 856514b724a..4e07efff55d 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -388,7 +388,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Reg = MO.getReg(); if (!TRI->isVirtualRegister(Reg) && - TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; } |
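To restate the gate that the new tryToFoldACImm() hunk in SIFoldOperands.cpp applies before folding an immediate into an MFMA source operand: the target operand must carry one of the new OPERAND_REG_INLINE_AC_* operand types, and the value must be an inline constant for that type. The sketch below is a simplified rendering of that first check (the name isFoldableACImm is made up here; only the operand-type range and the isInlineConstant() overload come from the patch):

```cpp
#include "SIInstrInfo.h"                 // isInlineConstant(), AMDGPU::OPERAND_* enums
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Sketch only: decide whether an immediate may be folded into operand OpIdx
// of MI, following the operand-type check performed by tryToFoldACImm().
static bool isFoldableACImm(const SIInstrInfo *TII, const MachineInstr &MI,
                            unsigned OpIdx, const MachineOperand &Imm) {
  const MCInstrDesc &Desc = MI.getDesc();
  if (!Desc.OpInfo || OpIdx >= Desc.getNumOperands() || !Imm.isImm())
    return false;

  // Only the new "AC" (AGPR-capable) inline-constant operand kinds qualify.
  uint8_t OpTy = Desc.OpInfo[OpIdx].OperandType;
  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return false;

  // The value itself must be encodable as an inline constant of that type.
  return TII->isInlineConstant(Imm, OpTy);
}
```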