diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 156 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 19 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 20 |
5 files changed, 186 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2247814cfe5..1eab96d5e18 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -509,20 +509,154 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( } } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; + int32_t MaxVGPR = -1; + int32_t MaxSGPR = -1; + uint32_t CalleeFrameSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: Check regmasks? Do they occur anywhere except calls? + for (const MachineOperand &MO : MI.operands()) { + unsigned Width = 0; + bool IsSGPR = false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SCC: + case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + continue; + + case AMDGPU::NoRegister: + assert(MI.isDebugValue()); + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + Info.UsesVCC = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + continue; + + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("trap handler registers should not be used"); + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { + IsSGPR = false; + Width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + IsSGPR = true; + Width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + IsSGPR = true; + Width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { + IsSGPR = false; + Width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + IsSGPR = true; + Width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned HWReg = TRI.getHWRegIndex(Reg); + int MaxUsed = HWReg + Width - 1; + if (IsSGPR) { + MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else { + MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; + } + } + + if (MI.isCall()) { + assert(MI.getOpcode() == AMDGPU::SI_CALL); + // Pseudo used just to encode the underlying global. Is there a better + // way to track this? + const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal()); + if (Callee->isDeclaration()) { + // If this is a call to an external function, we can't do much. Make + // conservative guesses. + + // 48 SGPRs - vcc, - flat_scr, -xnack + int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, + ST.hasFlatAddressSpace()); + MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); + MaxVGPR = std::max(MaxVGPR, 23); + + CalleeFrameSize = std::max(CalleeFrameSize, 16384u); + Info.UsesVCC = true; + Info.UsesFlatScratch = ST.hasFlatAddressSpace(); + Info.HasDynamicallySizedStack = true; + } else { + // We force CodeGen to run in SCC order, so the callee's register + // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); + assert(I != CallGraphResourceInfo.end() && + "callee should have been handled before caller"); + + MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); + MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + CalleeFrameSize + = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); + Info.UsesVCC |= I->second.UsesVCC; + Info.UsesFlatScratch |= I->second.UsesFlatScratch; + Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; + Info.HasRecursion |= I->second.HasRecursion; + } + + if (!Callee->doesNotRecurse()) + Info.HasRecursion = true; + } } } - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; + Info.NumExplicitSGPR = MaxSGPR + 1; + Info.NumVGPR = MaxVGPR + 1; + Info.PrivateSegmentSize += CalleeFrameSize; return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index c665bc38f4b..ba52c3ae1a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -135,6 +135,11 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { // do that with a single pseudo source operation. if (Opcode == AMDGPU::S_SETPC_B64_return) Opcode = AMDGPU::S_SETPC_B64; + else if (Opcode == AMDGPU::SI_CALL) { + // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the + // called function. + Opcode = AMDGPU::S_SWAPPC_B64; + } int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 714aebbafae..854000d1c41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -486,7 +486,10 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) {} + : AMDGPUPassConfig(TM, PM) { + // It is necessary to know the register usage of the entire call graph. + setRequiresCodeGenSCCOrder(EnableAMDGPUFunctionCalls); + } GCNTargetMachine &getGCNTargetMachine() const { return getTM<GCNTargetMachine>(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1d88b5d78d7..47a5aa4b0ce 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2650,14 +2650,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); return BB; } - case AMDGPU::SI_CALL: { + case AMDGPU::SI_CALL_ISEL: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const DebugLoc &DL = MI.getDebugLoc(); unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned GlobalAddrReg = MI.getOperand(0).getReg(); + MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); + assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); + + const GlobalValue *G = PCRel->getOperand(1).getGlobal(); + MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) + .add(MI.getOperand(0)) + .addGlobalAddress(G); + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) MIB.add(MI.getOperand(I)); + + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c8b208e69a5..50e806188a9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -327,16 +327,28 @@ def SI_RETURN : SPseudoInstSI < let SchedRW = [WriteBranch]; } -// Return for returning function calls. -def SI_CALL : SPseudoInstSI < - (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)], - "; call $src0"> { +// Return for returning function calls without output register. +// +// This version is only needed so we can fill in the output regiter in +// the custom inserter. +def SI_CALL_ISEL : SPseudoInstSI < + (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> { let Size = 4; let isCall = 1; let SchedRW = [WriteBranch]; let usesCustomInserter = 1; } +// Wrapper around s_swappc_b64 with extra $callee parameter to track +// the called function after regalloc. +def SI_CALL : SPseudoInstSI < + (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { + let Size = 4; + let isCall = 1; + let SchedRW = [WriteBranch]; +} + + def ADJCALLSTACKUP : SPseudoInstSI< (outs), (ins i32imm:$amt0, i32imm:$amt1), [(callseq_start timm:$amt0, timm:$amt1)], |