diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 57 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 38 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 24 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 19 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 |
7 files changed, 102 insertions, 56 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 7d20509c464..bb875c9b9b5 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -21,19 +21,8 @@ using namespace llvm; static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, const MachineFrameInfo *FrameInfo) { - if (!FuncInfo->hasSpilledSGPRs()) - return false; - - if (FuncInfo->hasSpilledVGPRs()) - return false; - - for (int I = FrameInfo->getObjectIndexBegin(), - E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { - if (!FrameInfo->isSpillSlotObjectIndex(I)) - return false; - } - - return true; + return FuncInfo->hasSpilledSGPRs() && + (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects()); } static ArrayRef<MCPhysReg> getAllSGPR128() { @@ -67,6 +56,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineBasicBlock::iterator I = MBB.begin(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); @@ -84,6 +75,44 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } + if (MFI->hasFlatScratchInit()) { + // We don't need this if we only have spills since there is no user facing + // scratch. + + // TODO: If we know we don't have flat instructions earlier, we can omit + // this from the input registers. + // + // TODO: We only need to know if we access scratch space through a flat + // pointer. Because we only detect if flat instructions are used at all, + // this will be used more often than necessary on VI. + + DebugLoc DL; + + unsigned FlatScratchInitReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + // Copy the size in bytes. + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + + unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + + // Add wave offset in bytes to private base offset. + // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + + // Convert offset to 256-byte units. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitLo, RegState::Kill) + .addImm(8); + } + // If we reserved the original input registers, we don't need to copy to the // reserved registers. if (ScratchRsrcReg == PreloadedPrivateBufferReg) { @@ -96,7 +125,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. - MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); @@ -160,7 +188,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - MachineBasicBlock::iterator I = MBB.begin(); DebugLoc DL; if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d634e957d3..78bcc9aaf0a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -695,6 +695,12 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -822,8 +828,11 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to + // check all stack objects later. + if (HasStackObjects) + Info->setHasNonSpillStackObjects(true); if (ST.isAmdHsaOS()) { // TODO: Assume we will spill without optimizations. diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 37ba7eef3d6..edcfb0889bb 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -572,43 +572,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { AMDGPU::EXEC).addReg(AMDGPU::EXEC); } - // FIXME: This seems inappropriate to do here. if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); + // We will need to Initialize the flat scratch register pair. + if (NeedFlat) + MFI->setHasFlatInstructions(true); } return true; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 49677fc2b0a..c5ecfd0ac73 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -54,6 +54,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + HasNonSpillStackObjects(false), + HasFlatInstructions(false), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -93,6 +95,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; + bool MaySpill = ST.isVGPRSpillingEnabled(this); bool HasStackObjects = FrameInfo->hasStackObjects(); @@ -107,10 +114,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) DispatchPtr = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; + // We don't need to worry about accessing spills with flat instructions. + // TODO: On VI where we must use flat for global, we should be able to omit + // this if it is never used for generic access. + if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS && + ST.isAmdHsaOS()) + FlatScratchInit = true; } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -142,6 +151,13 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return KernargSegmentPtrUserSGPR; } +unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { + FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return FlatScratchInitUserSGPR; +} + SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, unsigned FrameIndex, diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 846ee5de057..787b3bb7a75 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -73,6 +73,8 @@ public: private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; + bool HasNonSpillStackObjects; + bool HasFlatInstructions; // Feature bits required for inputs passed in user SGPRs. bool PrivateSegmentBuffer : 1; @@ -129,6 +131,7 @@ public: unsigned addDispatchPtr(const SIRegisterInfo &TRI); unsigned addQueuePtr(const SIRegisterInfo &TRI); unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + unsigned addFlatScratchInit(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -277,6 +280,22 @@ public: HasSpilledVGPRs = Spill; } + bool hasNonSpillStackObjects() const { + return HasNonSpillStackObjects; + } + + void setHasNonSpillStackObjects(bool StackObject = true) { + HasNonSpillStackObjects = StackObject; + } + + bool hasFlatInstructions() const { + return HasFlatInstructions; + } + + void setHasFlatInstructions(bool UseFlat = true) { + HasFlatInstructions = UseFlat; + } + unsigned getPSInputAddr() const { return PSInputAddr; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ef1c25b4304..384275f7534 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -649,6 +649,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_ID: + llvm_unreachable("unimplemented"); + case SIRegisterInfo::FLAT_SCRATCH_INIT: + assert(MFI->hasFlatScratchInit()); + return MFI->FlatScratchInitUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); return MFI->DispatchPtrUserSGPR; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 534bde04d3c..76eaa2cdc60 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -121,10 +121,12 @@ public: enum PreloadedValue { // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, QUEUE_PTR = 2, KERNARG_SEGMENT_PTR = 3, + DISPATCH_ID = 4, + FLAT_SCRATCH_INIT = 5, WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, |