| author | Konstantin Zhuravlyov <kzhuravl_dev@outlook.com> | 2016-06-25 03:11:28 +0000 |
|---|---|---|
| committer | Konstantin Zhuravlyov <kzhuravl_dev@outlook.com> | 2016-06-25 03:11:28 +0000 |
| commit | f2f3d14774e6019048dccd1ecbca340ae5bbfca1 | |
| tree | 65f692e2c2223031bede6426b5f4d1e6646d5655 | /llvm/lib/Target/AMDGPU |
| parent | 92d33bd2af3452b7272f8d27a75c0e344ae20f38 | |
| download | bcm5719-llvm-f2f3d14774e6019048dccd1ecbca340ae5bbfca1.tar.gz, bcm5719-llvm-f2f3d14774e6019048dccd1ecbca340ae5bbfca1.zip | |
[AMDGPU] Emit debugger prologue and emit the rest of the debugger fields in the kernel code header
The debugger prologue is emitted if -mattr=+amdgpu-debugger-emit-prologue is specified.
The debugger prologue writes the work group IDs and work item IDs to scratch memory at a fixed location, in the following format (a layout sketch follows the list):
- offset 0: work group ID x
- offset 4: work group ID y
- offset 8: work group ID z
- offset 16: work item ID x
- offset 20: work item ID y
- offset 24: work item ID z
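The jump from offset 8 to offset 16 leaves offset 12 unused. Below is a minimal C++ sketch of this layout as a host-side tool might model it when parsing a scratch dump; the struct and field names are illustrative and not part of this change.

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative mirror of the per-work-item scratch area written by the
// debugger prologue, using the offsets documented in the commit message.
struct DebuggerPrologueScratchLayout {
  uint32_t WorkGroupIDX; // offset 0
  uint32_t WorkGroupIDY; // offset 4
  uint32_t WorkGroupIDZ; // offset 8
  uint32_t Reserved;     // offset 12, padding implied by the 8 -> 16 jump
  uint32_t WorkItemIDX;  // offset 16
  uint32_t WorkItemIDY;  // offset 20
  uint32_t WorkItemIDZ;  // offset 24
};

// Sanity checks that the struct matches the documented offsets.
static_assert(offsetof(DebuggerPrologueScratchLayout, WorkGroupIDZ) == 8,
              "work group ID z must sit at offset 8");
static_assert(offsetof(DebuggerPrologueScratchLayout, WorkItemIDX) == 16,
              "work item ID x must sit at offset 16");
```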
Set the following amd_kernel_code_t fields (a consumer-side sketch follows this list):
- amd_kernel_code_t::debug_wavefront_private_segment_offset_sgpr to the scratch wave offset register
- amd_kernel_code_t::debug_private_segment_buffer_sgpr to the scratch resource (rsrc) register
- amd_kernel_code_t::is_debug_supported to true if all debugger features are enabled
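As a consumer-side illustration, a debugger reading the emitted kernel code object could use these fields roughly as follows. This is a hedged sketch: it assumes the amd_kernel_code_t definition and the AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED bit from the target's AMDKernelCodeT.h header, and the helper name is hypothetical.

```cpp
#include <cstdio>

#include "AMDKernelCodeT.h" // assumed include path for amd_kernel_code_t

// Hypothetical helper: report which SGPRs a debugger should read to locate
// the scratch backing store for the kernel described by Header.
void reportDebugRegisters(const amd_kernel_code_t &Header) {
  if (!(Header.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED)) {
    std::printf("kernel was not built with all debugger features enabled\n");
    return;
  }
  // SGPR holding the scratch wave offset for the entire kernel execution.
  std::printf("scratch wave offset: s%u\n",
              unsigned(Header.debug_wavefront_private_segment_offset_sgpr));
  // First of the four SGPRs holding the scratch resource descriptor (V#).
  std::printf("scratch rsrc descriptor: s%u..s%u\n",
              unsigned(Header.debug_private_segment_buffer_sgpr),
              unsigned(Header.debug_private_segment_buffer_sgpr) + 3);
}
```

Per the AMDGPUAsmPrinter change in the diff below, is_debug_supported is realized as the AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED bit in code_properties.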
Differential Revision: http://reviews.llvm.org/D20335
llvm-svn: 273769
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 7 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 27 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 10 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 10 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 50 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.h | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 31 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 10 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 60 |
11 files changed, 207 insertions, 5 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index dfd65fadf64..7736fd6c4cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -329,6 +329,13 @@ def FeatureDebuggerReserveRegs : SubtargetFeature<
   "Reserve registers for debugger usage"
 >;
 
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+  "amdgpu-debugger-emit-prologue",
+  "DebuggerEmitPrologue",
+  "true",
+  "Emit debugger prologue"
+>;
+
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b18c97acd72..c7784d7a76c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -200,6 +200,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       OutStreamer->emitRawComment(" ReservedVGPRCount: " +
                                   Twine(KernelInfo.ReservedVGPRCount), false);
 
+      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
+        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+          Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+          Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+      }
+
       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), false);
 
@@ -444,6 +451,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     MaxVGPR += MFI->getDebuggerReservedVGPRCount();
   }
 
+  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+  // attribute was specified.
+  if (STM.debuggerEmitPrologue()) {
+    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+      RI->getHWRegIndex(MFI->getScratchRSrcReg());
+  }
+
   // We found the maximum register index. They start at 0, so add one to get the
   // number of registers.
   ProgInfo.NumVGPR = MaxVGPR + 1;
@@ -670,6 +687,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   if (MFI->hasDispatchPtr())
     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
+  if (STM.debuggerSupported())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
   if (STM.isXNACKEnabled())
     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
@@ -681,6 +701,13 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
   header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
 
+  if (STM.debuggerEmitPrologue()) {
+    header.debug_wavefront_private_segment_offset_sgpr =
+      KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+    header.debug_private_segment_buffer_sgpr =
+      KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+  }
+
   AMDGPUTargetStreamer *TS =
       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 87f953fce27..2d44ffefd80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -42,6 +42,8 @@ private:
       FlatUsed(false),
       ReservedVGPRFirst(0),
       ReservedVGPRCount(0),
+      DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+      DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
       VCCUsed(false),
       CodeLen(0) {}
 
@@ -75,6 +77,14 @@ private:
     // The number of consecutive VGPRs reserved.
     uint16_t ReservedVGPRCount;
 
+    // Fixed SGPR number used to hold wave scratch offset for entire kernel
+    // execution, or uint16_t(-1) if the register is not used or not known.
+    uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+    // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+    // kernel execution, or uint16_t(-1) if the register is not used or not
+    // known.
+    uint16_t DebuggerPrivateSegmentBufferSGPR;
+
     // Bonus information for debugging.
     bool VCCUsed;
     uint64_t CodeLen;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c6ac84bd3a3..e973f8e4837 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -101,6 +101,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     EnableXNACK(false),
     DebuggerInsertNops(false),
     DebuggerReserveRegs(false),
+    DebuggerEmitPrologue(false),
 
     EnableVGPRSpilling(false),
     EnablePromoteAlloca(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 3c1bb5c07f0..53117e3cb60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -77,6 +77,7 @@ protected:
   bool EnableXNACK;
   bool DebuggerInsertNops;
   bool DebuggerReserveRegs;
+  bool DebuggerEmitPrologue;
 
   // Used as options.
   bool EnableVGPRSpilling;
@@ -402,6 +403,11 @@ public:
     return EnableSIScheduler;
   }
 
+  bool debuggerSupported() const {
+    return debuggerInsertNops() && debuggerReserveRegs() &&
+           debuggerEmitPrologue();
+  }
+
   bool debuggerInsertNops() const {
     return DebuggerInsertNops;
   }
@@ -410,6 +416,10 @@ public:
     return DebuggerReserveRegs;
   }
 
+  bool debuggerEmitPrologue() const {
+    return DebuggerEmitPrologue;
+  }
+
   bool loadStoreOptEnabled() const {
     return EnableLoadStoreOpt;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 1f3b361175e..03b11f0fd38 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -39,6 +39,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs() {
 
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
+  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+  // specified.
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  if (ST.debuggerEmitPrologue())
+    emitDebuggerPrologue(MF, MBB);
+
   if (!MF.getFrameInfo()->hasStackObjects())
     return;
 
@@ -54,7 +60,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
     return;
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -87,6 +92,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   // pointer. Because we only detect if flat instructions are used at all,
   // this will be used more often than necessary on VI.
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
   DebugLoc DL;
 
   unsigned FlatScratchInitReg
@@ -289,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
       RS->addScavengingFrameIndex(ScavengeFI);
     }
   }
 }
+
+void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+                                           MachineBasicBlock &MBB) const {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL;
+
+  // For each dimension:
+  for (unsigned i = 0; i < 3; ++i) {
+    // Get work group ID SGPR, and make it live-in again.
+    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
+    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
+    MBB.addLiveIn(WorkGroupIDSGPR);
+
+    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
+    // order to spill it to scratch.
+    unsigned WorkGroupIDVGPR =
+      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
+      .addReg(WorkGroupIDSGPR);
+
+    // Spill work group ID.
+    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
+    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
+      WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+
+    // Get work item ID VGPR, and make it live-in again.
+    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
+    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
+    MBB.addLiveIn(WorkItemIDVGPR);
+
+    // Spill work item ID.
+    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
+    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
+      WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+  }
+}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index c2e7a710817..37417d098f3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -29,6 +29,10 @@ public:
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
     RegScavenger *RS = nullptr) const override;
+
+private:
+  /// \brief Emits debugger prologue.
+  void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3142e4c8f42..ad02c4113ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -596,6 +596,11 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
+  // Create stack objects that are used for emitting debugger prologue if
+  // "amdgpu-debugger-emit-prologue" attribute was specified.
+  if (ST.debuggerEmitPrologue())
+    createDebuggerPrologueStackObjects(MF);
+
   SmallVector<ISD::InputArg, 16> Splits;
   BitVector Skipped(Ins.size());
 
@@ -1258,6 +1263,32 @@ bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   }
 }
 
+void SITargetLowering::createDebuggerPrologueStackObjects(
+    MachineFunction &MF) const {
+  // Create stack objects that are used for emitting debugger prologue.
+  //
+  // Debugger prologue writes work group IDs and work item IDs to scratch memory
+  // at fixed location in the following format:
+  //   offset 0:  work group ID x
+  //   offset 4:  work group ID y
+  //   offset 8:  work group ID z
+  //   offset 16: work item ID x
+  //   offset 20: work item ID y
+  //   offset 24: work item ID z
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  int ObjectIdx = 0;
+
+  // For each dimension:
+  for (unsigned i = 0; i < 3; ++i) {
+    // Create fixed stack object for work group ID.
+    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+    // Create fixed stack object for work item ID.
+    ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+  }
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 032372b7b17..3806e95dff3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -70,6 +70,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   bool isCFIntrinsic(const SDNode *Intr) const;
+
+  void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
 public:
   SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 6daebbaa0e4..0904d7dc8ab 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     ReturnsVoid(true),
     MaximumWorkGroupSize(0),
     DebuggerReservedVGPRCount(0),
+    DebuggerWorkGroupIDStackObjectIndices{0, 0, 0},
+    DebuggerWorkItemIDStackObjectIndices{0, 0, 0},
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -92,16 +94,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkItemIDX = true;
   }
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+  if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
     WorkGroupIDY = true;
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+  if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
     WorkGroupIDZ = true;
 
-  if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+  if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
     WorkItemIDY = true;
 
-  if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+  if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
     WorkItemIDZ = true;
 
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ee2f722aba5..0ad25f0cae4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -64,6 +64,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 
   // Number of reserved VGPRs for debugger usage.
   unsigned DebuggerReservedVGPRCount;
+  // Stack object indices for work group IDs.
+  int DebuggerWorkGroupIDStackObjectIndices[3];
+  // Stack object indices for work item IDs.
+  int DebuggerWorkItemIDStackObjectIndices[3];
 
 public:
   // FIXME: Make private
@@ -334,6 +338,62 @@ public:
     return DebuggerReservedVGPRCount;
   }
 
+  /// \returns Stack object index for \p Dim's work group ID.
+  int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkGroupIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+  void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns Stack object index for \p Dim's work item ID.
+  int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkItemIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+  void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns SGPR used for \p Dim's work group ID.
+  unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkGroupIDX());
+      return WorkGroupIDXSystemSGPR;
+    case 1:
+      assert(hasWorkGroupIDY());
+      return WorkGroupIDYSystemSGPR;
+    case 2:
+      assert(hasWorkGroupIDZ());
+      return WorkGroupIDZSystemSGPR;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  /// \returns VGPR used for \p Dim' work item ID.
+  unsigned getWorkItemIDVGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkItemIDX());
+      return AMDGPU::VGPR0;
+    case 1:
+      assert(hasWorkItemIDY());
+      return AMDGPU::VGPR1;
+    case 2:
+      assert(hasWorkItemIDZ());
+      return AMDGPU::VGPR2;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
   unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
```

