summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 23
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp 61
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFrameLowering.h 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp 8
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal.ll 47
6 files changed, 141 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0facae0992b..85b056e5c82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -895,19 +895,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->EmitIntValue(RsrcReg, 4);
OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
+ unsigned Rsrc2Val = 0;
if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
+ Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0);
+ }
+ if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
+ OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
+ }
+ if (Rsrc2Val) {
+ OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4);
+ OutStreamer->EmitIntValue(Rsrc2Val, 4);
}
- }
-
- if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index ff6fed88e37..37f5665be50 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -219,7 +219,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- auto AMDGPUASI = ST.getAMDGPUAS();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -356,7 +355,65 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+ if (ResourceRegUsed)
+ emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
+ PreloadedPrivateBufferReg, ScratchRsrcReg);
+}
+
+// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
+void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
+ MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
+ MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
+ unsigned ScratchRsrcReg) const {
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ DebugLoc DL;
+ auto AMDGPUASI = ST.getAMDGPUAS();
+
+ if (ST.isAmdPalOS()) {
+ // The pointer to the GIT is formed from the low-half offset passed in s0 and,
+ // for the high half, the amdgpu-git-ptr-high attribute or the top of the PC.
+ unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+
+ if (MFI->getGITPtrHigh() != 0xffffffff) {
+ BuildMI(MBB, I, DL, SMovB32, RsrcHi)
+ .addImm(MFI->getGITPtrHigh())
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+ BuildMI(MBB, I, DL, GetPC64, Rsrc01);
+ }
+ BuildMI(MBB, I, DL, SMovB32, RsrcLo)
+ .addReg(AMDGPU::SGPR0) // Low address passed in
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ // We now have the GIT ptr; load the scratch descriptor from the entry at
+ // offset 0.
+ PointerType *PtrTy =
+ PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
+ auto MMO = MF.getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 0, 0);
+ BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
+ .addReg(Rsrc01)
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
+ .addMemOperand(MMO);
+ return;
+ }
+ if (ST.isMesaGfxShader(MF)
+ || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
assert(!ST.isAmdCodeObjectV2(MF));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index cc1c85ff6bf..df6f1632a31 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -69,6 +69,12 @@ private:
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
+ void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF,
+ MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
+ MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
+ unsigned ScratchRsrcReg) const;
+
public:
bool hasFP(const MachineFunction &MF) const override;
bool hasSP(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ebb83fea1fd..0a92cd17654 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,7 +48,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDY(false),
WorkItemIDZ(false),
ImplicitBufferPtr(false),
- ImplicitArgPtr(false) {
+ ImplicitArgPtr(false),
+ GITPtrHigh(0xffffffff) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
@@ -160,6 +161,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
FlatScratchInit = true;
}
+
+ Attribute A = F->getFnAttribute("amdgpu-git-ptr-high");
+ StringRef S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, GITPtrHigh);
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 242b41a5908..ade909cc84e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -185,6 +185,11 @@ private:
// user arguments. This is an offset from the KernargSegmentPtr.
bool ImplicitArgPtr : 1;
+ // The hard-wired high half of the address of the global information table
+ // for AMDPAL OS type. 0xffffffff means no hard-wired high half; it is free
+ // to act as the sentinel because current hardware only allows a 16-bit value.
+ unsigned GITPtrHigh;
+
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -406,6 +411,10 @@ public:
return ArgInfo.getPreloadedValue(Value).first->getRegister();
}
+ unsigned getGITPtrHigh() const {
+ return GITPtrHigh;
+ }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll
index 61205d0cdb8..3c8a490b40e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefix=PAL %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefix=PAL --enable-var-scope %s
; PAL: .AMDGPU.config
@@ -8,3 +8,48 @@ entry:
ret void
}
+; Check code sequence for amdpal use of scratch for alloca. This is the case
+; where the high half of the address comes from s_getpc.
+
+; PAL-LABEL: {{^}}scratch:
+; PAL: s_getpc_b64 s{{\[}}[[GITPTR:[0-9]+]]:
+; PAL: s_mov_b32 s[[GITPTR]], s0
+; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
+; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:
+
+define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32* %out) {
+entry:
+ %v = alloca [2 x i32]
+ %vv = bitcast [2 x i32]* %v to <2 x i32>*
+ store <2 x i32> %in, <2 x i32>* %vv
+ %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
+ %x = load i32, i32* %e
+ store i32 %x, i32* %out
+ ret void
+}
+
+; Check code sequence for amdpal use of scratch for alloca. This is the case
+; where the amdgpu-git-ptr-high function attribute gives the high half of the
+; address to use.
+; FileCheck variables do not appear to support arithmetic, so we can't check
+; that the s_movk_i32 writes the register numbered one higher than the
+; destination of the following s_mov_b32.
+
+; PAL-LABEL: {{^}}scratch2:
+; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234
+; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0
+; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
+; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:
+
+define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32* %out) #0 {
+entry:
+ %v = alloca [2 x i32]
+ %vv = bitcast [2 x i32]* %v to <2 x i32>*
+ store <2 x i32> %in, <2 x i32>* %vv
+ %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
+ %x = load i32, i32* %e
+ store i32 %x, i32* %out
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-git-ptr-high"="0x1234" }
OpenPOWER on IntegriCloud