summary | refs | log | tree | commit | diff | stats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--	llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp	| 2
-rw-r--r--	llvm/lib/Target/AMDGPU/SIISelLowering.cpp	| 22
-rw-r--r--	llvm/lib/Target/AMDGPU/SIInsertWaits.cpp	| 43
-rw-r--r--	llvm/lib/Target/AMDGPU/SIInstrInfo.cpp	| 17
-rw-r--r--	llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp	| 275
-rw-r--r--	llvm/lib/Target/AMDGPU/SIRegisterInfo.h	| 11
-rw-r--r--	llvm/lib/Target/AMDGPU/SIRegisterInfo.td	| 8
-rw-r--r--	llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp	| 2
8 files changed, 284 insertions, 96 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4d55a663d48..bf493c9fd38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -253,7 +253,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
- return AMDGPU::SReg_32RegClassID;
+ return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
case 4:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6deee51a064..04edc91f0a7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -59,7 +59,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -79,8 +79,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
}
computeRegisterProperties(STI.getRegisterInfo());
@@ -941,25 +941,25 @@ SDValue SITargetLowering::LowerFormalArguments(
// Start adding system SGPRs.
if (Info->hasWorkGroupIDX()) {
unsigned Reg = Info->addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupIDZ()) {
unsigned Reg = Info->addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupInfo()) {
unsigned Reg = Info->addWorkGroupInfo();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -2414,15 +2414,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
@@ -4182,7 +4182,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
default:
return std::make_pair(0U, nullptr);
case 32:
- return std::make_pair(0U, &AMDGPU::SReg_32RegClass);
+ return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
case 64:
return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
case 128:
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index a9e693917bf..da4db63ab33 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -532,6 +532,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
IV = getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
HardwareLimits.Named.VM = getVmcntBitMask(IV);
HardwareLimits.Named.EXP = getExpcntBitMask(IV);
@@ -543,20 +544,27 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
IsFlatOutstanding = false;
- ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
+ ReturnsVoid = MFI->returnsVoid();
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
SmallVector<MachineInstr *, 4> RemoveMI;
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
+
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
// There is a hardware bug on CI/SI where SMRD instruction may corrupt
// vccz bit, so when we detect that an instruction may read from a
@@ -625,12 +633,45 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN)
+ EndPgmBlocks.push_back(&MBB);
}
// Wait for everything at the end of the MBB
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ Changes = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
for (MachineInstr *I : RemoveMI)
I->eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 63ce2583581..85eca558f25 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -364,7 +364,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (RC == &AMDGPU::SReg_32RegClass) {
+ if (RC == &AMDGPU::SReg_32_XM0RegClass ||
+ RC == &AMDGPU::SReg_32RegClass) {
if (SrcReg == AMDGPU::SCC) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
.addImm(-1)
@@ -544,7 +545,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, OpDesc)
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -554,6 +555,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// needing them, and need to ensure that the reserved registers are
// correctly handled.
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
return;
}
@@ -643,12 +649,17 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ if (ST.hasScalarStores()) {
+ // m0 is used for offset to scalar stores if used to spill.
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ }
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 54fcbb507c8..1d933da47c2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,6 +24,12 @@
using namespace llvm;
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
+
+
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -237,7 +243,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
@@ -401,28 +407,36 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
- const MachineOperand *SrcDst,
+ int Index,
+ unsigned ValueReg,
+ bool IsKill,
unsigned ScratchRsrcReg,
- unsigned ScratchOffset,
- int64_t Offset,
+ unsigned ScratchOffsetReg,
+ int64_t InstOffset,
+ MachineMemOperand *MMO,
RegScavenger *RS) const {
- unsigned Value = SrcDst->getReg();
- bool IsKill = SrcDst->isKill();
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
- DebugLoc DL = MI->getDebugLoc();
- bool IsStore = MI->mayStore();
+ const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsStore = Desc.mayStore();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
- unsigned SOffset = ScratchOffset;
- unsigned OriginalImmOffset = Offset;
+ unsigned SOffset = ScratchOffsetReg;
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+ unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
unsigned Size = NumSubRegs * 4;
+ int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+ const int64_t OriginalImmOffset = Offset;
+
+ unsigned Align = MFI.getObjectAlignment(Index);
+ const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
if (!isUInt<12>(Offset + Size)) {
SOffset = AMDGPU::NoRegister;
@@ -441,19 +455,23 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
// subtract the offset after the spill to return ScratchOffset to it's
// original value.
RanOutOfSGPRs = true;
- SOffset = ScratchOffset;
+ SOffset = ScratchOffsetReg;
} else {
Scavenged = true;
}
+
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffset)
- .addImm(Offset);
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+
Offset = 0;
}
- for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+ const unsigned EltSize = 4;
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
unsigned SubReg = NumSubRegs == 1 ?
- Value : getSubReg(Value, getSubRegFromChannel(i));
+ ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -463,40 +481,65 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(!IsStore))
+ MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+ MachineMemOperand *NewMMO
+ = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+ EltSize, MinAlign(Align, EltSize * i));
+
+ auto MIB = BuildMI(*MBB, MI, DL, Desc)
+ .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addImm(0) // tfe
- .addReg(Value, RegState::Implicit | SrcDstRegState)
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addMemOperand(NewMMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
}
+
if (RanOutOfSGPRs) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
- .addReg(ScratchOffset)
- .addImm(OriginalImmOffset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+ .addReg(ScratchOffsetReg)
+ .addImm(OriginalImmOffset);
}
}
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS) const {
- MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const DebugLoc &DL = MI->getDebugLoc();
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ const unsigned EltSize = 4;
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
+
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
+ }
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
@@ -504,21 +547,43 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+ if (SpillToSMEM) {
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Add i * 4 wave offset.
+ //
+ // SMEM instructions only support a single offset, so increment the wave
+ // offset.
+
+ int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+ .addReg(SubReg, getKillRegState(IsKill)) // sdata
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ continue;
+ }
+
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
- if (SuperReg == AMDGPU::M0) {
- assert(NumSubRegs == 1);
- unsigned CopyM0
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
- .addReg(SuperReg, getKillRegState(IsKill));
-
- // The real spill now kills the temp copy.
- SubReg = SuperReg = CopyM0;
- IsKill = true;
- }
-
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -530,10 +595,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// it are fixed.
} else {
// Spill SGPR to a frame index.
- // FIXME we should use S_STORE_DWORD here for VI.
-
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ // TODO: Should VI try to spill to VGPR and then spill to SMEM?
MachineInstrBuilder Mov
= BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
@@ -550,13 +614,12 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
}
- unsigned Size = FrameInfo.getObjectSize(Index);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
.addReg(TmpReg, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
@@ -567,6 +630,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
}
}
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
+ }
+
MI->eraseFromParent();
MFI->addToSpilledSGPRs(NumSubRegs);
}
@@ -585,42 +653,86 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned SuperReg = MI->getOperand(0).getReg();
+ bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+ assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+ unsigned OffsetReg = AMDGPU::M0;
+ unsigned M0CopyReg = AMDGPU::NoRegister;
- // m0 is not allowed as with readlane/writelane, so a temporary SGPR and
- // extra copy is needed.
- bool IsM0 = (SuperReg == AMDGPU::M0);
- if (IsM0) {
- assert(NumSubRegs == 1);
- SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ if (SpillToSMEM) {
+ if (RS->isRegUsed(AMDGPU::M0)) {
+ M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+ .addReg(AMDGPU::M0);
+ }
}
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+ const unsigned EltSize = 4;
+
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = NumSubRegs == 1 ?
SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+ if (SpillToSMEM) {
+ unsigned Align = FrameInfo.getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ EltSize, MinAlign(Align, EltSize * i));
+
+ // Add i * 4 offset
+ int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+ if (Offset != 0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+ .addReg(MFI->getScratchRSrcReg()) // sbase
+ .addReg(OffsetReg, RegState::Kill) // soff
+ .addImm(0) // glc
+ .addMemOperand(MMO);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+
+ continue;
+ }
+
SIMachineFunctionInfo::SpilledReg Spill
= MFI->getSpilledReg(MF, Index, i);
if (Spill.hasReg()) {
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
.addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ .addImm(Spill.Lane);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
} else {
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
-
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned Align = FrameInfo.getObjectAlignment(Index);
- unsigned Size = FrameInfo.getObjectSize(Index);
MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index);
+ = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad, EltSize,
+ MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
.addFrameIndex(Index) // vaddr
@@ -628,16 +740,19 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
.addReg(MFI->getScratchWaveOffsetReg()) // soffset
.addImm(i * 4) // offset
.addMemOperand(MMO);
- BuildMI(*MBB, MI, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+
+ auto MIB =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill);
+
+ if (NumSubRegs > 1)
+ MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
}
- if (IsM0 && SuperReg != AMDGPU::M0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(SuperReg);
+ if (M0CopyReg != AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0CopyReg, RegState::Kill);
}
MI->eraseFromParent();
@@ -685,28 +800,38 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
- case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
MI->eraseFromParent();
break;
+ }
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
case AMDGPU::SI_SPILL_V128_RESTORE:
case AMDGPU::SI_SPILL_V256_RESTORE:
case AMDGPU::SI_SPILL_V512_RESTORE: {
+ const MachineOperand *VData = TII->getNamedOperand(*MI,
+ AMDGPU::OpName::vdata);
+
buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- TII->getNamedOperand(*MI, AMDGPU::OpName::vdata),
+ Index,
+ VData->getReg(), VData->isKill(),
TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
- FrameInfo.getObjectOffset(Index) +
- TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+ *MI->memoperands_begin(),
+ RS);
MI->eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index ed960c97f7b..bd83ef1b403 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -253,9 +253,14 @@ public:
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
- unsigned LoadStoreOp, const MachineOperand *SrcDst,
- unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset,
+ unsigned LoadStoreOp,
+ int Index,
+ unsigned ValueReg,
+ bool ValueIsKill,
+ unsigned ScratchRsrcReg,
+ unsigned ScratchOffsetReg,
+ int64_t InstrOffset,
+ MachineMemOperand *MMO,
RegScavenger *RS) const;
};
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index d1907d16aba..5bdd8bec54e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -120,6 +120,11 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
let isAllocatable = 0;
}
+def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
+ let CopyCost = 1;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
@@ -259,8 +264,9 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
- (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
+ (add SReg_32_XM0, M0_CLASS)> {
let AllocationPriority = 1;
+ let isAllocatable = 0;
}
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 21518389b8b..76b0b4573fc 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -437,7 +437,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
- unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr *Save =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
OpenPOWER on IntegriCloud