summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp162
1 files changed, 108 insertions, 54 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 03efd1f3326..ecf1d5abb9f 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -69,6 +69,25 @@ enum {
StateExact = 0x2,
};
+struct PrintState {
+public:
+ explicit PrintState(int State) : State(State) {}
+
+ int State;
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+ if (PS.State & StateWQM)
+ OS << "WQM";
+ if (PS.State & StateExact) {
+ if (PS.State & StateWQM)
+ OS << '|';
+ OS << "Exact";
+ }
+
+ return OS;
+}
+
struct InstrInfo {
char Needs = 0;
char OutNeeds = 0;
@@ -98,11 +117,13 @@ private:
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
+ void printInfo();
+
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
+ void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -151,6 +172,24 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
+void SIWholeQuadMode::printInfo() {
+ for (const auto &BII : Blocks) {
+ dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+ << " InNeeds = " << PrintState(BII.second.InNeeds)
+ << ", Needs = " << PrintState(BII.second.Needs)
+ << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+ for (const MachineInstr &MI : *BII.first) {
+ auto III = Instructions.find(&MI);
+ if (III == Instructions.end())
+ continue;
+
+ dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
+ << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+ }
+ }
+}
+
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
@@ -168,6 +207,45 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
Worklist.push_back(&MI);
}
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+ std::vector<WorkItem> &Worklist) {
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
+ continue;
+
+ unsigned Reg = Use.getReg();
+
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Reg == AMDGPU::EXEC)
+ continue;
+
+ for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
+ continue;
+
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ Worklist);
+ }
+
+ continue;
+ }
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, StateWQM, Worklist);
+ }
+}
+
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +261,19 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode)) {
Flags = StateWQM;
+ } else if (TII->isWQM(Opcode)) {
+ // Sampling instructions don't need to produce results for all pixels
+ // in a quad, they just require all inputs of a quad to have been
+ // computed for derivatives.
+ markUsesWQM(MI, Worklist);
+ GlobalFlags |= StateWQM;
+ continue;
} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
- // Handle export instructions with the exec mask valid flag set
- if (Opcode == AMDGPU::EXP) {
- if (MI.getOperand(4).getImm() != 0)
- ExecExports.push_back(&MI);
- } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+ if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@@ -259,43 +340,9 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
- if (II.Needs != StateWQM)
- return;
-
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
-
- unsigned Reg = Use.getReg();
-
- // Handle physical registers that we need to track; this is mostly relevant
- // for VCC, which can appear as the (implicit) input of a uniform branch,
- // e.g. when a loop counter is stored in a VGPR.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (Reg == AMDGPU::EXEC)
- continue;
-
- for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
-
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
-
- markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
- Worklist);
- }
- continue;
- }
-
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, StateWQM, Worklist);
- }
+ if (II.Needs == StateWQM)
+ markUsesWQM(MI, Worklist);
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -395,9 +442,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;
+ DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;
+ MachineInstr *FirstNonWQM = nullptr;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {
@@ -428,15 +478,16 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (InstrInfoIt != Instructions.end()) {
Needs = InstrInfoIt->second.Needs;
OutNeeds = InstrInfoIt->second.OutNeeds;
-
- // Make sure to switch to Exact mode before the end of the block when
- // Exact and only Exact is needed further downstream.
- if (OutNeeds == StateExact && MI.isTerminator()) {
- assert(Needs == 0);
- Needs = StateExact;
- }
}
+ // Keep track of the first consecutive non-WQM instruction, so that we
+ // switch away from WQM as soon as possible, potentially saving a small
+ // bit of bandwidth on loads.
+ if (Needs == StateWQM)
+ FirstNonWQM = nullptr;
+ else if (!FirstNonWQM)
+ FirstNonWQM = &MI;
+
// State switching
if (Needs && State != Needs) {
if (Needs == StateExact) {
@@ -445,7 +496,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+ toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
} else {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, &MI, SavedWQMReg);
@@ -455,7 +506,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
State = Needs;
}
- if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)
+ if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);
}
@@ -463,7 +514,9 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, MBB.end(), SavedWQMReg);
} else if (BI.OutNeeds == StateExact && State != StateExact) {
- toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+ : MBB.getFirstTerminator(),
+ 0, LiveMaskReg);
}
}
@@ -483,7 +536,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
- ExecExports.clear();
LiveMaskQueries.clear();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -523,6 +575,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
+ DEBUG(printInfo());
+
lowerLiveMaskQueries(LiveMaskReg);
// Handle the general case
OpenPOWER on IntegriCloud