summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp 162
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.ll 65
2 files changed, 145 insertions, 82 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 03efd1f3326..ecf1d5abb9f 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -69,6 +69,25 @@ enum {
StateExact = 0x2,
};
+struct PrintState { // Tags a state mask for the symbolic operator<< below.
+public: // No-op: struct members are already public by default.
+ explicit PrintState(int State) : State(State) {}
+
+ int State; // Bitmask tested against StateWQM / StateExact when printed.
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+ if (PS.State & StateWQM) // Output: "WQM", "Exact", "WQM|Exact", or nothing.
+ OS << "WQM";
+ if (PS.State & StateExact) {
+ if (PS.State & StateWQM) // Separator only when both flags are set.
+ OS << '|';
+ OS << "Exact";
+ }
+
+ return OS; // Returned so further << calls can be chained.
+}
+
struct InstrInfo {
char Needs = 0;
char OutNeeds = 0;
@@ -98,11 +117,13 @@ private:
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<const MachineInstr *, 2> ExecExports;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
+ void printInfo();
+
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
+ void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -151,6 +172,24 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
return new SIWholeQuadMode;
}
+void SIWholeQuadMode::printInfo() { // Debug dump of Blocks and Instructions.
+ for (const auto &BII : Blocks) { // NOTE(review): DenseMap order, presumably not layout order.
+ dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+ << " InNeeds = " << PrintState(BII.second.InNeeds)
+ << ", Needs = " << PrintState(BII.second.Needs)
+ << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+ for (const MachineInstr &MI : *BII.first) { // Instructions in block order.
+ auto III = Instructions.find(&MI);
+ if (III == Instructions.end()) // Nothing recorded for this instruction.
+ continue;
+
+ dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
+ << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+ }
+ }
+}
+
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
@@ -168,6 +207,45 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
Worklist.push_back(&MI);
}
+/// Mark all instructions defining the uses in \p MI as WQM (via markInstruction).
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+ std::vector<WorkItem> &Worklist) {
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse()) // Only register operands that are read.
+ continue;
+
+ unsigned Reg = Use.getReg();
+
+ // Handle physical registers that we need to track; this is mostly relevant
+ // for VCC, which can appear as the (implicit) input of a uniform branch,
+ // e.g. when a loop counter is stored in a VGPR.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Reg == AMDGPU::EXEC) // Reads of EXEC are not traced back to a def.
+ continue;
+
+ for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value) // No value live into MI for this register unit.
+ continue;
+
+ // Since we're in machine SSA, we do not need to track physical
+ // registers across basic blocks.
+ if (Value->isPHIDef())
+ continue;
+
+ markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+ Worklist);
+ }
+
+ continue; // Physical register done; only virtual registers reach below.
+ }
+
+ for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+ markInstruction(DefMI, StateWQM, Worklist); // Every def of the virtual reg.
+ }
+}
+
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +261,19 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode)) {
Flags = StateWQM;
+ } else if (TII->isWQM(Opcode)) {
+ // Sampling instructions don't need to produce results for all pixels
+ // in a quad, they just require all inputs of a quad to have been
+ // computed for derivatives.
+ markUsesWQM(MI, Worklist);
+ GlobalFlags |= StateWQM;
+ continue;
} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
- // Handle export instructions with the exec mask valid flag set
- if (Opcode == AMDGPU::EXP) {
- if (MI.getOperand(4).getImm() != 0)
- ExecExports.push_back(&MI);
- } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+ if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
@@ -259,43 +340,9 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM | StateExact));
- if (II.Needs != StateWQM)
- return;
-
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
-
- unsigned Reg = Use.getReg();
-
- // Handle physical registers that we need to track; this is mostly relevant
- // for VCC, which can appear as the (implicit) input of a uniform branch,
- // e.g. when a loop counter is stored in a VGPR.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (Reg == AMDGPU::EXEC)
- continue;
-
- for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
-
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
-
- markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
- Worklist);
- }
- continue;
- }
-
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, StateWQM, Worklist);
- }
+ if (II.Needs == StateWQM)
+ markUsesWQM(MI, Worklist);
}
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -395,9 +442,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
return;
+ DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
unsigned SavedWQMReg = 0;
bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;
+ MachineInstr *FirstNonWQM = nullptr;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
while (II != IE) {
@@ -428,15 +478,16 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (InstrInfoIt != Instructions.end()) {
Needs = InstrInfoIt->second.Needs;
OutNeeds = InstrInfoIt->second.OutNeeds;
-
- // Make sure to switch to Exact mode before the end of the block when
- // Exact and only Exact is needed further downstream.
- if (OutNeeds == StateExact && MI.isTerminator()) {
- assert(Needs == 0);
- Needs = StateExact;
- }
}
+ // Keep track of the first consecutive non-WQM instruction, so that we
+ // switch away from WQM as soon as possible, potentially saving a small
+ // bit of bandwidth on loads.
+ if (Needs == StateWQM)
+ FirstNonWQM = nullptr;
+ else if (!FirstNonWQM)
+ FirstNonWQM = &MI;
+
// State switching
if (Needs && State != Needs) {
if (Needs == StateExact) {
@@ -445,7 +496,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+ toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
} else {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, &MI, SavedWQMReg);
@@ -455,7 +506,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
State = Needs;
}
- if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)
+ if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);
}
@@ -463,7 +514,9 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, MBB.end(), SavedWQMReg);
} else if (BI.OutNeeds == StateExact && State != StateExact) {
- toExact(MBB, MBB.end(), 0, LiveMaskReg);
+ toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+ : MBB.getFirstTerminator(),
+ 0, LiveMaskReg);
}
}
@@ -483,7 +536,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
- ExecExports.clear();
LiveMaskQueries.clear();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -523,6 +575,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
}
}
+ DEBUG(printInfo());
+
lowerLiveMaskQueries(LiveMaskReg);
// Handle the general case
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index aef3ed0a253..80fa3d76784 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -37,8 +37,8 @@ main_body:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
@@ -63,7 +63,8 @@ main_body:
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+;CHECK: image_sample
+;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
%c.1 = mul i32 %c, %d
@@ -71,7 +72,9 @@ main_body:
call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- ret <4 x float> %tex
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %dtex
}
; Check a case of one branch of an if-else requiring WQM, the other requiring
@@ -91,6 +94,7 @@ main_body:
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
+;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
%cmp = icmp eq i32 %z, 0
@@ -98,7 +102,9 @@ main_body:
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %data.if = extractelement <4 x float> %tex, i32 0
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %dtex, i32 0
br label %END
ELSE:
@@ -118,6 +124,7 @@ END:
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
+;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
@@ -137,7 +144,9 @@ main_body:
IF:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %data.if = extractelement <4 x float> %tex, i32 0
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %data.if = extractelement <4 x float> %dtex, i32 0
br label %END
ELSE:
@@ -203,35 +212,27 @@ END:
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;CHECK: store
-;CHECK: load
+;CHECK: image_sample
;CHECK: store
;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %tex.1 = extractelement <4 x float> %tex, i32 0
-
- %idx.1 = extractelement <3 x i32> %idx, i32 0
- %data.1 = extractelement <2 x float> %data, i32 0
- call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
-
- %idx.2 = extractelement <3 x i32> %idx, i32 1
- %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %dtex.1 = extractelement <4 x float> %dtex, i32 0
- %idx.3 = extractelement <3 x i32> %idx, i32 2
- %data.3 = extractelement <2 x float> %data, i32 1
- call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
- %cc = fcmp ogt float %z, 0.0
+ %cc = fcmp ogt float %dtex.1, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
- %tex.IF = fmul float %tex.1, 3.0
+ %tex.IF = fmul float %dtex.1, 3.0
br label %END
ELSE:
- %tex.ELSE = fmul float %tex.1, 4.0
+ %tex.ELSE = fmul float %dtex.1, 4.0
br label %END
END:
@@ -246,12 +247,13 @@ END:
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
-;CHECK: load
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
+;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
main_body:
%cond = icmp eq i32 %y, 0
@@ -264,7 +266,9 @@ IF:
END:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- ret <4 x float> %tex
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret <4 x float> %dtex
}
; Kill is performed in WQM mode so that uniform kill behaves correctly ...
@@ -273,8 +277,8 @@ END:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
@@ -297,7 +301,9 @@ main_body:
call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %out = fadd <4 x float> %tex, %tex2
+ %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %out = fadd <4 x float> %tex, %dtex
ret <4 x float> %out
}
@@ -310,18 +316,21 @@ main_body:
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+ %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
call void @llvm.AMDGPU.kill(float %z)
- ret <4 x float> %tex
+ ret <4 x float> %dtex
}
; Check prolog shaders.
@@ -392,8 +401,8 @@ break:
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
-; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
OpenPOWER on IntegriCloud