summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp 29
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.ll 14
2 files changed, 41 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index dd133d37eb7..79796853497 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -154,6 +154,7 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
+ bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
@@ -161,7 +162,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();
- char Flags;
+ char Flags = 0;
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
@@ -175,15 +176,39 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
ExecExports.push_back(&MI);
} else if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
+ } else if (WQMOutputs) {
+ // The function is in machine SSA form, which means that physical
+ // VGPRs correspond to shader inputs and outputs. Inputs are
+ // only used, outputs are only defined.
+ for (const MachineOperand &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ if (!TRI->isVirtualRegister(Reg) &&
+ TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+ Flags = StateWQM;
+ break;
+ }
+ }
}
- continue;
+ if (!Flags)
+ continue;
}
Instructions[&MI].Needs = Flags;
Worklist.push_back(&MI);
GlobalFlags |= Flags;
}
+
+ if (WQMOutputs && MBB.succ_empty()) {
+ // This is a prolog shader. Make sure we go back to exact mode at the end.
+ Blocks[&MBB].OutNeeds = StateExact;
+ Worklist.push_back(&MBB);
+ GlobalFlags |= StateExact;
+ }
}
return GlobalFlags;
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 30915a82923..4eab0aec56b 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -332,6 +332,19 @@ main_body:
ret <4 x float> %tex
}
+; Check prolog shaders: the output is computed in WQM, then exec is restored.
+;
+; CHECK-LABEL: {{^}}test_prolog_1:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_add_f32_e32 v0,
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+main_body:
+ %s = fadd float %a, %b
+ ret float %s
+}
+
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
@@ -345,3 +358,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
+attributes #4 = { "amdgpu-ps-wqm-outputs" }
OpenPOWER on IntegriCloud