diff options
| author | Connor Abbott <cwabbott0@gmail.com> | 2017-08-04 18:36:54 +0000 |
|---|---|---|
| committer | Connor Abbott <cwabbott0@gmail.com> | 2017-08-04 18:36:54 +0000 |
| commit | 66b9bd6e503bdd87616ab04509ffb3f70220c281 (patch) | |
| tree | ef831c300d054fcfca5e13142fa969db41c3fbf5 /llvm/lib | |
| parent | 92638ab62559ce082c02680efca7280344ca05dd (diff) | |
| download | bcm5719-llvm-66b9bd6e503bdd87616ab04509ffb3f70220c281.tar.gz bcm5719-llvm-66b9bd6e503bdd87616ab04509ffb3f70220c281.zip | |
[AMDGPU] Implement llvm.amdgcn.set.inactive intrinsic
Summary:
This intrinsic lets us set inactive lanes to an identity value when
implementing wavefront reductions. In combination with Whole Wavefront
Mode, it lets inactive lanes be skipped over as required by GLSL/Vulkan.
Lowering the intrinsic needs to happen post-RA so that RA knows that the
destination isn't completely overwritten due to the EXEC shenanigans, so
we need another pseudo-instruction to represent the un-lowered
intrinsic.
Reviewers: tstellar, arsenm
Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye
Differential Revision: https://reviews.llvm.org/D34719
llvm-svn: 310088
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 22 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 14 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 31 |
3 files changed, 66 insertions, 1 deletion
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0b0d0388031..e9492c4cf9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1099,6 +1099,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_SET_INACTIVE_B32: { + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + } + case AMDGPU::V_SET_INACTIVE_B64: { + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); + expandPostRAPseudo(*Copy); + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + } case AMDGPU::V_MOVRELD_B32_V1: case AMDGPU::V_MOVRELD_B32_V2: case AMDGPU::V_MOVRELD_B32_V4: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index a13c8f32fe6..70ad847fc5e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -137,6 +137,20 @@ def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> { let mayStore = 0; } +// Invert the exec mask and overwrite the inactive lanes of dst with inactive, +// restoring it after we're done. 
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32: $src, VSrc_b32:$inactive), + [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { + let Constraints = "$src = $vdst"; +} + +def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), + (ins VReg_64: $src, VSrc_b64:$inactive), + [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { + let Constraints = "$src = $vdst"; +} + let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 1a0f0f9aca9..8aa57ba7293 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -303,6 +303,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); + SmallVector<MachineInstr *, 4> SetInactiveInstrs; // We need to visit the basic blocks in reverse post-order so that we visit // defs before uses, in particular so that we don't accidentally mark an @@ -341,6 +342,23 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateWWM; LowerToCopyInstrs.push_back(&MI); continue; + } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || + Opcode == AMDGPU::V_SET_INACTIVE_B64) { + III.Disabled = StateWWM; + MachineOperand &Inactive = MI.getOperand(2); + if (Inactive.isReg()) { + if (Inactive.isUndef()) { + LowerToCopyInstrs.push_back(&MI); + } else { + unsigned Reg = Inactive.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + for (MachineInstr &DefMI : MRI->def_instructions(Reg)) + markInstruction(DefMI, StateWWM, Worklist); + } + } + } + SetInactiveInstrs.push_back(&MI); + continue; } else if (TII->isDisableWQM(MI)) { BBI.Needs |= StateExact; if 
(!(BBI.InNeeds & StateExact)) { @@ -380,6 +398,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } } + // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is + // ever used anywhere in the function. This implements the corresponding + // semantics of @llvm.amdgcn.set.inactive. + if (GlobalFlags & StateWQM) { + for (MachineInstr *MI : SetInactiveInstrs) + markInstruction(*MI, StateWQM, Worklist); + } + return GlobalFlags; } @@ -799,8 +825,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { } void SIWholeQuadMode::lowerCopyInstrs() { - for (MachineInstr *MI : LowerToCopyInstrs) + for (MachineInstr *MI : LowerToCopyInstrs) { + for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) + MI->RemoveOperand(i); MI->setDesc(TII->get(AMDGPU::COPY)); + } } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { |

