diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp | 158 |
6 files changed, 173 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index a55a1747caf..fbed51de0ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -156,6 +156,9 @@ extern char &SIWholeQuadModeID; void initializeSILowerControlFlowPass(PassRegistry &); extern char &SILowerControlFlowID; +void initializeSIRemoveShortExecBranchesPass(PassRegistry &); +extern char &SIRemoveShortExecBranchesID; + void initializeSIInsertSkipsPass(PassRegistry &); extern char &SIInsertSkipsPassID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c8dc6f6e3bf..eb30d659bf0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -228,6 +228,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); + initializeSIRemoveShortExecBranchesPass(*PR); initializeSIInsertSkipsPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); @@ -993,6 +994,7 @@ void GCNPassConfig::addPreEmitPass() { // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); + addPass(&SIRemoveShortExecBranchesID); addPass(&SIInsertSkipsPassID); addPass(&BranchRelaxationPassID); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 3ed35e57e54..0b8eb4b25ae 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen SIOptimizeExecMaskingPreRA.cpp SIPeepholeSDWA.cpp SIRegisterInfo.cpp + SIRemoveShortExecBranches.cpp SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 87e63fcc4a0..80c044ec00c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -41,7 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "si-insert-skips" static cl::opt<unsigned> SkipThresholdFlag( - "amdgpu-skip-threshold", + "amdgpu-skip-threshold-legacy", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden); @@ -466,6 +466,9 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; switch (MI.getOpcode()) { + case AMDGPU::S_CBRANCH_EXECZ: + ExecBranchStack.push_back(MI.getOperand(0).getMBB()); + break; case AMDGPU::SI_MASK_BRANCH: ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index bf052dc3c93..61d2719a3aa 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -244,9 +244,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); - // Insert a pseudo terminator to help keep the verifier happy. This will also - // be used later when inserting skips. - MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + // Insert the S_CBRANCH_EXECZ instruction which will be optimized later + // during SIRemoveShortExecBranches. + MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) .add(MI.getOperand(2)); if (!LIS) { @@ -323,8 +323,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { .addReg(DstReg); MachineInstr *Branch = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) - .addMBB(DestBB); + BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addMBB(DestBB); if (!LIS) { MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp new file mode 100644 index 00000000000..51779e97ac6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp @@ -0,0 +1,158 @@ +//===-- SIRemoveShortExecBranches.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass optmizes the s_cbranch_execz instructions. +/// The pass removes this skip instruction for short branches, +/// if there is no unwanted sideeffect in the fallthrough code sequence. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-remove-short-exec-branches" + +static unsigned SkipThreshold; + +static cl::opt<unsigned, true> SkipThresholdFlag( + "amdgpu-skip-threshold", cl::Hidden, + cl::desc( + "Number of instructions before jumping over divergent control flow"), + cl::location(SkipThreshold), cl::init(12)); + +namespace { + +class SIRemoveShortExecBranches : public MachineFunctionPass { +private: + const SIInstrInfo *TII = nullptr; + bool getBlockDestinations(MachineBasicBlock &SrcMBB, + MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, + SmallVectorImpl<MachineOperand> &Cond); + bool mustRetainExeczBranch(const MachineBasicBlock &From, + const MachineBasicBlock &To) const; + bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); + +public: + static char ID; + + SIRemoveShortExecBranches() : MachineFunctionPass(ID) { + initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE, + "SI remove short exec branches", false, false) + +char SIRemoveShortExecBranches::ID = 0; + +char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID; + +bool SIRemoveShortExecBranches::getBlockDestinations( + MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, + MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) { + if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + if (!FalseMBB) + FalseMBB = SrcMBB.getNextNode(); + + return true; +} + +bool SIRemoveShortExecBranches::mustRetainExeczBranch( + const MachineBasicBlock &From, const MachineBasicBlock &To) const { + unsigned NumInstr = 0; + const MachineFunction *MF = From.getParent(); + + for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + const MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken + // when EXEC = 0. We should skip the loop lest it becomes infinite. + if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || + I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) + return true; + + if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) + return true; + + // These instructions are potentially expensive even if EXEC = 0. + if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || + I->getOpcode() == AMDGPU::S_WAITCNT) + return true; + + ++NumInstr; + if (NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +// Returns true if the skip branch instruction is removed. +bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI, + MachineBasicBlock &SrcMBB) { + MachineBasicBlock *TrueMBB = nullptr; + MachineBasicBlock *FalseMBB = nullptr; + SmallVector<MachineOperand, 1> Cond; + + if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) + return false; + + // Consider only the forward branches. + if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || + mustRetainExeczBranch(*FalseMBB, *TrueMBB)) + return false; + + LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); + MI.eraseFromParent(); + SrcMBB.removeSuccessor(TrueMBB); + + return true; +} + +bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + TII = ST.getInstrInfo(); + MF.RenumberBlocks(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.end()) + continue; + + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { + case AMDGPU::S_CBRANCH_EXECZ: + Changed = removeExeczBranch(MI, MBB); + break; + default: + break; + } + } + + return Changed; +} |