diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 136 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 14 |
5 files changed, 139 insertions, 25 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 971cf3805ee..b964af49739 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1397,6 +1397,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_OR_B32)); break; + case AMDGPU::S_OR_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_OR_B64)); + break; + case AMDGPU::S_ANDN2_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -1889,6 +1895,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::SI_MASK_BRANCH: case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: + case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 934b50b87de..f67c0a20861 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -193,6 +193,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; +def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>; def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; } diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 78f409cd955..516b9bed63c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -55,6 +55,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -79,12 +80,16 @@ class SILowerControlFlow : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; - LiveIntervals *LIS = nullptr; MachineRegisterInfo *MRI = nullptr; + LiveIntervals *LIS = nullptr; + MachineDominatorTree *DT = nullptr; + MachineLoopInfo *MLI = nullptr; + const TargetRegisterClass *BoolRC = nullptr; unsigned AndOpc; unsigned OrOpc; + unsigned OrTermOpc; unsigned XorOpc; unsigned MovTermOpc; unsigned Andn2TermOpc; @@ -121,7 +126,7 @@ public: AU.addPreservedID(LiveVariablesID); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); - AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -249,7 +254,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + LIS->removeAllRegUnitsForPhysReg(Exec); MI.eraseFromParent(); // FIXME: Is there a better way of adjusting the liveness? It shouldn't be @@ -333,7 +338,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->createAndComputeVirtRegInterval(SaveReg); // Let this be recomputed. - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + LIS->removeAllRegUnitsForPhysReg(Exec); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -398,23 +403,99 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { MI.eraseFromParent(); } +// Insert \p Inst (which modifies exec) at \p InsPt in \p MBB, such that \p MBB +// is split as necessary to keep the exec modification in its own block. +static MachineBasicBlock *insertInstWithExecFallthrough(MachineBasicBlock &MBB, + MachineInstr &MI, + MachineInstr *NewMI, + MachineDominatorTree *DT, + LiveIntervals *LIS, + MachineLoopInfo *MLI) { + assert(NewMI->isTerminator()); + + MachineBasicBlock::iterator InsPt = MI.getIterator(); + if (std::next(MI.getIterator()) == MBB.end()) { + // Don't bother with a new block. + MBB.insert(InsPt, NewMI); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + MI.eraseFromParent(); + return &MBB; + } + + MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *SplitMBB + = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MachineFunction::iterator(MBB), SplitMBB); + + // FIXME: This is working around a MachineDominatorTree API defect. + // + // If a previous pass split a critical edge, it may not have been applied to + // the DomTree yet. applySplitCriticalEdges is lazily applied, and inspects + // the CFG of the given block. Make sure to call a dominator tree method that + // will flush this cache before touching the successors of the block. + MachineDomTreeNode *NodeMBB = nullptr; + if (DT) + NodeMBB = DT->getNode(&MBB); + + // Move everything to the new block, except the end_cf pseudo. + SplitMBB->splice(SplitMBB->begin(), &MBB, MBB.begin(), MBB.end()); + + SplitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(SplitMBB, BranchProbability::getOne()); + + MBB.insert(MBB.end(), NewMI); + + if (DT) { + std::vector<MachineDomTreeNode *> Children = NodeMBB->getChildren(); + DT->addNewBlock(SplitMBB, &MBB); + + // Reparent all of the children to the new block body. + auto *SplitNode = DT->getNode(SplitMBB); + for (auto *Child : Children) + DT->changeImmediateDominator(Child, SplitNode); + } + + if (MLI) { + if (MachineLoop *Loop = MLI->getLoopFor(&MBB)) + Loop->addBasicBlockToLoop(SplitMBB, MLI->getBase()); + } + + if (LIS) { + LIS->insertMBBInMaps(SplitMBB); + LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + } + + // All live-ins are forwarded. + for (auto &LiveIn : MBB.liveins()) + SplitMBB->addLiveIn(LiveIn); + + MI.eraseFromParent(); + return SplitMBB; +} + void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator InsPt = MBB.begin(); - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + // First, move the instruction. It's unnecessarily difficult to update + // LiveIntervals when there's a change in control flow, so move the + // instruction before changing the blocks. + MBB.splice(InsPt, &MBB, MI.getIterator()); if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *NewMI); + LIS->handleMove(MI); - MI.eraseFromParent(); + MachineFunction *MF = MBB.getParent(); - if (LIS) - LIS->handleMove(*NewMI); + // Create instruction without inserting it yet. + MachineInstr *NewMI + = BuildMI(*MF, DL, TII->get(OrTermOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); + insertInstWithExecFallthrough(MBB, MI, NewMI, DT, LIS, MLI); } // Returns replace operands for a logical operation, either single result @@ -436,7 +517,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, // A copy with implcitly defined exec inserted earlier is an exclusion, it // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) - if (I->modifiesRegister(AMDGPU::EXEC, TRI) && + if (I->modifiesRegister(Exec, TRI) && !(I->isCopy() && I->getOperand(0).getReg() != Exec)) return; @@ -480,12 +561,16 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); + DT = getAnalysisIfAvailable<MachineDominatorTree>(); + MLI = getAnalysisIfAvailable<MachineLoopInfo>(); + MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); if (ST.isWave32()) { AndOpc = AMDGPU::S_AND_B32; OrOpc = AMDGPU::S_OR_B32; + OrTermOpc = AMDGPU::S_OR_B32_term; XorOpc = AMDGPU::S_XOR_B32; MovTermOpc = AMDGPU::S_MOV_B32_term; Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; @@ -495,6 +580,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } else { AndOpc = AMDGPU::S_AND_B64; OrOpc = AMDGPU::S_OR_B64; + OrTermOpc = AMDGPU::S_OR_B64_term; XorOpc = AMDGPU::S_XOR_B64; MovTermOpc = AMDGPU::S_MOV_B64_term; Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; @@ -507,11 +593,11 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) { NextBB = std::next(BI); - MachineBasicBlock &MBB = *BI; + MachineBasicBlock *MBB = &*BI; MachineBasicBlock::iterator I, Next, Last; - for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) { + for (I = MBB->begin(), Last = MBB->end(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; @@ -532,10 +618,24 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { emitLoop(MI); break; - case AMDGPU::SI_END_CF: + case AMDGPU::SI_END_CF: { + MachineInstr *NextMI = nullptr; + + if (Next != MBB->end()) + NextMI = &*Next; + emitEndCf(MI); - break; + if (NextMI) { + MBB = NextMI->getParent(); + Next = NextMI->getIterator(); + Last = MBB->end(); + } + + NextBB = std::next(MBB->getIterator()); + BE = MF.end(); + break; + } case AMDGPU::S_AND_B64: case AMDGPU::S_OR_B64: case AMDGPU::S_AND_B32: @@ -551,7 +651,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } // Replay newly inserted code to combine masks - Next = (Last == MBB.end()) ? MBB.begin() : Last; + Next = (Last == MBB->end()) ? MBB->begin() : Last; } } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 3227bff2051..00675bb5798 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -202,6 +202,12 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { MI.setDesc(TII.get(AMDGPU::S_OR_B32)); return true; } + case AMDGPU::S_OR_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_OR_B64)); + return true; + } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 7e10316eab9..4b17cf4e632 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -82,14 +82,14 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { return new SIOptimizeExecMaskingPreRA(); } -static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI, - const GCNSubtarget &ST) { +static bool isEndCF(const MachineInstr &MI, const GCNSubtarget &ST, + const SIRegisterInfo *TRI) { if (ST.isWave32()) { - return MI.getOpcode() == AMDGPU::S_OR_B32 && + return MI.getOpcode() == AMDGPU::S_OR_B32_term && MI.modifiesRegister(AMDGPU::EXEC_LO, TRI); } - return MI.getOpcode() == AMDGPU::S_OR_B64 && + return MI.getOpcode() == AMDGPU::S_OR_B64_term && MI.modifiesRegister(AMDGPU::EXEC, TRI); } @@ -380,13 +380,13 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { // Try to collapse adjacent endifs. auto E = MBB.end(); - auto Lead = skipDebugInstructionsForward(MBB.begin(), E); - if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST)) + auto Lead = MBB.getFirstTerminator(); + if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, ST, TRI)) continue; MachineBasicBlock *TmpMBB = &MBB; auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead)); - if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) || + if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, ST, TRI) || !getOrExecSource(*NextLead, *TII, MRI, ST)) continue; |