Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp          2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp              6
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp            830
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp    75
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h      24
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt              1
6 files changed, 749 insertions, 189 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6d39c254c73..48cde90a972 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -817,8 +817,8 @@ bool GCNPassConfig::addILPOpts() {
 
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSILowerI1CopiesPass());
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5324cbc912d..809f5bab469 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+         TRI.hasVGPRs(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+         TRI.hasVGPRs(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e..eb038bb5d5f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type. Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
 //===----------------------------------------------------------------------===//
 //
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+// +//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "si-i1-copies" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPULaneDominator.h" -#include "llvm/CodeGen/LiveIntervals.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" +#define DEBUG_TYPE "si-i1-copies" + using namespace llvm; +static unsigned createLaneMaskReg(MachineFunction &MF); +static unsigned insertUndefLaneMask(MachineBasicBlock &MBB); + namespace { class SILowerI1Copies : public MachineFunctionPass { public: static char ID; +private: + MachineFunction *MF = nullptr; + MachineDominatorTree *DT = nullptr; + MachinePostDominatorTree *PDT = nullptr; + MachineRegisterInfo *MRI = nullptr; + const GCNSubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + + DenseSet<unsigned> ConstrainRegs; + public: SILowerI1Copies() : MachineFunctionPass(ID) { initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); @@ -47,14 +71,337 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } + +private: + void lowerCopiesFromI1(); + void lowerPhis(); + void lowerCopiesToI1(); + bool isConstantLaneMask(unsigned Reg, bool &Val) const; + void buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, unsigned PrevReg, unsigned CurReg); + MachineBasicBlock::iterator + getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + + bool isLaneMaskReg(unsigned Reg) const { + return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && + TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == + ST->getWavefrontSize(); + } +}; + +/// Helper class that determines the relationship between incoming values of a +/// phi in the control flow graph to determine where an incoming value can +/// simply be taken as a scalar lane mask as-is, and where it needs to be +/// merged with another, previously defined lane mask. +/// +/// The approach is as follows: +/// - Determine all basic blocks which, starting from the incoming blocks, +/// a wave may reach before entering the def block (the block containing the +/// phi). +/// - If an incoming block has no predecessors in this set, we can take the +/// incoming value as a scalar lane mask as-is. +/// -- A special case of this is when the def block has a self-loop. +/// - Otherwise, the incoming value needs to be merged with a previously +/// defined lane mask. +/// - If there is a path into the set of reachable blocks that does _not_ go +/// through an incoming block where we can take the scalar lane mask as-is, +/// we need to invent an available value for the SSAUpdater. Choices are +/// 0 and undef, with differing consequences for how to merge values etc. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. 
+/// +class PhiIncomingAnalysis { + MachinePostDominatorTree &PDT; + + // For each reachable basic block, whether it is a source in the induced + // subgraph of the CFG. + DenseMap<MachineBasicBlock *, bool> ReachableMap; + SmallVector<MachineBasicBlock *, 4> ReachableOrdered; + SmallVector<MachineBasicBlock *, 4> Stack; + SmallVector<MachineBasicBlock *, 4> Predecessors; + +public: + PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {} + + /// Returns whether \p MBB is a source in the induced subgraph of reachable + /// blocks. + bool isSource(MachineBasicBlock &MBB) const { + return ReachableMap.find(&MBB)->second; + } + + ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; } + + void analyze(MachineBasicBlock &DefBlock, + ArrayRef<MachineBasicBlock *> IncomingBlocks) { + assert(Stack.empty()); + ReachableMap.clear(); + ReachableOrdered.clear(); + Predecessors.clear(); + + // Insert the def block first, so that it acts as an end point for the + // traversal. + ReachableMap.try_emplace(&DefBlock, false); + ReachableOrdered.push_back(&DefBlock); + + for (MachineBasicBlock *MBB : IncomingBlocks) { + if (MBB == &DefBlock) { + ReachableMap[&DefBlock] = true; // self-loop on DefBlock + continue; + } + + ReachableMap.try_emplace(MBB, false); + ReachableOrdered.push_back(MBB); + + // If this block has a divergent terminator and the def block is its + // post-dominator, the wave may first visit the other successors. + bool Divergent = false; + for (MachineInstr &MI : MBB->terminators()) { + if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || + MI.getOpcode() == AMDGPU::SI_IF || + MI.getOpcode() == AMDGPU::SI_ELSE || + MI.getOpcode() == AMDGPU::SI_LOOP) { + Divergent = true; + break; + } + } + + if (Divergent && PDT.dominates(&DefBlock, MBB)) { + for (MachineBasicBlock *Succ : MBB->successors()) + Stack.push_back(Succ); + } + } + + while (!Stack.empty()) { + MachineBasicBlock *MBB = Stack.pop_back_val(); + if (!ReachableMap.try_emplace(MBB, false).second) + continue; + ReachableOrdered.push_back(MBB); + + for (MachineBasicBlock *Succ : MBB->successors()) + Stack.push_back(Succ); + } + + for (MachineBasicBlock *MBB : ReachableOrdered) { + bool HaveReachablePred = false; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (ReachableMap.count(Pred)) { + HaveReachablePred = true; + } else { + Stack.push_back(Pred); + } + } + if (!HaveReachablePred) + ReachableMap[MBB] = true; + if (HaveReachablePred) { + for (MachineBasicBlock *UnreachablePred : Stack) { + if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end()) + Predecessors.push_back(UnreachablePred); + } + } + Stack.clear(); + } + } +}; + +/// Helper class that detects loops which require us to lower an i1 COPY into +/// bitwise manipulation. +/// +/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish +/// between loops with the same header. Consider this example: +/// +/// A-+-+ +/// | | | +/// B-+ | +/// | | +/// C---+ +/// +/// A is the header of a loop containing A, B, and C as far as LoopInfo is +/// concerned. However, an i1 COPY in B that is used in C must be lowered to +/// bitwise operations to combine results from different loop iterations when +/// B has a divergent branch (since by default we will compile this code such +/// that threads in a wave are merged at the entry of C). 
+/// +/// The following rule is implemented to determine whether bitwise operations +/// are required: use the bitwise lowering for a def in block B if a backward +/// edge to B is reachable without going through the nearest common +/// post-dominator of B and all uses of the def. +/// +/// TODO: This rule is conservative because it does not check whether the +/// relevant branches are actually divergent. +/// +/// The class is designed to cache the CFG traversal so that it can be re-used +/// for multiple defs within the same basic block. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. +/// +class LoopFinder { + MachineDominatorTree &DT; + MachinePostDominatorTree &PDT; + + // All visited / reachable block, tagged by level (level 0 is the def block, + // level 1 are all blocks reachable including but not going through the def + // block's IPDOM, etc.). + DenseMap<MachineBasicBlock *, unsigned> Visited; + + // Nearest common dominator of all visited blocks by level (level 0 is the + // def block). Used for seeding the SSAUpdater. + SmallVector<MachineBasicBlock *, 4> CommonDominators; + + // Post-dominator of all visited blocks. + MachineBasicBlock *VisitedPostDom = nullptr; + + // Level at which a loop was found: 0 is not possible; 1 = a backward edge is + // reachable without going through the IPDOM of the def block (if the IPDOM + // itself has an edge to the def block, the loop level is 2), etc. + unsigned FoundLoopLevel = ~0u; + + MachineBasicBlock *DefBlock = nullptr; + SmallVector<MachineBasicBlock *, 4> Stack; + SmallVector<MachineBasicBlock *, 4> NextLevel; + +public: + LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) + : DT(DT), PDT(PDT) {} + + void initialize(MachineBasicBlock &MBB) { + Visited.clear(); + CommonDominators.clear(); + Stack.clear(); + NextLevel.clear(); + VisitedPostDom = nullptr; + FoundLoopLevel = ~0u; + + DefBlock = &MBB; + } + + /// Check whether a backward edge can be reached without going through the + /// given \p PostDom of the def block. + /// + /// Return the level of \p PostDom if a loop was found, or 0 otherwise. + unsigned findLoop(MachineBasicBlock *PostDom) { + MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); + + if (!VisitedPostDom) + advanceLevel(); + + unsigned Level = 0; + while (PDNode->getBlock() != PostDom) { + if (PDNode->getBlock() == VisitedPostDom) + advanceLevel(); + PDNode = PDNode->getIDom(); + Level++; + if (FoundLoopLevel == Level) + return Level; + } + + return 0; + } + + /// Add undef values dominating the loop and the optionally given additional + /// blocks, so that the SSA updater doesn't have to search all the way to the + /// function entry. + void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, + ArrayRef<MachineBasicBlock *> Blocks = {}) { + assert(LoopLevel < CommonDominators.size()); + + MachineBasicBlock *Dom = CommonDominators[LoopLevel]; + for (MachineBasicBlock *MBB : Blocks) + Dom = DT.findNearestCommonDominator(Dom, MBB); + + if (!inLoopLevel(*Dom, LoopLevel, Blocks)) { + SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); + } else { + // The dominator is part of the loop or the given blocks, so add the + // undef value to unreachable predecessors instead. 
+ for (MachineBasicBlock *Pred : Dom->predecessors()) { + if (!inLoopLevel(*Pred, LoopLevel, Blocks)) + SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); + } + } + } + +private: + bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, + ArrayRef<MachineBasicBlock *> Blocks) const { + auto DomIt = Visited.find(&MBB); + if (DomIt != Visited.end() && DomIt->second <= LoopLevel) + return true; + + if (llvm::find(Blocks, &MBB) != Blocks.end()) + return true; + + return false; + } + + void advanceLevel() { + MachineBasicBlock *VisitedDom; + + if (!VisitedPostDom) { + VisitedPostDom = DefBlock; + VisitedDom = DefBlock; + Stack.push_back(DefBlock); + } else { + VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); + VisitedDom = CommonDominators.back(); + + for (unsigned i = 0; i < NextLevel.size();) { + if (PDT.dominates(VisitedPostDom, NextLevel[i])) { + Stack.push_back(NextLevel[i]); + + NextLevel[i] = NextLevel.back(); + NextLevel.pop_back(); + } else { + i++; + } + } + } + + unsigned Level = CommonDominators.size(); + while (!Stack.empty()) { + MachineBasicBlock *MBB = Stack.pop_back_val(); + if (!PDT.dominates(VisitedPostDom, MBB)) + NextLevel.push_back(MBB); + + Visited[MBB] = Level; + VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == DefBlock) { + if (MBB == VisitedPostDom) + FoundLoopLevel = std::min(FoundLoopLevel, Level + 1); + else + FoundLoopLevel = std::min(FoundLoopLevel, Level); + continue; + } + + if (Visited.try_emplace(Succ, ~0u).second) { + if (MBB == VisitedPostDom) + NextLevel.push_back(Succ); + else + Stack.push_back(Succ); + } + } + } + + CommonDominators.push_back(VisitedDom); + } }; } // End anonymous namespace. -INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, + false) char SILowerI1Copies::ID = 0; @@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { return new SILowerI1Copies(); } -bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { +static unsigned createLaneMaskReg(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); +} + +static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { + MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + unsigned UndefReg = createLaneMaskReg(MF); + BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), + UndefReg); + return UndefReg; +} - std::vector<unsigned> I1Defs; +/// Lower all instructions that def or use vreg_1 registers. +/// +/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can +/// occur around inline assembly. We do this first, before vreg_1 registers +/// are changed to scalar mask registers. +/// +/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before +/// all others, because phi lowering looks through copies and can therefore +/// often make copy lowering unnecessary. 
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { + MF = &TheMF; + MRI = &MF->getRegInfo(); + DT = &getAnalysis<MachineDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTree>(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + ST = &MF->getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; + lowerCopiesFromI1(); + lowerPhis(); + lowerCopiesToI1(); - if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - if (RC == &AMDGPU::VReg_1RegClass) - MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); - continue; - } + for (unsigned Reg : ConstrainRegs) + MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass); + ConstrainRegs.clear(); + + return true; +} +void SILowerI1Copies::lowerCopiesFromI1() { + SmallVector<MachineInstr *, 4> DeadCopies; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { if (MI.getOpcode() != AMDGPU::COPY) continue; - const MachineOperand &Dst = MI.getOperand(0); - const MachineOperand &Src = MI.getOperand(1); - - if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass) continue; - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + if (isLaneMaskReg(DstReg) || + (TargetRegisterInfo::isVirtualRegister(DstReg) && + MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass)) + continue; + // Copy into a 32-bit vector register. 
+ LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); - MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); - if (DstRC == &AMDGPU::VReg_1RegClass && - TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(Dst.getReg()); - - if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { - if (DefInst->getOperand(1).isImm()) { - I1Defs.push_back(Dst.getReg()); - - int64_t Val = DefInst->getOperand(1).getImm(); - assert(Val == 0 || Val == -1); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .add(Dst) - .addImm(Val); - MI.eraseFromParent(); - continue; + + assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32); + assert(!MI.getOperand(0).getSubReg()); + + ConstrainRegs.insert(SrcReg); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .addImm(-1) + .addReg(SrcReg); + DeadCopies.push_back(&MI); + } + + for (MachineInstr *MI : DeadCopies) + MI->eraseFromParent(); + DeadCopies.clear(); + } +} + +void SILowerI1Copies::lowerPhis() { + MachineSSAUpdater SSAUpdater(*MF); + LoopFinder LF(*DT, *PDT); + PhiIncomingAnalysis PIA(*PDT); + SmallVector<MachineInstr *, 4> DeadPhis; + SmallVector<MachineBasicBlock *, 4> IncomingBlocks; + SmallVector<unsigned, 4> IncomingRegs; + SmallVector<unsigned, 4> IncomingUpdated; + + for (MachineBasicBlock &MBB : *MF) { + LF.initialize(MBB); + + for (MachineInstr &MI : MBB.phis()) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + continue; + + LLVM_DEBUG(dbgs() << "Lower PHI: " << MI); + + MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + + // Collect incoming values. + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + assert(i + 1 < MI.getNumOperands()); + unsigned IncomingReg = MI.getOperand(i).getReg(); + MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB(); + MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); + + if (IncomingDef->getOpcode() == AMDGPU::COPY) { + IncomingReg = IncomingDef->getOperand(1).getReg(); + assert(isLaneMaskReg(IncomingReg)); + assert(!IncomingDef->getOperand(1).getSubReg()); + } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { + continue; + } else { + assert(IncomingDef->isPHI()); + } + + IncomingBlocks.push_back(IncomingMBB); + IncomingRegs.push_back(IncomingReg); + } + + // Phis in a loop that are observed outside the loop receive a simple but + // conservatively correct treatment. + MachineBasicBlock *PostDomBound = &MBB; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) { + PostDomBound = + PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); + } + + unsigned FoundLoopLevel = LF.findLoop(PostDomBound); + + SSAUpdater.Initialize(DstReg); + + if (FoundLoopLevel) { + LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks); + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + IncomingUpdated.push_back(createLaneMaskReg(*MF)); + SSAUpdater.AddAvailableValue(IncomingBlocks[i], + IncomingUpdated.back()); + } + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + buildMergeLaneMasks( + IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); + } + } else { + // The phi is not observed from outside a loop. Use a more accurate + // lowering. 
+ PIA.analyze(MBB, IncomingBlocks); + + for (MachineBasicBlock *MBB : PIA.predecessors()) + SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB)); + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + if (PIA.isSource(IMBB)) { + IncomingUpdated.push_back(0); + SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]); + } else { + IncomingUpdated.push_back(createLaneMaskReg(*MF)); + SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back()); } } - unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc) - .add(Src); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .add(Dst) - .addImm(0) - .addImm(-1) - .addReg(TmpSrc); - MI.eraseFromParent(); - } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && - SrcRC == &AMDGPU::VReg_1RegClass) { - if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 && - DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() && - DefInst->getOperand(1).getImm() == 0 && - DefInst->getOperand(2).getImm() != 0 && - DefInst->getOperand(3).isReg() && - TargetRegisterInfo::isVirtualRegister( - DefInst->getOperand(3).getReg()) && - TRI->getCommonSubClass( - MRI.getRegClass(DefInst->getOperand(3).getReg()), - &AMDGPU::SGPR_64RegClass) && - AMDGPU::laneDominates(DefInst->getParent(), &MBB)) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) - .add(Dst) - .addReg(AMDGPU::EXEC) - .add(DefInst->getOperand(3)); - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) - .add(Dst) - .add(Src) - .addImm(0); + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + if (!IncomingUpdated[i]) + continue; + + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + buildMergeLaneMasks( + IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); } - MI.eraseFromParent(); + } + + unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB); + if (NewReg != DstReg) { + MRI->replaceRegWith(NewReg, DstReg); + + // Ensure that DstReg has a single def and mark the old PHI node for + // deletion. 
+ MI.getOperand(0).setReg(NewReg); + DeadPhis.push_back(&MI); + } + + IncomingBlocks.clear(); + IncomingRegs.clear(); + IncomingUpdated.clear(); + } + + for (MachineInstr *MI : DeadPhis) + MI->eraseFromParent(); + DeadPhis.clear(); + } +} + +void SILowerI1Copies::lowerCopiesToI1() { + MachineSSAUpdater SSAUpdater(*MF); + LoopFinder LF(*DT, *PDT); + SmallVector<MachineInstr *, 4> DeadCopies; + + for (MachineBasicBlock &MBB : *MF) { + LF.initialize(MBB); + + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF && + MI.getOpcode() != AMDGPU::COPY) + continue; + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg) || + MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + continue; + + if (MRI->use_empty(DstReg)) { + DeadCopies.push_back(&MI); + continue; + } + + LLVM_DEBUG(dbgs() << "Lower Other: " << MI); + + MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) + continue; + + DebugLoc DL = MI.getDebugLoc(); + unsigned SrcReg = MI.getOperand(1).getReg(); + assert(!MI.getOperand(1).getSubReg()); + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !isLaneMaskReg(SrcReg)) { + assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); + unsigned TmpReg = createLaneMaskReg(*MF); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) + .addReg(SrcReg) + .addImm(0); + MI.getOperand(1).setReg(TmpReg); + SrcReg = TmpReg; + } + + // Defs in a loop that are observed outside the loop must be transformed + // into appropriate bit manipulation. + MachineBasicBlock *PostDomBound = &MBB; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) { + PostDomBound = + PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); + } + + unsigned FoundLoopLevel = LF.findLoop(PostDomBound); + if (FoundLoopLevel) { + SSAUpdater.Initialize(DstReg); + SSAUpdater.AddAvailableValue(&MBB, DstReg); + LF.addLoopEntries(FoundLoopLevel, SSAUpdater); + + buildMergeLaneMasks(MBB, MI, DL, DstReg, + SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); + DeadCopies.push_back(&MI); } } + + for (MachineInstr *MI : DeadCopies) + MI->eraseFromParent(); + DeadCopies.clear(); } +} - for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); +bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { + const MachineInstr *MI; + for (;;) { + MI = MRI->getUniqueVRegDef(Reg); + if (MI->getOpcode() != AMDGPU::COPY) + break; + + Reg = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return false; + if (!isLaneMaskReg(Reg)) + return false; + } + + if (MI->getOpcode() != AMDGPU::S_MOV_B64) + return false; + + if (!MI->getOperand(1).isImm()) + return false; + + int64_t Imm = MI->getOperand(1).getImm(); + if (Imm == 0) { + Val = false; + return true; + } + if (Imm == -1) { + Val = true; + return true; + } return false; } + +static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { + Def = false; + Use = false; + + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == AMDGPU::SCC) { + if (MO.isUse()) + Use = true; + else + Def = true; + } + } +} + +/// Return a point at the end of the given \p MBB to insert SALU instructions +/// for lane mask calculation. Take terminators and SCC into account. 
+MachineBasicBlock::iterator +SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { + auto InsertionPt = MBB.getFirstTerminator(); + bool TerminatorsUseSCC = false; + for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) { + bool DefsSCC; + instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC); + if (TerminatorsUseSCC || DefsSCC) + break; + } + + if (!TerminatorsUseSCC) + return InsertionPt; + + while (InsertionPt != MBB.begin()) { + InsertionPt--; + + bool DefSCC, UseSCC; + instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC); + if (DefSCC) + return InsertionPt; + } + + // We should have at least seen an IMPLICIT_DEF or COPY + llvm_unreachable("SCC used by terminator but no def in block"); +} + +void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + unsigned PrevReg, unsigned CurReg) { + bool PrevVal; + bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal); + bool CurVal; + bool CurConstant = isConstantLaneMask(CurReg, CurVal); + + if (PrevConstant && CurConstant) { + if (PrevVal == CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); + } else if (CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg) + .addReg(AMDGPU::EXEC) + .addImm(-1); + } + return; + } + + unsigned PrevMaskedReg = 0; + unsigned CurMaskedReg = 0; + if (!PrevConstant) { + if (CurConstant && CurVal) { + PrevMaskedReg = PrevReg; + } else { + PrevMaskedReg = createLaneMaskReg(*MF); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg) + .addReg(PrevReg) + .addReg(AMDGPU::EXEC); + } + } + if (!CurConstant) { + // TODO: check whether CurReg is already masked by EXEC + if (PrevConstant && PrevVal) { + CurMaskedReg = CurReg; + } else { + CurMaskedReg = createLaneMaskReg(*MF); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg) + .addReg(CurReg) + .addReg(AMDGPU::EXEC); + } + } + + if (PrevConstant && !PrevVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) + .addReg(CurMaskedReg); + } else if (CurConstant && !CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) + .addReg(PrevMaskedReg); + } else if (PrevConstant && PrevVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg) + .addReg(CurMaskedReg) + .addReg(AMDGPU::EXEC); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg) + .addReg(PrevMaskedReg) + .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC); + } +} diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp deleted file mode 100644 index 1924f71f11c..00000000000 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// MBB A lane-dominates MBB B if -// 1. A dominates B in the usual sense, i.e. every path from the entry to B -// goes through A, and -// 2. whenever B executes, every active lane during that execution of B was -// also active during the most recent execution of A. 
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-//     |
-//   +--+
-//   A  |
-//   +--+
-//     |
-//     B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
-  // Check whether A is reachable from itself without going through B.
-  DenseSet<MachineBasicBlock *> Reachable;
-  SmallVector<MachineBasicBlock *, 8> Stack;
-
-  Stack.push_back(A);
-  do {
-    MachineBasicBlock *MBB = Stack.back();
-    Stack.pop_back();
-
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (Succ == A)
-        return false;
-      if (Succ != B && Reachable.insert(Succ).second)
-        Stack.push_back(Succ);
-    }
-  } while (!Stack.empty());
-
-  return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364..00000000000
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e4682..01b80ebe8d3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
-  AMDGPULaneDominator.cpp
   )
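
Note (not part of the commit): in the general case where neither input is a known constant, buildMergeLaneMasks combines the previous and current lane masks with S_ANDN2_B64, S_AND_B64, and S_OR_B64 against EXEC. A minimal C++ sketch of that merge on plain 64-bit masks; the function and variable names are illustrative only:

    #include <cstdint>

    // DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
    uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
      uint64_t PrevMasked = Prev & ~Exec; // S_ANDN2_B64: keep currently inactive lanes
      uint64_t CurMasked = Cur & Exec;    // S_AND_B64: take currently active lanes
      return PrevMasked | CurMasked;      // S_OR_B64: merged lane mask
    }

The constant cases in buildMergeLaneMasks are just simplifications of this expression (e.g. Prev == 0 reduces it to Cur & EXEC, so only the S_AND_B64 is emitted).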
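
Note (not part of the commit): lowerCopiesFromI1 materializes an i1 lane mask in a 32-bit vector register with V_CNDMASK_B32_e64 0, -1, <mask>. A minimal C++ sketch of the per-lane effect; the names are illustrative only:

    #include <cstdint>

    // Each active lane reads -1 (all bits set) if its bit in the lane mask
    // is set, and 0 otherwise.
    uint32_t copyLaneFromI1(uint64_t LaneMask, unsigned Lane) {
      return ((LaneMask >> Lane) & 1) ? 0xFFFFFFFFu : 0u;
    }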