Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 830
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp | 75
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h | 24
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/add_i1.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll | 38
-rw-r--r--  llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-asm.ll | 9
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/loop_break.ll | 33
-rw-r--r--  llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 47
-rw-r--r--  llvm/test/CodeGen/AMDGPU/multilevel-break.ll | 94
-rw-r--r--  llvm/test/CodeGen/AMDGPU/select-opt.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sub_i1.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/valu-i1.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll | 2
21 files changed, 1013 insertions, 330 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 6d39c254c73..48cde90a972 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -817,8 +817,8 @@ bool GCNPassConfig::addILPOpts() {
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
- addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);
+ addPass(createSILowerI1CopiesPass());
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5324cbc912d..809f5bab469 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
- return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+ return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+ TRI.hasVGPRs(SrcRC);
}
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {
- return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+ return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+ TRI.hasVGPRs(DstRC);
}
static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e..eb038bb5d5f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type. Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
//===----------------------------------------------------------------------===//
//
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "si-i1-copies"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-i1-copies"
+
using namespace llvm;
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
namespace {
class SILowerI1Copies : public MachineFunctionPass {
public:
static char ID;
+private:
+ MachineFunction *MF = nullptr;
+ MachineDominatorTree *DT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+
+ DenseSet<unsigned> ConstrainRegs;
+
public:
SILowerI1Copies() : MachineFunctionPass(ID) {
initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,14 +71,337 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ void lowerCopiesFromI1();
+ void lowerPhis();
+ void lowerCopiesToI1();
+ bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+ void buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+ MachineBasicBlock::iterator
+ getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+ bool isLaneMaskReg(unsigned Reg) const {
+ return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+ TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+ ST->getWavefrontSize();
+ }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+/// - Determine all basic blocks which, starting from the incoming blocks,
+/// a wave may reach before entering the def block (the block containing the
+/// phi).
+/// - If an incoming block has no predecessors in this set, we can take the
+/// incoming value as a scalar lane mask as-is.
+/// -- A special case of this is when the def block has a self-loop.
+/// - Otherwise, the incoming value needs to be merged with a previously
+/// defined lane mask.
+/// - If there is a path into the set of reachable blocks that does _not_ go
+/// through an incoming block where we can take the scalar lane mask as-is,
+/// we need to invent an available value for the SSAUpdater. Choices are
+/// 0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+/// the traversal.
+///
+class PhiIncomingAnalysis {
+ MachinePostDominatorTree &PDT;
+
+ // For each reachable basic block, whether it is a source in the induced
+ // subgraph of the CFG.
+ DenseMap<MachineBasicBlock *, bool> ReachableMap;
+ SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+ PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+ /// Returns whether \p MBB is a source in the induced subgraph of reachable
+ /// blocks.
+ bool isSource(MachineBasicBlock &MBB) const {
+ return ReachableMap.find(&MBB)->second;
+ }
+
+ ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+ void analyze(MachineBasicBlock &DefBlock,
+ ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+ assert(Stack.empty());
+ ReachableMap.clear();
+ ReachableOrdered.clear();
+ Predecessors.clear();
+
+ // Insert the def block first, so that it acts as an end point for the
+ // traversal.
+ ReachableMap.try_emplace(&DefBlock, false);
+ ReachableOrdered.push_back(&DefBlock);
+
+ for (MachineBasicBlock *MBB : IncomingBlocks) {
+ if (MBB == &DefBlock) {
+ ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+ continue;
+ }
+
+ ReachableMap.try_emplace(MBB, false);
+ ReachableOrdered.push_back(MBB);
+
+ // If this block has a divergent terminator and the def block is its
+ // post-dominator, the wave may first visit the other successors.
+ bool Divergent = false;
+ for (MachineInstr &MI : MBB->terminators()) {
+ if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+ MI.getOpcode() == AMDGPU::SI_IF ||
+ MI.getOpcode() == AMDGPU::SI_ELSE ||
+ MI.getOpcode() == AMDGPU::SI_LOOP) {
+ Divergent = true;
+ break;
+ }
+ }
+
+ if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+ for (MachineBasicBlock *Succ : MBB->successors())
+ Stack.push_back(Succ);
+ }
+ }
+
+ while (!Stack.empty()) {
+ MachineBasicBlock *MBB = Stack.pop_back_val();
+ if (!ReachableMap.try_emplace(MBB, false).second)
+ continue;
+ ReachableOrdered.push_back(MBB);
+
+ for (MachineBasicBlock *Succ : MBB->successors())
+ Stack.push_back(Succ);
+ }
+
+ for (MachineBasicBlock *MBB : ReachableOrdered) {
+ bool HaveReachablePred = false;
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (ReachableMap.count(Pred)) {
+ HaveReachablePred = true;
+ } else {
+ Stack.push_back(Pred);
+ }
+ }
+ if (!HaveReachablePred)
+ ReachableMap[MBB] = true;
+ if (HaveReachablePred) {
+ for (MachineBasicBlock *UnreachablePred : Stack) {
+ if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+ Predecessors.push_back(UnreachablePred);
+ }
+ }
+ Stack.clear();
+ }
+ }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+/// A-+-+
+/// | | |
+/// B-+ |
+/// | |
+/// C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+/// relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+/// the traversal.
+///
+class LoopFinder {
+ MachineDominatorTree &DT;
+ MachinePostDominatorTree &PDT;
+
+  // All visited / reachable blocks, tagged by level (level 0 is the def block,
+ // level 1 are all blocks reachable including but not going through the def
+ // block's IPDOM, etc.).
+ DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+ // Nearest common dominator of all visited blocks by level (level 0 is the
+ // def block). Used for seeding the SSAUpdater.
+ SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+ // Post-dominator of all visited blocks.
+ MachineBasicBlock *VisitedPostDom = nullptr;
+
+ // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+ // reachable without going through the IPDOM of the def block (if the IPDOM
+ // itself has an edge to the def block, the loop level is 2), etc.
+ unsigned FoundLoopLevel = ~0u;
+
+ MachineBasicBlock *DefBlock = nullptr;
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+ LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+ : DT(DT), PDT(PDT) {}
+
+ void initialize(MachineBasicBlock &MBB) {
+ Visited.clear();
+ CommonDominators.clear();
+ Stack.clear();
+ NextLevel.clear();
+ VisitedPostDom = nullptr;
+ FoundLoopLevel = ~0u;
+
+ DefBlock = &MBB;
+ }
+
+ /// Check whether a backward edge can be reached without going through the
+ /// given \p PostDom of the def block.
+ ///
+ /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+ unsigned findLoop(MachineBasicBlock *PostDom) {
+ MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+ if (!VisitedPostDom)
+ advanceLevel();
+
+ unsigned Level = 0;
+ while (PDNode->getBlock() != PostDom) {
+ if (PDNode->getBlock() == VisitedPostDom)
+ advanceLevel();
+ PDNode = PDNode->getIDom();
+ Level++;
+ if (FoundLoopLevel == Level)
+ return Level;
+ }
+
+ return 0;
+ }
+
+ /// Add undef values dominating the loop and the optionally given additional
+ /// blocks, so that the SSA updater doesn't have to search all the way to the
+ /// function entry.
+ void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+ ArrayRef<MachineBasicBlock *> Blocks = {}) {
+ assert(LoopLevel < CommonDominators.size());
+
+ MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+ for (MachineBasicBlock *MBB : Blocks)
+ Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+ if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+ SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+ } else {
+ // The dominator is part of the loop or the given blocks, so add the
+ // undef value to unreachable predecessors instead.
+ for (MachineBasicBlock *Pred : Dom->predecessors()) {
+ if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+ SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+ }
+ }
+ }
+
+private:
+ bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+ ArrayRef<MachineBasicBlock *> Blocks) const {
+ auto DomIt = Visited.find(&MBB);
+ if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+ return true;
+
+ if (llvm::find(Blocks, &MBB) != Blocks.end())
+ return true;
+
+ return false;
+ }
+
+ void advanceLevel() {
+ MachineBasicBlock *VisitedDom;
+
+ if (!VisitedPostDom) {
+ VisitedPostDom = DefBlock;
+ VisitedDom = DefBlock;
+ Stack.push_back(DefBlock);
+ } else {
+ VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+ VisitedDom = CommonDominators.back();
+
+ for (unsigned i = 0; i < NextLevel.size();) {
+ if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+ Stack.push_back(NextLevel[i]);
+
+ NextLevel[i] = NextLevel.back();
+ NextLevel.pop_back();
+ } else {
+ i++;
+ }
+ }
+ }
+
+ unsigned Level = CommonDominators.size();
+ while (!Stack.empty()) {
+ MachineBasicBlock *MBB = Stack.pop_back_val();
+ if (!PDT.dominates(VisitedPostDom, MBB))
+ NextLevel.push_back(MBB);
+
+ Visited[MBB] = Level;
+ VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == DefBlock) {
+ if (MBB == VisitedPostDom)
+ FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+ else
+ FoundLoopLevel = std::min(FoundLoopLevel, Level);
+ continue;
+ }
+
+ if (Visited.try_emplace(Succ, ~0u).second) {
+ if (MBB == VisitedPostDom)
+ NextLevel.push_back(Succ);
+ else
+ Stack.push_back(Succ);
+ }
+ }
+ }
+
+ CommonDominators.push_back(VisitedDom);
+ }
};
} // End anonymous namespace.
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+ false)
char SILowerI1Copies::ID = 0;
@@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
return new SILowerI1Copies();
}
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+ MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+ unsigned UndefReg = createLaneMaskReg(MF);
+ BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+ UndefReg);
+ return UndefReg;
+}
- std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+ MF = &TheMF;
+ MRI = &MF->getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ ST = &MF->getSubtarget<GCNSubtarget>();
+ TII = ST->getInstrInfo();
- MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
+ lowerCopiesFromI1();
+ lowerPhis();
+ lowerCopiesToI1();
- if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
- unsigned Reg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
- if (RC == &AMDGPU::VReg_1RegClass)
- MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
- continue;
- }
+ for (unsigned Reg : ConstrainRegs)
+ MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+ ConstrainRegs.clear();
+
+ return true;
+}
+void SILowerI1Copies::lowerCopiesFromI1() {
+ SmallVector<MachineInstr *, 4> DeadCopies;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
- const MachineOperand &Dst = MI.getOperand(0);
- const MachineOperand &Src = MI.getOperand(1);
-
- if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
continue;
- const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
- const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+ if (isLaneMaskReg(DstReg) ||
+ (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+ continue;
+ // Copy into a 32-bit vector register.
+ LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
DebugLoc DL = MI.getDebugLoc();
- MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
- if (DstRC == &AMDGPU::VReg_1RegClass &&
- TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
- I1Defs.push_back(Dst.getReg());
-
- if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
- if (DefInst->getOperand(1).isImm()) {
- I1Defs.push_back(Dst.getReg());
-
- int64_t Val = DefInst->getOperand(1).getImm();
- assert(Val == 0 || Val == -1);
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
- .add(Dst)
- .addImm(Val);
- MI.eraseFromParent();
- continue;
+
+ assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+ assert(!MI.getOperand(0).getSubReg());
+
+ ConstrainRegs.insert(SrcReg);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addImm(0)
+ .addImm(-1)
+ .addReg(SrcReg);
+ DeadCopies.push_back(&MI);
+ }
+
+ for (MachineInstr *MI : DeadCopies)
+ MI->eraseFromParent();
+ DeadCopies.clear();
+ }
+}
+
+void SILowerI1Copies::lowerPhis() {
+ MachineSSAUpdater SSAUpdater(*MF);
+ LoopFinder LF(*DT, *PDT);
+ PhiIncomingAnalysis PIA(*PDT);
+ SmallVector<MachineInstr *, 4> DeadPhis;
+ SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+ SmallVector<unsigned, 4> IncomingRegs;
+ SmallVector<unsigned, 4> IncomingUpdated;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ LF.initialize(MBB);
+
+ for (MachineInstr &MI : MBB.phis()) {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+ // Collect incoming values.
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ assert(i + 1 < MI.getNumOperands());
+ unsigned IncomingReg = MI.getOperand(i).getReg();
+ MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+ MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+ if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+ IncomingReg = IncomingDef->getOperand(1).getReg();
+ assert(isLaneMaskReg(IncomingReg));
+ assert(!IncomingDef->getOperand(1).getSubReg());
+ } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+ continue;
+ } else {
+ assert(IncomingDef->isPHI());
+ }
+
+ IncomingBlocks.push_back(IncomingMBB);
+ IncomingRegs.push_back(IncomingReg);
+ }
+
+ // Phis in a loop that are observed outside the loop receive a simple but
+ // conservatively correct treatment.
+ MachineBasicBlock *PostDomBound = &MBB;
+ for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+ PostDomBound =
+ PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+ }
+
+ unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+ SSAUpdater.Initialize(DstReg);
+
+ if (FoundLoopLevel) {
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ IncomingUpdated.push_back(createLaneMaskReg(*MF));
+ SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+ IncomingUpdated.back());
+ }
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ buildMergeLaneMasks(
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+ }
+ } else {
+ // The phi is not observed from outside a loop. Use a more accurate
+ // lowering.
+ PIA.analyze(MBB, IncomingBlocks);
+
+ for (MachineBasicBlock *MBB : PIA.predecessors())
+ SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ if (PIA.isSource(IMBB)) {
+ IncomingUpdated.push_back(0);
+ SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+ } else {
+ IncomingUpdated.push_back(createLaneMaskReg(*MF));
+ SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
}
}
- unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
- .add(Src);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .add(Dst)
- .addImm(0)
- .addImm(-1)
- .addReg(TmpSrc);
- MI.eraseFromParent();
- } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
- SrcRC == &AMDGPU::VReg_1RegClass) {
- if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
- DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
- DefInst->getOperand(1).getImm() == 0 &&
- DefInst->getOperand(2).getImm() != 0 &&
- DefInst->getOperand(3).isReg() &&
- TargetRegisterInfo::isVirtualRegister(
- DefInst->getOperand(3).getReg()) &&
- TRI->getCommonSubClass(
- MRI.getRegClass(DefInst->getOperand(3).getReg()),
- &AMDGPU::SGPR_64RegClass) &&
- AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
- .add(Dst)
- .addReg(AMDGPU::EXEC)
- .add(DefInst->getOperand(3));
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
- .add(Dst)
- .add(Src)
- .addImm(0);
+ for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+ if (!IncomingUpdated[i])
+ continue;
+
+ MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ buildMergeLaneMasks(
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
}
- MI.eraseFromParent();
+ }
+
+ unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+ if (NewReg != DstReg) {
+ MRI->replaceRegWith(NewReg, DstReg);
+
+ // Ensure that DstReg has a single def and mark the old PHI node for
+ // deletion.
+ MI.getOperand(0).setReg(NewReg);
+ DeadPhis.push_back(&MI);
+ }
+
+ IncomingBlocks.clear();
+ IncomingRegs.clear();
+ IncomingUpdated.clear();
+ }
+
+ for (MachineInstr *MI : DeadPhis)
+ MI->eraseFromParent();
+ DeadPhis.clear();
+ }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+ MachineSSAUpdater SSAUpdater(*MF);
+ LoopFinder LF(*DT, *PDT);
+ SmallVector<MachineInstr *, 4> DeadCopies;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ LF.initialize(MBB);
+
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+ MI.getOpcode() != AMDGPU::COPY)
+ continue;
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+ MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+ continue;
+
+ if (MRI->use_empty(DstReg)) {
+ DeadCopies.push_back(&MI);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+ if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+ continue;
+
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ assert(!MI.getOperand(1).getSubReg());
+
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !isLaneMaskReg(SrcReg)) {
+ assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+ unsigned TmpReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+ .addReg(SrcReg)
+ .addImm(0);
+ MI.getOperand(1).setReg(TmpReg);
+ SrcReg = TmpReg;
+ }
+
+ // Defs in a loop that are observed outside the loop must be transformed
+ // into appropriate bit manipulation.
+ MachineBasicBlock *PostDomBound = &MBB;
+ for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+ PostDomBound =
+ PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+ }
+
+ unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+ if (FoundLoopLevel) {
+ SSAUpdater.Initialize(DstReg);
+ SSAUpdater.AddAvailableValue(&MBB, DstReg);
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+ buildMergeLaneMasks(MBB, MI, DL, DstReg,
+ SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+ DeadCopies.push_back(&MI);
}
}
+
+ for (MachineInstr *MI : DeadCopies)
+ MI->eraseFromParent();
+ DeadCopies.clear();
}
+}
- for (unsigned Reg : I1Defs)
- MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+ const MachineInstr *MI;
+ for (;;) {
+ MI = MRI->getUniqueVRegDef(Reg);
+ if (MI->getOpcode() != AMDGPU::COPY)
+ break;
+
+ Reg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (!isLaneMaskReg(Reg))
+ return false;
+ }
+
+ if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+ return false;
+
+ if (!MI->getOperand(1).isImm())
+ return false;
+
+ int64_t Imm = MI->getOperand(1).getImm();
+ if (Imm == 0) {
+ Val = false;
+ return true;
+ }
+ if (Imm == -1) {
+ Val = true;
+ return true;
+ }
return false;
}
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+ Def = false;
+ Use = false;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+ if (MO.isUse())
+ Use = true;
+ else
+ Def = true;
+ }
+ }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+ auto InsertionPt = MBB.getFirstTerminator();
+ bool TerminatorsUseSCC = false;
+ for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+ bool DefsSCC;
+ instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+ if (TerminatorsUseSCC || DefsSCC)
+ break;
+ }
+
+ if (!TerminatorsUseSCC)
+ return InsertionPt;
+
+ while (InsertionPt != MBB.begin()) {
+ InsertionPt--;
+
+ bool DefSCC, UseSCC;
+ instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+ if (DefSCC)
+ return InsertionPt;
+ }
+
+ // We should have at least seen an IMPLICIT_DEF or COPY
+ llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ unsigned PrevReg, unsigned CurReg) {
+ bool PrevVal;
+ bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+ bool CurVal;
+ bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+ if (PrevConstant && CurConstant) {
+ if (PrevVal == CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+ } else if (CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addImm(-1);
+ }
+ return;
+ }
+
+ unsigned PrevMaskedReg = 0;
+ unsigned CurMaskedReg = 0;
+ if (!PrevConstant) {
+ if (CurConstant && CurVal) {
+ PrevMaskedReg = PrevReg;
+ } else {
+ PrevMaskedReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+ .addReg(PrevReg)
+ .addReg(AMDGPU::EXEC);
+ }
+ }
+ if (!CurConstant) {
+ // TODO: check whether CurReg is already masked by EXEC
+ if (PrevConstant && PrevVal) {
+ CurMaskedReg = CurReg;
+ } else {
+ CurMaskedReg = createLaneMaskReg(*MF);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+ .addReg(CurReg)
+ .addReg(AMDGPU::EXEC);
+ }
+ }
+
+ if (PrevConstant && !PrevVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+ .addReg(CurMaskedReg);
+ } else if (CurConstant && !CurVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+ .addReg(PrevMaskedReg);
+ } else if (PrevConstant && PrevVal) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+ .addReg(CurMaskedReg)
+ .addReg(AMDGPU::EXEC);
+ } else {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+ .addReg(PrevMaskedReg)
+ .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+ }
+}
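
The merge emitted by buildMergeLaneMasks above is the heart of the new lowering: Dst = (Prev & ~EXEC) | (Cur & EXEC), so lanes outside EXEC keep their previous i1 value while active lanes take the newly computed one, with the S_ANDN2_B64/S_AND_B64 operands folded away when either input is a constant mask. The standalone C++ sketch below is an illustrative aside rather than code from the patch; the mergeLaneMasks helper and the 16-lane example values are invented for the example. It models the bitwise semantics on plain 64-bit integers, including the accumulation across loop iterations that tests such as i1-copy-from-loop.ll now check.

// Wave-level model of the lane-mask merge; each bit of a uint64_t stands
// for one lane and "Exec" is the set of currently active lanes.
#include <cassert>
#include <cstdint>

// Inactive lanes keep the old value, active lanes take the new one. This is
// the S_ANDN2_B64 / S_AND_B64 / S_OR_B64 sequence in the general case.
static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  // Two iterations of a loop-carried i1 accumulator: some lanes leave the
  // loop after the first iteration, so their bit must be preserved while the
  // remaining lanes update theirs.
  uint64_t Accum = 0;          // initial value of the i1 phi
  uint64_t Exec = 0xFFFFull;   // 16 lanes active on iteration 0

  uint64_t CC0 = 0x00FFull;    // per-lane condition, iteration 0
  Accum = mergeLaneMasks(Accum, CC0, Exec);

  Exec = 0xF000ull;            // only 4 lanes remain in the loop
  uint64_t CC1 = 0x3000ull;    // per-lane condition, iteration 1
  Accum = mergeLaneMasks(Accum, CC1, Exec);

  // Lanes that exited after iteration 0 still report their old result.
  assert(Accum == 0x30FFull);
  return 0;
}
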
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
deleted file mode 100644
index 1924f71f11c..00000000000
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-// goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-// also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-// |
-// +--+
-// A |
-// +--+
-// |
-// B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
- // Check whether A is reachable from itself without going through B.
- DenseSet<MachineBasicBlock *> Reachable;
- SmallVector<MachineBasicBlock *, 8> Stack;
-
- Stack.push_back(A);
- do {
- MachineBasicBlock *MBB = Stack.back();
- Stack.pop_back();
-
- for (MachineBasicBlock *Succ : MBB->successors()) {
- if (Succ == A)
- return false;
- if (Succ != B && Reachable.insert(Succ).second)
- Stack.push_back(Succ);
- }
- } while (!Stack.empty());
-
- return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364..00000000000
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e4682..01b80ebe8d3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
AMDGPUBaseInfo.cpp
AMDKernelCodeTUtils.cpp
AMDGPUAsmUtils.cpp
- AMDGPULaneDominator.cpp
)
diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll
index fb3b69ca3bd..c5f7e3af5e3 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
}
; GCN-LABEL: {{^}}add_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 84a2d3d3a7b..ae78a1ecf32 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}i1_copy_from_loop:
;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI: [[ENDIF]]:
-; SI-NOT: [[VREG]]
-; SI: ; %for.end
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
entry:
br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
new file mode 100644
index 00000000000..0aacbbfda18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_dont_clobber_scc:
+
+; GCN: ; %entry
+; GCN: s_cmp_eq_u32 s0, 0
+; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]]
+
+; GCN: ; %blocka
+; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1
+; GCN: s_cmp_eq_u32 s1, 0
+; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]]
+
+; GCN: [[PREEXIT]]:
+; GCN: [[EXIT]]:
+
+define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
+entry:
+ %cc.uni = icmp eq i32 %uni, 0
+ br i1 %cc.uni, label %exit, label %blocka
+
+blocka:
+ call void asm sideeffect "; dummy a", ""()
+ %cc.uni2 = icmp eq i32 %uni2, 0
+ br i1 %cc.uni2, label %exit, label %blockb
+
+blockb:
+ call void asm sideeffect "; dummy b", ""()
+ br label %exit
+
+exit:
+ %cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
+ call void asm sideeffect "; dummy exit", ""()
+ %r = select i1 %cc.phi, float 1.0, float 2.0
+ ret float %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 63a9f1feb6d..5b25271ce17 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI: s_mov_b64 [[TMP]], exec
+
+; SI: ; %bb3
+; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
%tidig = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index a0563cdd319..9615efaaa93 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,7 +198,8 @@ entry:
}
; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
; CHECK: ; use v0
define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
entry:
@@ -212,10 +213,14 @@ entry:
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
define amdgpu_kernel void @i1_input_phys_vgpr() {
entry:
%val = load i1, i1 addrspace(1)* undef
- call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+ %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+ store i1 %cc, i1 addrspace(1)* undef
ret void
}
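
The new checks above exercise both directions of the lowering around inline assembly: a copy from an i1 lane mask into a 32-bit VGPR becomes V_CNDMASK_B32 (each lane reads 0 or -1 according to its mask bit), and a copy of a VGPR back into an i1 becomes V_CMP_NE_U32 against zero. The short C++ model below is an illustrative aside, not part of the patch; the helper names and the 4-lane example are assumptions made for the illustration.

#include <cassert>
#include <cstdint>
#include <vector>

// Copy *from* i1: V_CNDMASK_B32_e64 dst, 0, -1, mask selects per lane.
static std::vector<int32_t> cndmaskFromLaneMask(uint64_t Mask, unsigned Lanes) {
  std::vector<int32_t> VGPR(Lanes);
  for (unsigned L = 0; L < Lanes; ++L)
    VGPR[L] = ((Mask >> L) & 1) ? -1 : 0;
  return VGPR;
}

// Copy *to* i1: V_CMP_NE_U32 against 0 rebuilds the lane mask.
static uint64_t laneMaskFromVGPR(const std::vector<int32_t> &VGPR) {
  uint64_t Mask = 0;
  for (unsigned L = 0; L < VGPR.size(); ++L)
    if (VGPR[L] != 0)
      Mask |= 1ull << L;
  return Mask;
}

int main() {
  uint64_t Mask = 0xAull; // lanes 1 and 3 are true
  std::vector<int32_t> V = cndmaskFromLaneMask(Mask, 4);
  assert(V[0] == 0 && V[1] == -1 && V[2] == 0 && V[3] == -1);
  assert(laneMaskFromVGPR(V) == Mask); // the round trip is lossless
  return 0;
}
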
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 34b842d8436..63c1556212d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; FIXME: Enable for VI.
@@ -144,20 +144,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
}
; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: ; %entry
+; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI: s_mov_b64 vcc, 0
+; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
+; SI: ; %bb
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
+; SI: s_and_b64 vcc, vcc, exec
+
+; SI: ; %exit
+; SI: s_or_b64 exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: buffer_store_dword
+; SI: s_endpgm
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll
index 576950188d3..f37b3a3637a 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_break.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll
@@ -22,23 +22,28 @@
; OPT: bb9:
; OPT: call void @llvm.amdgcn.end.cf(i64
-; TODO: Can remove exec fixes in return block
; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
-
-; GCN: [[FLOW]]:
-; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN: v_cmp_lt_i32_e32 vcc, -1
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %bb4
+; GCN: buffer_load_dword
+; GCN: v_cmp_ge_i32_e32 vcc,
+; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
; GCN: ; %bb.4: ; %bb9
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index fbdf9832b29..679fd7c9870 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
+; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: v_cmp_lt_i32_e32 vcc, 1,
+; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
+; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: ; %LeafBlock
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
+; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]
; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64 exec, exec,
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN: buffer_store_dword
; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index c4e2f1e3487..4c1a769d599 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -21,34 +21,46 @@
; GCN-LABEL: {{^}}multi_else_break:
+; GCN: ; %main_body
+; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
-
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
-
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
-
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG: v_cmp_eq_u32_e32 vcc,
+; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
+
+; GCN: ; %Flow
+; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
+; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
+; GCN: s_andn2_b64 exec, exec, [[TMP0]]
+; GCN: s_cbranch_execnz [[INNER_LOOP]]
+
+; GCN: ; %Flow2
+; GCN: s_or_b64 exec, exec, [[TMP0]]
+; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
+; GCN: s_andn2_b64 exec, exec, [[TMP1]]
+; GCN: s_cbranch_execnz [[OUTER_LOOP]]
+
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
main_body:
br label %LOOP.outer
@@ -78,12 +90,38 @@ ENDIF: ; preds = %LOOP
; OPT: llvm.amdgcn.end.cf
; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+
+; GCN: ; %LeafBlock1
+; GCN: s_mov_b64
+; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
+; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN: s_orn2_b64 [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: s_mov_b64 [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
+; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
+; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]
-; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]
-; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]
+; GCN: ; %Flow4
+; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
+; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN: s_andn2_b64 exec, exec, [[LEFT]]
; GCN-NEXT: s_cbranch_execnz
define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 33028f17531..f773357976c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 04df33b8dd4..3db6fd2d898 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@ endif:
ret void
}
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
-
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+
+; SI: ; %else
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+
+; SI: ; %if
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
+; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 73e56593ce8..6215a486a36 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
@@ -27,18 +27,23 @@ ENDIF:
; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI: s_and_b64 [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
+; SI: s_andn2_b64 exec, exec, [[LEFT]]
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
entry:
@@ -90,19 +95,21 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; This broke the old AMDIL cfg structurizer
; FUNC-LABEL: {{^}}loop_land_info_assert:
; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI: s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI: s_mov_b64 vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI: buffer_store_dword
; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:
-; SI: [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI: [[CONVEX_EXIT]]:
+; SI: s_mov_b64 vcc,
+; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
; SI: s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
; SI: s_cbranch_vccz [[ENDPGM]]
; SI: [[ENDPGM]]:
diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
index 70562a59f0a..6861d32dccf 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll
@@ -21,8 +21,8 @@ define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
}
; GCN-LABEL: {{^}}sub_i1_cf:
-; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}
-; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
+; GCN: ; %endif
+; GCN: s_not_b64
define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 3d980b749a9..ca85f0bee4c 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: s_and_saveexec_b64
; SI-NEXT: ; mask branch
; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
@@ -220,9 +219,10 @@ exit:
; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index a941e5fb1f7..08267b76aef 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: {{^}}testKernel
; GCN: BB0_1:
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_f32_e32
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)