author | Tom Stellard <thomas.stellard@amd.com> | 2016-02-12 23:45:29 +0000
---|---|---
committer | Tom Stellard <thomas.stellard@amd.com> | 2016-02-12 23:45:29 +0000
commit | bc4497b13ccc73e1c8c156350cc7fe50f9beae93 (patch) |
tree | 71e7e13564f64f708f17731941d9b73818cb5be9 /llvm/lib |
parent | 0de36ec169b8c818487606658ed2504c88f4c0e7 (diff) |
AMDGPU/SI: Detect uniform branches and emit s_cbranch instructions
Reviewers: arsenm
Subscribers: mareko, MatzeB, qcolombet, arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D16603
llvm-svn: 260765
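
In outline: AMDGPUAnnotateUniformValues asks DivergenceAnalysis about each conditional branch and tags the uniform ones with `amdgpu.uniform` metadata; SIAnnotateControlFlow then leaves those branches alone instead of wrapping them in the control-flow intrinsics, and instruction selection lowers them to scalar `s_cmp_*` + `s_cbranch_scc*` pairs instead of `v_cmp_*` plus EXEC-mask updates. A minimal IR sketch of the annotated form (a hypothetical kernel for illustration, not one of the patch's tests):

```llvm
; %a and %b are kernel arguments (scalar for the whole wave), so the
; comparison is uniform.  After AMDGPUAnnotateUniformValues runs, the
; conditional branch carries the metadata set by setUniformMetadata():
define void @uniform_if(i32 %a, i32 %b, i32 addrspace(1)* %out) {
entry:
  %cmp = icmp eq i32 %a, %b
  br i1 %cmp, label %then, label %done, !amdgpu.uniform !0

then:
  store i32 1, i32 addrspace(1)* %out
  br label %done

done:
  ret void
}

!0 = !{}   ; an empty MDNode, as attached by setUniformMetadata()
```

With the metadata in place, isUniformBr() returns true during selection, the si_uniform_br_scc pattern below can fire, and the branch should come out as an s_cmp_eq_i32/s_cbranch_scc1 pair rather than a v_cmp plus EXEC-mask sequence.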
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 18
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 55
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 25
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp | 15
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 38
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 69
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 27
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 33
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 5
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 5
15 files changed, 266 insertions, 41 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 3a2d384806f..31d223ea674 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -78,6 +78,7 @@ FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
+FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST);
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index dfddc345f28..673066aeec3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -43,6 +43,7 @@ public:
     AU.setPreservesAll();
   }
 
+  void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
 };
 
@@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
 
 char AMDGPUAnnotateUniformValues::ID = 0;
 
+static void setUniformMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+  if (I.isUnconditional())
+    return;
+
+  Value *Cond = I.getCondition();
+  if (!DA->isUniform(Cond))
+    return;
+
+  setUniformMetadata(I.getParent()->getTerminator());
+}
+
 void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
 
   if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
-    PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+    setUniformMetadata(PtrI);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2a1beb63c5..fb9a66cca84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
@@ -36,6 +37,20 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 namespace {
+
+static bool isCBranchSCC(const SDNode *N) {
+  assert(N->getOpcode() == ISD::BRCOND);
+  if (!N->hasOneUse())
+    return false;
+
+  SDValue Cond = N->getOperand(1);
+  if (Cond.getOpcode() == ISD::CopyToReg)
+    Cond = Cond.getOperand(2);
+  return Cond.getOpcode() == ISD::SETCC &&
+         Cond.getOperand(0).getValueType() == MVT::i32 &&
+         Cond.hasOneUse();
+}
+
 /// AMDGPU specific code to select AMDGPU machine instructions for
 /// SelectionDAG operations.
 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
@@ -82,6 +97,8 @@ private:
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
+  bool isUniformBr(const SDNode *N) const;
+
   SDNode *glueCopyToM0(SDNode *N) const;
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
@@ -143,6 +160,7 @@ private:
                        uint32_t Offset, uint32_t Width);
   SDNode *SelectS_BFEFromShifts(SDNode *N);
   SDNode *SelectS_BFE(SDNode *N);
+  SDNode *SelectBRCOND(SDNode *N);
 
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
@@ -509,6 +527,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
 
     return SelectS_BFE(N);
+  case ISD::BRCOND:
+    return SelectBRCOND(N);
   }
 
   return SelectCode(N);
@@ -623,6 +643,11 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
   return false;
 }
 
+bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
+  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+  return BB->getTerminator()->getMetadata("amdgpu.uniform");
+}
+
 const char *AMDGPUDAGToDAGISel::getPassName() const {
   return "AMDGPU DAG->DAG Pattern Instruction Selection";
 }
@@ -1365,6 +1390,36 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   return SelectCode(N);
 }
 
+SDNode *AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
+  SDValue Cond = N->getOperand(1);
+
+  if (isCBranchSCC(N)) {
+    // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+    return SelectCode(N);
+  }
+
+  // The result of VOPC instructions is or'd against ~EXEC before it is
+  // written to vcc or another SGPR.  This means that the value '1' is always
+  // written to the corresponding bit for results that are masked.  In order
+  // to correctly check against vccz, we need to and VCC with the EXEC
+  // register in order to clear the value from the masked bits.
+
+  SDLoc SL(N);
+
+  SDNode *MaskedCond =
+      CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+                             CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+                             Cond);
+  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
+                                     SDValue(MaskedCond, 0),
+                                     SDValue()); // Passing SDValue() adds a
+                                                 // glue output.
+  return CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+                              N->getOperand(2), // Basic Block
+                              VCC.getValue(0),  // Chain
+                              VCC.getValue(1)); // Glue
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ebb28903251..32e9d8a9d19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -240,10 +240,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
 
 bool AMDGPUPassConfig::addPreISel() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
   addPass(createFlattenCFGPass());
-  if (ST.IsIRStructurizerEnabled())
-    addPass(createStructurizeCFGPass());
   return false;
 }
@@ -263,6 +260,9 @@ bool AMDGPUPassConfig::addGCPasses() {
 
 bool R600PassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
+  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+  if (ST.IsIRStructurizerEnabled())
+    addPass(createStructurizeCFGPass());
   addPass(createR600TextureIntrinsicsReplacer());
   return false;
 }
@@ -301,11 +301,11 @@ bool GCNPassConfig::addPreISel() {
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
   addPass(&AMDGPUAnnotateKernelFeaturesID);
-
+  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
-  addPass(createSIAnnotateControlFlowPass());
   addPass(createAMDGPUAnnotateUniformValues());
+  addPass(createSIAnnotateControlFlowPass());
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0a3eb7975aa..d1bb40aff9d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d028edaf986..92a178bebca 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -14,6 +14,7 @@
 
 #include "AMDGPU.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
@@ -43,6 +44,7 @@ static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
 static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
 
 class SIAnnotateControlFlow : public FunctionPass {
+  DivergenceAnalysis *DA;
 
   Type *Boolean;
   Type *Void;
@@ -105,6 +107,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<DivergenceAnalysis>();
    AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
@@ -115,6 +118,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
 
@@ -200,6 +204,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
 
 /// \brief Open a new "If" block
 void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  if (DA->isUniform(Term->getCondition())) {
+    return;
+  }
   Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
   Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
   push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -207,6 +214,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) {
 
 /// \brief Close the last "If" block and open a new "Else" block
 void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  if (DA->isUniform(Term->getCondition())) {
+    return;
+  }
   Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
   Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
   push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -290,6 +300,10 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
 
 /// \brief Handle a back edge (loop)
 void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  if (DA->isUniform(Term->getCondition())) {
+    return;
+  }
+
   BasicBlock *BB = Term->getParent();
   llvm::Loop *L = LI->getLoopFor(BB);
   BasicBlock *Target = Term->getSuccessor(1);
@@ -311,6 +325,9 @@ void
 SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
   llvm::Loop *L = LI->getLoopFor(BB);
 
+  if (Stack.back().first != BB)
+    return;
+
   if (L && L->getHeader() == BB) {
     // We can't insert an EndCF call into a loop header, because it will
     // get executed on every iteration of the loop, when it should be
@@ -326,14 +343,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
     BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
   }
 
-  CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
+  Value *Exec = popSaved();
+  if (!isa<UndefValue>(Exec))
+    CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
 }
 
 /// \brief Annotate the control flow with intrinsics so the backend can
 /// recognize if/then/else and loops.
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  DA = &getAnalysis<DivergenceAnalysis>();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
@@ -343,12 +364,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
     if (!Term || Term->isUnconditional()) {
       if (isTopOfStack(*I))
         closeControlFlow(*I);
+
       continue;
     }
 
     if (I.nodeVisited(Term->getSuccessor(1))) {
       if (isTopOfStack(*I))
         closeControlFlow(*I);
+
       handleLoop(Term);
       continue;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
index 8bda283f0fc..4de01707bb6 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
@@ -108,9 +108,20 @@ FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
   return new SIFixSGPRLiveRanges();
 }
 
+static bool hasOnlyScalarBr(const MachineBasicBlock *MBB,
+                            const SIInstrInfo *TII) {
+  for (MachineBasicBlock::const_iterator I = MBB->getFirstTerminator(),
+       E = MBB->end(); I != E; ++I) {
+    if (!TII->isSOPP(*I))
+      return false;
+  }
+  return true;
+}
+
 bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
   bool MadeChange = false;
@@ -147,7 +158,7 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
       }
     }
 
-    if (MBB->succ_size() < 2)
+    if (MBB->succ_size() < 2 || hasOnlyScalarBr(MBB, TII))
       continue;
 
     // We have structured control flow, so the number of successors should be
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 78bcc9aaf0a..0be5e6215c4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -130,6 +130,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
 
   for (MVT VT : MVT::integer_valuetypes()) {
     if (VT == MVT::i64)
@@ -1192,6 +1196,23 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
 }
 
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+  if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+
+  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+  default: return false;
+  case AMDGPUIntrinsic::amdgcn_if:
+  case AMDGPUIntrinsic::amdgcn_else:
+  case AMDGPUIntrinsic::amdgcn_break:
+  case AMDGPUIntrinsic::amdgcn_if_break:
+  case AMDGPUIntrinsic::amdgcn_else_break:
+  case AMDGPUIntrinsic::amdgcn_loop:
+  case AMDGPUIntrinsic::amdgcn_end_cf:
+    return true;
+  }
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -1202,13 +1223,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   SDNode *Intr = BRCOND.getOperand(1).getNode();
   SDValue Target = BRCOND.getOperand(2);
   SDNode *BR = nullptr;
+  SDNode *SetCC = nullptr;
 
   if (Intr->getOpcode() == ISD::SETCC) {
     // As long as we negate the condition everything is fine
-    SDNode *SetCC = Intr;
-    assert(SetCC->getConstantOperandVal(1) == 1);
-    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
-           ISD::SETNE);
+    SetCC = Intr;
     Intr = SetCC->getOperand(0).getNode();
 
   } else {
@@ -1217,7 +1236,16 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
     Target = BR->getOperand(1);
   }
 
-  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) {
+    // This is a uniform branch so we don't need to legalize.
+    return BRCOND;
+  }
+
+  assert(!SetCC ||
+        (SetCC->getConstantOperandVal(1) == 1 &&
+         isCFIntrinsic(Intr) &&
+         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+         ISD::SETNE));
 
   // Build the result and
   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d321805ec46..5a196aeac15 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,6 +60,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
+
+  bool isCFIntrinsic(const SDNode *Intr) const;
 public:
   SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7228d40e611..5650098efed 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1437,6 +1437,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
 
+  // Make sure we don't have SCC live-ins to basic blocks.  moveToVALU assumes
+  // all SCC users are in the same blocks as their defs.
+  const MachineBasicBlock *MBB = MI->getParent();
+  if (MI == &MBB->front()) {
+    if (MBB->isLiveIn(AMDGPU::SCC)) {
+      ErrInfo = "scc register cannot be live across blocks.";
+      return false;
+    }
+  }
+
   // Make sure the number of operands is correct.
   const MCInstrDesc &Desc = get(Opcode);
   if (!Desc.isVariadic() &&
@@ -1605,6 +1615,12 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
   case AMDGPU::S_LOAD_DWORD_IMM:
   case AMDGPU::S_LOAD_DWORD_SGPR:
   case AMDGPU::S_LOAD_DWORD_IMM_ci:
@@ -1621,6 +1637,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
   }
 }
@@ -1979,7 +1997,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
   // Legalize VOP2
-  if (isVOP2(*MI)) {
+  if (isVOP2(*MI) || isVOPC(*MI)) {
     legalizeOperandsVOP2(MRI, MI);
     return;
   }
@@ -2568,6 +2586,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       Inst->eraseFromParent();
       continue;
 
+    case AMDGPU::S_CBRANCH_SCC0:
+    case AMDGPU::S_CBRANCH_SCC1:
+      // Clear unused bits of vcc
+      BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(AMDGPU::VCC);
+      break;
+
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
@@ -2589,8 +2615,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     // both.
     for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
       MachineOperand &Op = Inst->getOperand(i);
-      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
         Inst->RemoveOperand(i);
+        addSCCDefUsersToVALUWorklist(Inst, Worklist);
+      }
     }
 
     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
@@ -2623,19 +2651,24 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       Inst->addOperand(MachineOperand::CreateImm(BitWidth));
     }
 
-    // Update the destination register class.
-    const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
-    if (!NewDstRC)
-      continue;
+    bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
+    unsigned NewDstReg = AMDGPU::NoRegister;
+    if (HasDst) {
+      // Update the destination register class.
+      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
+      if (!NewDstRC)
+        continue;
 
-    unsigned DstReg = Inst->getOperand(0).getReg();
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    MRI.replaceRegWith(DstReg, NewDstReg);
+      unsigned DstReg = Inst->getOperand(0).getReg();
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+    }
 
     // Legalize the operands
     legalizeOperands(Inst);
 
-    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+    if (HasDst)
+      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
   }
 }
@@ -2910,6 +2943,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
   }
 }
 
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
+                                 SmallVectorImpl<MachineInstr *> &Worklist) const {
+  // This assumes that all the users of SCC are in the same block
+  // as the SCC def.
+  for (MachineBasicBlock::iterator I = SCCDefInst,
+       E = SCCDefInst->getParent()->end(); I != E; ++I) {
+
+    // Exit if we find another SCC def.
+    if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+      return;
+
+    if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+      Worklist.push_back(I);
+  }
+}
+
 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
     const MachineInstr &Inst) const {
   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5fab4abf2e7..33dc3aef44c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -59,6 +59,9 @@ private:
       unsigned Reg, MachineRegisterInfo &MRI,
       SmallVectorImpl<MachineInstr *> &Worklist) const;
 
+  void addSCCDefUsersToVALUWorklist(
+    MachineInstr *SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const;
+
   const TargetRegisterClass *
   getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e032935f598..bed05dd06d8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -247,6 +247,30 @@ def si_truncstore_local_i16 : PatFrag <
   return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
 }]>;
 
+def si_setcc_uniform : PatFrag <
+  (ops node:$lhs, node:$rhs, node:$cond),
+  (setcc node:$lhs, node:$rhs, node:$cond), [{
+  for (SDNode *Use : N->uses()) {
+    if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
+      return false;
+
+    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+    if (Reg != AMDGPU::SCC)
+      return false;
+  }
+  return true;
+}]>;
+
+def si_uniform_br : PatFrag <
+  (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
+  return isUniformBr(N);
+}]>;
+
+def si_uniform_br_scc : PatFrag <
+  (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
+  return isCBranchSCC(N);
+}]>;
+
 multiclass SIAtomicM0Glue2 <string op_name> {
 
   def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
@@ -826,7 +850,8 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
 class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
                    string opName, PatLeaf cond> : SOPC <
   op, (outs), (ins rc:$src0, rc:$src1),
-  opName#" $src0, $src1", []> {
+  opName#" $src0, $src1",
+  [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
   let Defs = [SCC];
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0b793166c8c..37dce31c9e9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -336,18 +336,18 @@ defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
 // SOPC Instructions
 //===----------------------------------------------------------------------===//
 
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32", COND_NE>;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32", COND_SLE>;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
 ////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
 ////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
 ////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
@@ -449,7 +449,8 @@ def S_CBRANCH_SCC0 : SOPP <
 >;
 def S_CBRANCH_SCC1 : SOPP <
   0x00000005, (ins sopp_brtarget:$simm16),
-  "s_cbranch_scc1 $simm16"
+  "s_cbranch_scc1 $simm16",
+  [(si_uniform_br_scc SCC, bb:$simm16)]
 >;
 } // End Uses = [SCC]
@@ -2130,7 +2131,7 @@ def : Pat <
 def : Pat <
   (i64 (ctpop i64:$src)),
     (i64 (REG_SEQUENCE SReg_64,
-     (S_BCNT1_I32_B64 $src), sub0,
+     (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
      (S_MOV_B32 0), sub1))
 >;
@@ -3030,10 +3031,12 @@ def : ZExt_i64_i32_Pat<anyext>;
 def : ZExt_i64_i1_Pat<zext>;
 def : ZExt_i64_i1_Pat<anyext>;
 
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple outputs.
 def : Pat <
   (i64 (sext i32:$src)),
     (REG_SEQUENCE SReg_64, $src, sub0,
-    (S_ASHR_I32 $src, 31), sub1)
+    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SGPR_32)), sub1)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 384275f7534..a1eb185cf09 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -427,7 +427,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
     &AMDGPU::VReg_256RegClass,
     &AMDGPU::SReg_256RegClass,
     &AMDGPU::VReg_512RegClass,
-    &AMDGPU::SReg_512RegClass
+    &AMDGPU::SReg_512RegClass,
+    &AMDGPU::SCC_CLASSRegClass,
   };
 
   for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -442,6 +443,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
   switch (RC->getSize()) {
+  case 0: return false;
+  case 1: return false;
   case 4:
     return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
   case 8:
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 5ab8830d1d9..a2debb1fc88 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -81,6 +81,11 @@ foreach Index = 0-255 in {
 // Groupings using register classes and tuples
 //===----------------------------------------------------------------------===//
 
+def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
+  let CopyCost = -1;
+  let isAllocatable = 0;
+}
+
 // TODO: Do we need to set DwarfRegAlias on register tuples?
 
 // SGPR 32-bit registers
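
For contrast with the uniform case above, a branch DivergenceAnalysis cannot prove uniform gets no metadata and still flows through SIAnnotateControlFlow's openIf()/closeControlFlow(), which bracket it with the control-flow intrinsics named at the top of that file. A sketch of the rewritten IR, assuming the If intrinsic is named llvm.amdgcn.if (consistent with the llvm.amdgcn.loop and llvm.amdgcn.end.cf strings visible in the hunk) and returns the { i1, i64 } pair implied by the two ExtractValueInsts in openIf(); the workitem-id intrinsic is illustrative only:

```llvm
define void @divergent_if(i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()   ; per-lane value: divergent
  %cmp = icmp eq i32 %tid, 0
  ; openIf() rewrites the plain `br i1 %cmp, ...` into an llvm.amdgcn.if call
  ; that yields the branch condition plus the saved exec mask:
  %ret  = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp)
  %cond = extractvalue { i1, i64 } %ret, 0
  %mask = extractvalue { i1, i64 } %ret, 1
  br i1 %cond, label %then, label %done

then:
  store i32 1, i32 addrspace(1)* %out
  br label %done

done:
  ; closeControlFlow() restores the exec mask at the join point.
  call void @llvm.amdgcn.end.cf(i64 %mask)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare { i1, i64 } @llvm.amdgcn.if(i1)
declare void @llvm.amdgcn.end.cf(i64)
```

Only the uniform path avoids this mask bookkeeping, which is what makes detecting uniform branches and selecting plain s_cbranch instructions worthwhile.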