| author | Valery Pykhtin <Valery.Pykhtin@amd.com> | 2018-11-30 14:21:56 +0000 |
|---|---|---|
| committer | Valery Pykhtin <Valery.Pykhtin@amd.com> | 2018-11-30 14:21:56 +0000 |
| commit | 3d9afa273f94bfc1a7a63d7fd6f1484ffc34453c | |
| tree | 44171fc2b67bc5a9e34a85780e3f9bcd859fae68 /llvm/lib/Target/AMDGPU | |
| parent | 445b0b6260238f3e59204e6af921447564962004 | |
[AMDGPU] Combine DPP mov with use instructions (VOP1/2/3)
Introduces DPP pseudo instructions and the pass that combines DPP mov with subsequent uses.
Differential revision: https://reviews.llvm.org/D53762
llvm-svn: 347993
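
For orientation (this example is not part of the patch; the opcode and register names are illustrative), the rewrite performed by the new pass has this shape, in the pseudo-MIR notation of the GCNDPPCombine.cpp header below:

    $old = ...
    $dpp_value = V_MOV_B32_dpp $old, $vgpr_read_from_other_lane, dpp_controls..., $bound_ctrl
    $res = V_MAX_U32_e32 $dpp_value, $src1

combines into

    $res = V_MAX_U32_dpp $folded_old, $vgpr_read_from_other_lane, $src1, dpp_controls..., $folded_bound_ctrl

V_MAX_U32 is also one of the opcodes for which the pass can fold an immediate $old value (see foldOldOpnd in the new file).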
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 12 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 446 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 81 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 32 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 28 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 30 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP2Instructions.td | 69 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOPInstructions.td | 46 |
12 files changed, 711 insertions, 50 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 07ae2bee49b..b77b1f8ad79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
 
 // SI Passes
+FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
@@ -93,6 +94,9 @@ extern char &AMDGPULowerKernelAttributesID;
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
+void initializeGCNDPPCombinePass(PassRegistry &);
+extern char &GCNDPPCombineID;
+
 void initializeR600ClauseMergePassPass(PassRegistry &);
 extern char &R600ClauseMergePassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 23470c7a4d2..8530be9269c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
 include "llvm/Target/Target.td"
 include "AMDGPUFeatures.td"
 
+class BoolToList<bit Value> {
+  list<int> ret = !if(Value, [1]<int>, []<int>);
+}
+
 //===------------------------------------------------------------===//
 // Subtarget Features (device properties)
 //===------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2198ba8d6c0..4d265713aaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -106,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
   cl::desc("Enable SDWA peepholer"),
   cl::init(true));
 
+static cl::opt<bool> EnableDPPCombine(
+  "amdgpu-dpp-combine",
+  cl::desc("Enable DPP combiner"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -158,6 +163,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeR600VectorRegMergerPass(*PR);
   initializeGlobalISel(*PR);
   initializeAMDGPUDAGToDAGISelPass(*PR);
+  initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -790,6 +796,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
   //
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
+  if (EnableDPPCombine)
+    addPass(&GCNDPPCombineID);
   addPass(&DeadMachineInstructionElimID);
   addPass(&SILoadStoreOptimizerID);
   if (EnableSDWAPeephole) {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bd941930872..e6a718bcb30 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5275,12 +5275,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
-  // All DPP instructions with at least one source operand have a fake "old"
-  // source at the beginning that's tied to the dst operand. Handle it here.
-  if (Desc.getNumOperands() >= 2)
-    Inst.addOperand(Inst.getOperand(0));
-
   for (unsigned E = Operands.size(); I != E; ++I) {
+    auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+                                            MCOI::TIED_TO);
+    if (TiedTo != -1) {
+      assert((unsigned)TiedTo < Inst.getNumOperands());
+      // handle tied old or src2 for MAC instructions
+      Inst.addOperand(Inst.getOperand(TiedTo));
+    }
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
     if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bb1096bc1de..bdfaabac122 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  GCNDPPCombine.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
new file mode 100644
index 00000000000..56071d0d237
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -0,0 +1,446 @@
+//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass combines the V_MOV_B32_dpp instruction with its VALU uses as a DPP
+// src0 operand. If any of the use instructions cannot be combined with the
+// mov, the whole sequence is reverted.
+//
+// $old = ...
+// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
+//                            dpp_controls..., $bound_ctrl
+// $res = VALU $dpp_value, ...
+//
+// to
+//
+// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
+//                 dpp_controls..., $folded_bound_ctrl
+//
+// Combining rules:
+//
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//
+// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
+//
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//
+// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
+//
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//
+// -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-dpp-combine"
+
+STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
+
+namespace {
+
+class GCNDPPCombine : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const SIInstrInfo *TII;
+
+  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
+  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
+
+  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
+                            RegSubRegPair OldOpndVGPR,
+                            MachineOperand &OldOpndValue) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              MachineOperand *OldOpnd,
+                              bool BoundCtrlZero) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              bool BoundCtrlZero) const;
+
+  bool hasNoImmOrEqual(MachineInstr &MI,
+                       unsigned OpndName,
+                       int64_t Value,
+                       int64_t Mask = -1) const;
+
+  bool combineDPPMov(MachineInstr &MI) const;
+
+public:
+  static char ID;
+
+  GCNDPPCombine() : MachineFunctionPass(ID) {
+    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN DPP Combine"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
+
+char GCNDPPCombine::ID = 0;
+
+char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
+
+FunctionPass *llvm::createGCNDPPCombinePass() {
+  return new GCNDPPCombine();
+}
+
+static int getDPPOp(unsigned Op) {
+  auto DPP32 = AMDGPU::getDPPOp32(Op);
+  if (DPP32 != -1)
+    return DPP32;
+
+  auto E32 = AMDGPU::getVOPe32(Op);
+  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
+}
+
+// tracks the register operand definition and returns:
+// 1. immediate operand used to initialize the register if found
+// 2. nullptr if the register operand is undef
+// 3. the operand itself otherwise
+MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
+  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
+  if (!Def)
+    return nullptr;
+
+  switch(Def->getOpcode()) {
+  default: break;
+  case AMDGPU::IMPLICIT_DEF:
+    return nullptr;
+  case AMDGPU::COPY:
+  case AMDGPU::V_MOV_B32_e32: {
+    auto &Op1 = Def->getOperand(1);
+    if (Op1.isImm())
+      return &Op1;
+    break;
+  }
+  }
+  return &OldOpnd;
+}
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           bool BoundCtrlZero) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
+         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
+
+  auto OrigOp = OrigMI.getOpcode();
+  auto DPPOp = getDPPOp(OrigOp);
+  if (DPPOp == -1) {
+    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
+    return nullptr;
+  }
+
+  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
+                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+  bool Fail = false;
+  do {
+    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
+    assert(Dst);
+    DPPInst.add(*Dst);
+    int NumOperands = 1;
+
+    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
+    if (OldIdx != -1) {
+      assert(OldIdx == NumOperands);
+      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+      ++NumOperands;
+    }
+
+    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src0_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src0_modifiers));
+      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod0->getImm());
+      ++NumOperands;
+    }
+    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
+    assert(Src0);
+    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
+      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
+      Fail = true;
+      break;
+    }
+    DPPInst.add(*Src0);
+    ++NumOperands;
+
+    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src1_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src1_modifiers));
+      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod1->getImm());
+      ++NumOperands;
+    }
+    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src1);
+      ++NumOperands;
+    }
+
+    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src2);
+    }
+
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
+    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+  } while (false);
+
+  if (Fail) {
+    DPPInst.getInstr()->eraseFromParent();
+    return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
+  return DPPInst.getInstr();
+}
+
+GCNDPPCombine::RegSubRegPair
+GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
+                           RegSubRegPair OldOpndVGPR,
+                           MachineOperand &OldOpndValue) const {
+  assert(OldOpndValue.isImm());
+  switch (OrigMI.getOpcode()) {
+  default: break;
+  case AMDGPU::V_MAX_U32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MAX_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MIN_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
+      return OldOpndVGPR;
+    break;
+
+  case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_U32_U24_e32:
+    if (OldOpndValue.getImm() == 1) {
+      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+      assert(Src1 && Src1->isReg());
+      return getRegSubRegPair(*Src1);
+    }
+    break;
+  }
+  return RegSubRegPair();
+}
+
+// Cases to combine:
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
+
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
+
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           MachineOperand *OldOpndValue,
+                                           bool BoundCtrlZero) const {
+  assert(OldOpndVGPR.Reg);
+  if (!BoundCtrlZero && OldOpndValue) {
+    assert(OldOpndValue->isImm());
+    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
+    if (!OldOpndVGPR.Reg) {
+      LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
+      return nullptr;
+    }
+  }
+  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+}
+
+// returns true if MI doesn't have OpndName immediate operand or the
+// operand has Value
+bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
+                                    int64_t Value, int64_t Mask) const {
+  auto *Imm = TII->getNamedOperand(MI, OpndName);
+  if (!Imm)
+    return true;
+
+  assert(Imm->isImm());
+  return (Imm->getImm() & Mask) == Value;
+}
+
+bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
+  assert(BCZOpnd && BCZOpnd->isImm());
+  bool BoundCtrlZero = 0 != BCZOpnd->getImm();
+
+  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+  assert(OldOpnd && OldOpnd->isReg());
+  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
+  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
+  if (OldOpndValue) {
+    if (BoundCtrlZero) {
+      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
+      OldOpndValue = nullptr;
+    } else {
+      if (!OldOpndValue->isImm()) {
+        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
+        return false;
+      }
+      if (OldOpndValue->getImm() == 0) {
+        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
+        OldOpndValue = nullptr;
+        BoundCtrlZero = true;
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  old=";
+             if (!OldOpndValue)
+               dbgs() << "undef";
+             else
+               dbgs() << OldOpndValue->getImm();
dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); + + std::vector<MachineInstr*> OrigMIs, DPPMIs; + if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef + OldOpndVGPR = RegSubRegPair( + MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + DPPMIs.push_back(UndefInst.getInstr()); + } + + OrigMIs.push_back(&MovMI); + bool Rollback = true; + for (auto &Use : MRI->use_nodbg_operands( + TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { + Rollback = true; + + auto &OrigMI = *Use.getParent(); + auto OrigOp = OrigMI.getOpcode(); + if (TII->isVOP3(OrigOp)) { + if (!TII->hasVALU32BitEncoding(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); + break; + } + // check if other than abs|neg modifiers are set (opsel for example) + const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); + if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n"); + break; + } + } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + break; + } + + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); + if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else if (OrigMI.isCommutable() && + &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *BB = OrigMI.getParent(); + auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); + BB->insert(OrigMI, NewMI); + if (TII->commuteInstruction(*NewMI)) { + LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else + LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); + NewMI->eraseFromParent(); + } else + LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + if (Rollback) + break; + OrigMIs.push_back(&OrigMI); + } + + for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs)) + MI->eraseFromParent(); + + return !Rollback; +} + +bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasDPP() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + + assert(MRI->isSSA() && "Must be run on SSA"); + + bool Changed = false; + for (auto &MBB : MF) { + for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) { + auto &MI = *I++; + if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { + Changed = true; + ++NumDPPMovsCombined; + } + } + } + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 580ceed8b8d..902ed3bf627 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5632,3 +5632,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +static +TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { + assert(RegOpnd.isReg()); + return RegOpnd.isUndef() ? 
+  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
+                             getRegSubRegPair(RegOpnd);
+}
+
+TargetInstrInfo::RegSubRegPair
+llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
+  assert(MI.isRegSequence());
+  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
+    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
+      auto &RegOp = MI.getOperand(1 + 2 * I);
+      return getRegOrUndef(RegOp);
+    }
+  return TargetInstrInfo::RegSubRegPair();
+}
+
+// Try to find the definition of reg:subreg in subreg-manipulation pseudos
+// Following a subreg of reg:subreg isn't supported
+static bool followSubRegDef(MachineInstr &MI,
+                            TargetInstrInfo::RegSubRegPair &RSR) {
+  if (!RSR.SubReg)
+    return false;
+  switch (MI.getOpcode()) {
+  default: break;
+  case AMDGPU::REG_SEQUENCE:
+    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
+    return true;
+  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
+  case AMDGPU::INSERT_SUBREG:
+    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
+      // inserted the subreg we're looking for
+      RSR = getRegOrUndef(MI.getOperand(2));
+    else { // the subreg in the rest of the reg
+      auto R1 = getRegOrUndef(MI.getOperand(1));
+      if (R1.SubReg) // subreg of subreg isn't supported
+        return false;
+      RSR.Reg = R1.Reg;
+    }
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                                     MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA());
+  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+    return nullptr;
+
+  auto RSR = P;
+  auto *DefInst = MRI.getVRegDef(RSR.Reg);
+  while (auto *MI = DefInst) {
+    DefInst = nullptr;
+    switch (MI->getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::V_MOV_B32_e32: {
+      auto &Op1 = MI->getOperand(1);
+      if (Op1.isReg() &&
+          TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+        if (Op1.isUndef())
+          return nullptr;
+        RSR = getRegSubRegPair(Op1);
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+      break;
+    }
+    default:
+      if (followSubRegDef(*MI, RSR)) {
+        if (!RSR.Reg)
+          return nullptr;
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+    }
+    if (!DefInst)
+      return MI;
+  }
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2c18455ac55..d4ed0bf204d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -917,9 +917,36 @@ public:
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
-
 };
 
+/// \brief Returns true if a reg:subreg pair P has a TRC class
+inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
+                         const TargetRegisterClass &TRC,
+                         MachineRegisterInfo &MRI) {
+  auto *RC = MRI.getRegClass(P.Reg);
+  if (!P.SubReg)
+    return RC == &TRC;
+  auto *TRI = MRI.getTargetRegisterInfo();
+  return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
+}
+
+/// \brief Create RegSubRegPair from a register MachineOperand
+inline
+TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
+  assert(O.isReg());
+  return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
+}
+
+/// \brief Return the SubReg component from REG_SEQUENCE
+TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
+                                                    unsigned SubReg);
+
+/// \brief Return the defining instruction for a given reg:subreg pair,
+/// skipping copy-like instructions and subreg-manipulation pseudos.
+/// Following another subreg of a reg:subreg isn't supported.
+MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                               MachineRegisterInfo &MRI);
+
 namespace AMDGPU {
 
 LLVM_READONLY
@@ -932,6 +959,9 @@ namespace AMDGPU {
   int getSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
+  int getDPPOp32(uint16_t Opcode);
+
+  LLVM_READONLY
   int getBasicFromSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 265a05706a8..13afa4d4974 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1622,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
             0, // 64-bit dst - No DPP or SDWA for 64-bit operands
             !if(!eq(Src0VT.Size, 64),
               0, // 64-bit src0
-              !if(!eq(Src0VT.Size, 64),
+              !if(!eq(Src1VT.Size, 64),
                 0, // 64-bit src2
                 1
               )
@@ -1631,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
   );
 }
 
+class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !if(!eq(NumSrcArgs, 0), 0,
+                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
 class BitOr<bit a, bit b> {
   bit ret = !if(a, 1, !if(b, 1, 0));
 }
@@ -1710,7 +1716,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSDWAOMod = isFloatType<DstVT>.ret;
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
-  field bit HasExtDPP = HasExt;
+  field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
   field bit HasExtSDWA = HasExt;
   field bit HasExtSDWA9 = HasExt;
   field int NeedPatGen = PatGenMode.NoPattern;
@@ -1741,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                            getOpSelMod<Src0VT>.ret,
                                            getOpSelMod<Src1VT>.ret,
                                            getOpSelMod<Src2VT>.ret>.ret;
-  field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
-                               HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+  field dag InsDPP = !if(HasExtDPP,
+                         getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+                                   HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+                         (ins));
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
                                  HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
                                  DstVT>.ret;
@@ -1756,7 +1764,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                           HasSrc0FloatMods,
                                           HasSrc1FloatMods,
                                           HasSrc2FloatMods>.ret;
-  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = !if(HasExtDPP,
+                            getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
   field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
   field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
 }
@@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
   let ValueCols = [["Default"]];
 }
 
+// Maps ordinary instructions to their DPP counterparts
+def getDPPOp32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["DPP"]];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "Commutable_REV";
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 9f4673cf7ab..9da99d9f63e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  foreach _ = BoolToList<P.HasExtDPP>.ret in
+    def _dpp : VOP1_DPP_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -500,13 +506,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
-  VOP_DPP <ps.OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   let Inst{8-0} = 0xfa; // dpp
   let Inst{16-9} = op;
@@ -544,9 +545,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
             VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
             VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_NOP : VOP1_Real_vi <0x0>;
@@ -717,9 +719,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
            VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
            VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index db031be7e55..1bea9c367b4 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
                       [(set P.DstVT:$vdst,
@@ -155,7 +160,12 @@ multiclass VOP2Inst<string opName,
                     bit GFX9Renamed = 0> :
   VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
  VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
-  VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed>;
+  VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+  let renamedInGFX9 = GFX9Renamed in {
+    foreach _ = BoolToList<P.HasExtDPP>.ret in
+      def _dpp : VOP2_DPP_Pseudo <opName, P>;
+  }
+}
 
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
@@ -172,6 +182,8 @@ multiclass VOP2bInst <string opName,
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
        let AsmMatchConverter = "cvtSdwaVOP2b";
      }
+      foreach _ = BoolToList<P.HasExtDPP>.ret in
+        def _dpp : VOP2_DPP_Pseudo <opName, P>;
     }
 
     def _e64 : VOP3_Pseudo <opName,
                P, getVOP2Pat64<node, P>.ret>,
@@ -194,6 +206,9 @@ multiclass VOP2eInst <string opName,
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
        let AsmMatchConverter = "cvtSdwaVOP2b";
      }
+
+      foreach _ = BoolToList<P.HasExtDPP>.ret in
+        def _dpp : VOP2_DPP_Pseudo <opName, P>;
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -233,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
                        0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins DstRCDPP:$old,
-                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -778,13 +793,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
-  VOP_DPP <OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   bits<8> src1;
   let Inst{8-0} = 0xfa; //dpp
@@ -865,8 +875,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
      let AsmString = AsmName # ps.AsmOperands;
     }
-    def _dpp :
-      VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+      def _dpp_vi :
+        VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
+        VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+          VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+          let AsmString = AsmName # ps.AsmOperands;
+        }
   }
 }
 
@@ -893,10 +908,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
      let AsmString = AsmName # ps.AsmOperands;
     }
-    def _dpp_gfx9 :
-      VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
-      let DecoderNamespace = "SDWA9";
-    }
+    foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+      def _dpp_gfx9 :
+        VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
+        VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+          VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+          let AsmString = AsmName # ps.AsmOperands;
+          let DecoderNamespace = "SDWA9";
+        }
 }
 
 multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@@ -914,19 +933,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
            VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
            VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
   }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
-    let DecoderNamespace = "SDWA9";
-  }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 } // AssemblerPredicates = [isGFX9]
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
   Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>,
   VOP2_SDWA9_Real<op> {
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index e177b2fd081..7de7d90d27b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
   let Inst{63-60} = row_mask;
 }
 
-class VOP_DPP <string OpName, VOPProfile P> :
-  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
-  VOP_DPPe<P> {
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+  VOP <OpName>,
+  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
+  MnemonicAlias <OpName#"_dpp", OpName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
 
   let mayLoad = 0;
   let mayStore = 0;
@@ -517,6 +522,11 @@ class VOP_DPP <string OpName, VOPProfile P> :
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
+  let Uses = [EXEC];
+  let isConvergent = 1;
+
+  string Mnemonic = OpName;
+  string AsmOperands = P.AsmDPP;
 
   let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
   let SubtargetPredicate = HasDPP;
@@ -526,6 +536,36 @@ class VOP_DPP <string OpName, VOPProfile P> :
   let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
+
+  VOPProfile Pfl = P;
+}
+
+class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let isConvergent = ps.isConvergent;
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AssemblerPredicate = ps.AssemblerPredicate;
+  let AsmMatchConverter = ps.AsmMatchConverter;
+  let AsmVariantName = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace = ps.DecoderNamespace;
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+  let TSFlags = ps.TSFlags;
 }
 
 class getNumNodeArgs<SDPatternOperator Op> {
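
A usage note, not part of the commit itself: the combiner is registered unconditionally but gated off by default (cl::init(false) on amdgpu-dpp-combine above). Assuming the usual LLVM cl::opt plumbing, a sketch of exercising the new pass:

    llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine < kernel.ll

The flag name comes from the EnableDPPCombine option this patch adds to AMDGPUTargetMachine.cpp; the -mcpu value is illustrative, since the pass runs on any subtarget for which ST.hasDPP() holds.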

