| author | Valery Pykhtin <Valery.Pykhtin@amd.com> | 2018-11-30 14:21:56 +0000 | 
|---|---|---|
| committer | Valery Pykhtin <Valery.Pykhtin@amd.com> | 2018-11-30 14:21:56 +0000 | 
| commit | 3d9afa273f94bfc1a7a63d7fd6f1484ffc34453c (patch) | |
| tree | 44171fc2b67bc5a9e34a85780e3f9bcd859fae68 /llvm/lib/Target | |
| parent | 445b0b6260238f3e59204e6af921447564962004 (diff) | |
| download | bcm5719-llvm-3d9afa273f94bfc1a7a63d7fd6f1484ffc34453c.tar.gz bcm5719-llvm-3d9afa273f94bfc1a7a63d7fd6f1484ffc34453c.zip | |
[AMDGPU] Combine DPP mov with use instructions (VOP1/2/3)
Introduces DPP pseudo instructions and a pass that combines a DPP mov with its subsequent uses.
Differential revision: https://reviews.llvm.org/D53762
llvm-svn: 347993
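
In outline, the new GCNDPPCombine pass folds a `V_MOV_B32_dpp` into each VALU instruction that reads its result as `src0` (or as `src1` for commutable operations, via a cloned-and-commuted copy of the use), and rolls the whole sequence back if any use cannot be combined. A sketch of the rewrite, paraphrased from the header comment of the new pass below (register names are illustrative):

```
; before: a DPP mov feeding an ordinary VALU instruction
$old       = ...
$dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
                           dpp_controls..., $bound_ctrl
$res       = VALU $dpp_value, ...

; after: the mov is folded into the DPP form of its use
$res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
                dpp_controls..., $folded_bound_ctrl
```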
Diffstat (limited to 'llvm/lib/Target')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 4 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 12 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 446 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 81 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.h | 32 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 28 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 30 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP2Instructions.td | 69 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOPInstructions.td | 46 |
12 files changed, 711 insertions, 50 deletions
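
The combiner runs in the machine-SSA optimization pipeline right after SIFoldOperands, but only when the new `amdgpu-dpp-combine` flag is set (the `cl::opt` is registered with `cl::init(false)`). A hypothetical way to exercise the pass on an IR file (the file names and `-mcpu` choice are placeholders; only the flag itself comes from this patch):

```sh
# Hypothetical invocation: -amdgpu-dpp-combine is the cl::opt added in
# AMDGPUTargetMachine.cpp; input.ll, output.s and gfx900 are examples.
llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=true input.ll -o output.s
```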
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 07ae2bee49b..b77b1f8ad79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel);
 
 // SI Passes
+FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
@@ -93,6 +94,9 @@ extern char &AMDGPULowerKernelAttributesID;
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
+void initializeGCNDPPCombinePass(PassRegistry &);
+extern char &GCNDPPCombineID;
+
 void initializeR600ClauseMergePassPass(PassRegistry &);
 extern char &R600ClauseMergePassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 23470c7a4d2..8530be9269c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td"
 include "llvm/Target/Target.td"
 include "AMDGPUFeatures.td"
 
+class BoolToList<bit Value> {
+  list<int> ret = !if(Value, [1]<int>, []<int>);
+}
+
 //===------------------------------------------------------------===//
 // Subtarget Features (device properties)
 //===------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2198ba8d6c0..4d265713aaf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -106,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole(
   cl::desc("Enable SDWA peepholer"),
   cl::init(true));
 
+static cl::opt<bool> EnableDPPCombine(
+  "amdgpu-dpp-combine",
+  cl::desc("Enable DPP combiner"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -158,6 +163,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeR600VectorRegMergerPass(*PR);
   initializeGlobalISel(*PR);
   initializeAMDGPUDAGToDAGISelPass(*PR);
+  initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -790,6 +796,8 @@ void GCNPassConfig::addMachineSSAOptimization() {
   //
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
+  if (EnableDPPCombine)
+    addPass(&GCNDPPCombineID);
   addPass(&DeadMachineInstructionElimID);
   addPass(&SILoadStoreOptimizerID);
   if (EnableSDWAPeephole) {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index bd941930872..e6a718bcb30 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -5275,12 +5275,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
-  // All DPP instructions with at least one source operand have a fake "old"
-  // source at the beginning that's tied to the dst operand. Handle it here.
-  if (Desc.getNumOperands() >= 2)
-    Inst.addOperand(Inst.getOperand(0));
-
   for (unsigned E = Operands.size(); I != E; ++I) {
+    auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
+                                            MCOI::TIED_TO);
+    if (TiedTo != -1) {
+      assert((unsigned)TiedTo < Inst.getNumOperands());
+      // handle tied old or src2 for MAC instructions
+      Inst.addOperand(Inst.getOperand(TiedTo));
+    }
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
     if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bb1096bc1de..bdfaabac122 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  GCNDPPCombine.cpp
   )
 
 add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
new file mode 100644
index 00000000000..56071d0d237
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -0,0 +1,446 @@
+//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP
+// src0 operand. If any of the use instructions cannot be combined with the
+// mov, the whole sequence is reverted.
+//
+// $old = ...
+// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
+//                            dpp_controls..., $bound_ctrl
+// $res = VALU $dpp_value, ...
+//
+// to
+//
+// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
+//                 dpp_controls..., $folded_bound_ctrl
+//
+// Combining rules:
+//
+// $bound_ctrl is DPP_BOUND_ZERO, $old is any
+// $bound_ctrl is DPP_BOUND_OFF, $old is 0
+// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
+//
+// $bound_ctrl is DPP_BOUND_OFF, $old is undef
+// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
+//
+// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+// -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-dpp-combine"
+
+STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
+
+namespace {
+
+class GCNDPPCombine : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const SIInstrInfo *TII;
+
+  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+
+  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
+
+  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
+                            RegSubRegPair OldOpndVGPR,
+                            MachineOperand &OldOpndValue) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              MachineOperand *OldOpnd,
+                              bool BoundCtrlZero) const;
+
+  MachineInstr *createDPPInst(MachineInstr &OrigMI,
+                              MachineInstr &MovMI,
+                              RegSubRegPair OldOpndVGPR,
+                              bool BoundCtrlZero) const;
+
+  bool hasNoImmOrEqual(MachineInstr &MI,
+                       unsigned OpndName,
+                       int64_t Value,
+                       int64_t Mask = -1) const;
+
+  bool combineDPPMov(MachineInstr &MI) const;
+
+public:
+  static char ID;
+
+  GCNDPPCombine() : MachineFunctionPass(ID) {
+    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN DPP Combine"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
+
+char GCNDPPCombine::ID = 0;
+
+char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
+
+FunctionPass *llvm::createGCNDPPCombinePass() {
+  return new GCNDPPCombine();
+}
+
+static int getDPPOp(unsigned Op) {
+  auto DPP32 = AMDGPU::getDPPOp32(Op);
+  if (DPP32 != -1)
+    return DPP32;
+
+  auto E32 = AMDGPU::getVOPe32(Op);
+  return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
+}
+
+// tracks the register operand definition and returns:
+//   1. immediate operand used to initialize the register if found
+//   2. nullptr if the register operand is undef
+//   3. the operand itself otherwise
+MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
+  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
+  if (!Def)
+    return nullptr;
+
+  switch(Def->getOpcode()) {
+  default: break;
+  case AMDGPU::IMPLICIT_DEF:
+    return nullptr;
+  case AMDGPU::COPY:
+  case AMDGPU::V_MOV_B32_e32: {
+    auto &Op1 = Def->getOperand(1);
+    if (Op1.isImm())
+      return &Op1;
+    break;
+  }
+  }
+  return &OldOpnd;
+}
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           bool BoundCtrlZero) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
+         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
+
+  auto OrigOp = OrigMI.getOpcode();
+  auto DPPOp = getDPPOp(OrigOp);
+  if (DPPOp == -1) {
+    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
+    return nullptr;
+  }
+
+  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
+                         OrigMI.getDebugLoc(), TII->get(DPPOp));
+  bool Fail = false;
+  do {
+    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
+    assert(Dst);
+    DPPInst.add(*Dst);
+    int NumOperands = 1;
+
+    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
+    if (OldIdx != -1) {
+      assert(OldIdx == NumOperands);
+      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
+      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
+      ++NumOperands;
+    }
+
+    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src0_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src0_modifiers));
+      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod0->getImm());
+      ++NumOperands;
+    }
+    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
+    assert(Src0);
+    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
+      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
+      Fail = true;
+      break;
+    }
+    DPPInst.add(*Src0);
+    ++NumOperands;
+
+    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
+                                          AMDGPU::OpName::src1_modifiers)) {
+      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
+                                          AMDGPU::OpName::src1_modifiers));
+      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+      DPPInst.addImm(Mod1->getImm());
+      ++NumOperands;
+    }
+    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src1);
+      ++NumOperands;
+    }
+
+    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
+        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
+        Fail = true;
+        break;
+      }
+      DPPInst.add(*Src2);
+    }
+
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
+    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
+    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
+  } while (false);
+
+  if (Fail) {
+    DPPInst.getInstr()->eraseFromParent();
+    return nullptr;
+  }
+  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
+  return DPPInst.getInstr();
+}
+
+GCNDPPCombine::RegSubRegPair
+GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
+                           RegSubRegPair OldOpndVGPR,
+                           MachineOperand &OldOpndValue) const {
+  assert(OldOpndValue.isImm());
+  switch (OrigMI.getOpcode()) {
+  default: break;
+  case AMDGPU::V_MAX_U32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MAX_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
+      return OldOpndVGPR;
+    break;
+  case AMDGPU::V_MIN_I32_e32:
+    if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
+      return OldOpndVGPR;
+    break;
+
+  case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_U32_U24_e32:
+    if (OldOpndValue.getImm() == 1) {
+      auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+      assert(Src1 && Src1->isReg());
+      return getRegSubRegPair(*Src1);
+    }
+    break;
+  }
+  return RegSubRegPair();
+}
+
+// Cases to combine:
+//  $bound_ctrl is DPP_BOUND_ZERO, $old is any
+//  $bound_ctrl is DPP_BOUND_OFF, $old is 0
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is undef
+//  -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
+
+//  $bound_ctrl is DPP_BOUND_OFF, $old is foldable
+//  -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
+
+MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
+                                           MachineInstr &MovMI,
+                                           RegSubRegPair OldOpndVGPR,
+                                           MachineOperand *OldOpndValue,
+                                           bool BoundCtrlZero) const {
+  assert(OldOpndVGPR.Reg);
+  if (!BoundCtrlZero && OldOpndValue) {
+    assert(OldOpndValue->isImm());
+    OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
+    if (!OldOpndVGPR.Reg) {
+      LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
+      return nullptr;
+    }
+  }
+  return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
+}
+
+// returns true if MI doesn't have the OpndName immediate operand or the
+// operand has Value
+bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
+                                    int64_t Value, int64_t Mask) const {
+  auto *Imm = TII->getNamedOperand(MI, OpndName);
+  if (!Imm)
+    return true;
+
+  assert(Imm->isImm());
+  return (Imm->getImm() & Mask) == Value;
+}
+
+bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
+  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
+  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
+  assert(BCZOpnd && BCZOpnd->isImm());
+  bool BoundCtrlZero = 0 != BCZOpnd->getImm();
+
+  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
+
+  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+  assert(OldOpnd && OldOpnd->isReg());
+  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
+  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
+  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
+  if (OldOpndValue) {
+    if (BoundCtrlZero) {
+      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
+      OldOpndValue = nullptr;
+    } else {
+      if (!OldOpndValue->isImm()) {
+        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
+        return false;
+      }
+      if (OldOpndValue->getImm() == 0) {
+        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
+        OldOpndValue = nullptr;
+        BoundCtrlZero = true;
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  old=";
+    if (!OldOpndValue)
+      dbgs() << "undef";
+    else
+      dbgs() << OldOpndValue->getImm();
+    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
+
+  std::vector<MachineInstr*> OrigMIs, DPPMIs;
+  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
+    OldOpndVGPR = RegSubRegPair(
+      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
+    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
+                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
+    DPPMIs.push_back(UndefInst.getInstr());
+  }
+
+  OrigMIs.push_back(&MovMI);
+  bool Rollback = true;
+  for (auto &Use : MRI->use_nodbg_operands(
+       TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
+    Rollback = true;
+
+    auto &OrigMI = *Use.getParent();
+    auto OrigOp = OrigMI.getOpcode();
+    if (TII->isVOP3(OrigOp)) {
+      if (!TII->hasVALU32BitEncoding(OrigOp)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
+        break;
+      }
+      // check if other than abs|neg modifiers are set (opsel for example)
+      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
+      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
+          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
+        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
+        break;
+      }
+    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
+      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
+      break;
+    }
+
+    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
+    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
+                                        OldOpndValue, BoundCtrlZero)) {
+        DPPMIs.push_back(DPPInst);
+        Rollback = false;
+      }
+    } else if (OrigMI.isCommutable() &&
+               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+      auto *BB = OrigMI.getParent();
+      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
+      BB->insert(OrigMI, NewMI);
+      if (TII->commuteInstruction(*NewMI)) {
+        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
+        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
+                                          OldOpndValue, BoundCtrlZero)) {
+          DPPMIs.push_back(DPPInst);
+          Rollback = false;
+        }
+      } else
+        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
+      NewMI->eraseFromParent();
+    } else
+      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
+    if (Rollback)
+      break;
+    OrigMIs.push_back(&OrigMI);
+  }
+
+  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
+    MI->eraseFromParent();
+
+  return !Rollback;
+}
+
+bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
+  auto &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = ST.getInstrInfo();
+
+  assert(MRI->isSSA() && "Must be run on SSA");
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
+      auto &MI = *I++;
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
+        Changed = true;
+        ++NumDPPMovsCombined;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 580ceed8b8d..902ed3bf627 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5632,3 +5632,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
 
   return MCOp;
 }
+
+static
+TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
+  assert(RegOpnd.isReg());
+  return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
+                             getRegSubRegPair(RegOpnd);
+}
+
+TargetInstrInfo::RegSubRegPair
+llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
+  assert(MI.isRegSequence());
+  for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
+    if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
+      auto &RegOp = MI.getOperand(1 + 2 * I);
+      return getRegOrUndef(RegOp);
+    }
+  return TargetInstrInfo::RegSubRegPair();
+}
+
+// Try to find the definition of reg:subreg in subreg-manipulation pseudos
+// Following a subreg of reg:subreg isn't supported
+static bool followSubRegDef(MachineInstr &MI,
+                            TargetInstrInfo::RegSubRegPair &RSR) {
+  if (!RSR.SubReg)
+    return false;
+  switch (MI.getOpcode()) {
+  default: break;
+  case AMDGPU::REG_SEQUENCE:
+    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
+    return true;
+  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
+  case AMDGPU::INSERT_SUBREG:
+    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
+      // inserted the subreg we're looking for
+      RSR = getRegOrUndef(MI.getOperand(2));
+    else { // the subreg in the rest of the reg
+      auto R1 = getRegOrUndef(MI.getOperand(1));
+      if (R1.SubReg) // subreg of subreg isn't supported
+        return false;
+      RSR.Reg = R1.Reg;
+    }
+    return true;
+  }
+  return false;
+}
+
+MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                                     MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA());
+  if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+    return nullptr;
+
+  auto RSR = P;
+  auto *DefInst = MRI.getVRegDef(RSR.Reg);
+  while (auto *MI = DefInst) {
+    DefInst = nullptr;
+    switch (MI->getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::V_MOV_B32_e32: {
+      auto &Op1 = MI->getOperand(1);
+      if (Op1.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+        if (Op1.isUndef())
+          return nullptr;
+        RSR = getRegSubRegPair(Op1);
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+      break;
+    }
+    default:
+      if (followSubRegDef(*MI, RSR)) {
+        if (!RSR.Reg)
+          return nullptr;
+        DefInst = MRI.getVRegDef(RSR.Reg);
+      }
+    }
+    if (!DefInst)
+      return MI;
+  }
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2c18455ac55..d4ed0bf204d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -917,9 +917,36 @@ public:
   /// Return -1 if the target-specific opcode for the pseudo instruction does
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;
-
 };
 
+/// \brief Returns true if a reg:subreg pair P has a TRC class
+inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P,
+                         const TargetRegisterClass &TRC,
+                         MachineRegisterInfo &MRI) {
+  auto *RC = MRI.getRegClass(P.Reg);
+  if (!P.SubReg)
+    return RC == &TRC;
+  auto *TRI = MRI.getTargetRegisterInfo();
+  return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg);
+}
+
+/// \brief Create RegSubRegPair from a register MachineOperand
+inline
+TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) {
+  assert(O.isReg());
+  return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg());
+}
+
+/// \brief Return the SubReg component from REG_SEQUENCE
+TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
+                                                    unsigned SubReg);
+
+/// \brief Return the defining instruction for a given reg:subreg pair
+/// skipping copy like instructions and subreg-manipulation pseudos.
+/// Following another subreg of a reg:subreg isn't supported.
+MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
+                               MachineRegisterInfo &MRI);
+
 namespace AMDGPU {
 
   LLVM_READONLY
@@ -932,6 +959,9 @@ namespace AMDGPU {
   int getSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
+  int getDPPOp32(uint16_t Opcode);
+
+  LLVM_READONLY
   int getBasicFromSDWAOp(uint16_t Opcode);
 
   LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 265a05706a8..13afa4d4974 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1622,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
                     0, // 64-bit dst - No DPP or SDWA for 64-bit operands
                     !if(!eq(Src0VT.Size, 64),
                         0, // 64-bit src0
-                        !if(!eq(Src0VT.Size, 64),
+                        !if(!eq(Src1VT.Size, 64),
                             0, // 64-bit src2
                             1
                         )
@@ -1631,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
             );
 }
 
+class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !if(!eq(NumSrcArgs, 0), 0,
+                getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret);
+}
+
 class BitOr<bit a, bit b> {
   bit ret = !if(a, 1, !if(b, 1, 0));
 }
@@ -1710,7 +1716,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSDWAOMod = isFloatType<DstVT>.ret;
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
-  field bit HasExtDPP = HasExt;
+  field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
   field bit HasExtSDWA = HasExt;
   field bit HasExtSDWA9 = HasExt;
   field int NeedPatGen = PatGenMode.NoPattern;
@@ -1741,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                            getOpSelMod<Src0VT>.ret,
                                            getOpSelMod<Src1VT>.ret,
                                            getOpSelMod<Src2VT>.ret>.ret;
-  field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
-                               HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
+  field dag InsDPP = !if(HasExtDPP,
+                         getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
+                                   HasModifiers, Src0ModDPP, Src1ModDPP>.ret,
+                         (ins));
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
                                  HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
                                  DstVT>.ret;
@@ -1756,7 +1764,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                               HasSrc0FloatMods,
                                               HasSrc1FloatMods,
                                               HasSrc2FloatMods>.ret;
-  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = !if(HasExtDPP,
+                            getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, "");
   field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
   field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
 }
@@ -1931,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping {
   let ValueCols = [["Default"]];
 }
 
+// Maps ordinary instructions to their DPP counterparts
+def getDPPOp32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["DPP"]];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "Commutable_REV";
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 9f4673cf7ab..9da99d9f63e 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP1";
 }
 
+class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret =
     !if(P.HasModifiers,
@@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _e32 : VOP1_Pseudo <opName, P>;
   def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+  foreach _ = BoolToList<P.HasExtDPP>.ret in
+    def _dpp : VOP1_DPP_Pseudo <opName, P>;
 }
 
 // Special profile for instructions which have clamp
@@ -500,13 +506,8 @@ defm V_EXP_LEGACY_F32    : VOP1_Real_ci <0x46>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
-  VOP_DPP <ps.OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   let Inst{8-0}   = 0xfa; // dpp
   let Inst{16-9}  = op;
@@ -544,9 +545,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
     VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
     VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_NOP               : VOP1_Real_vi <0x0>;
@@ -717,9 +719,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
    VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
    VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
 
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+  foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+
 }
 
 defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index db031be7e55..1bea9c367b4 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -105,6 +105,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
   let AsmMatchConverter = "cvtSdwaVOP2";
 }
 
+class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_DPP_Pseudo <OpName, P, pattern> {
+}
+
+
 class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
   list<dag> ret = !if(P.HasModifiers,
     [(set P.DstVT:$vdst,
@@ -155,7 +160,12 @@ multiclass VOP2Inst<string opName,
                     bit GFX9Renamed = 0> :
     VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>,
     VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
-    VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed>;
+    VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> {
+  let renamedInGFX9 = GFX9Renamed in {
+    foreach _ = BoolToList<P.HasExtDPP>.ret in
+      def _dpp  : VOP2_DPP_Pseudo <opName, P>;
+  }
+}
 
 multiclass VOP2bInst <string opName,
                       VOPProfile P,
@@ -172,6 +182,8 @@ multiclass VOP2bInst <string opName,
         def _sdwa  : VOP2_SDWA_Pseudo <opName, P> {
           let AsmMatchConverter = "cvtSdwaVOP2b";
         }
+        foreach _ = BoolToList<P.HasExtDPP>.ret in
+          def _dpp  : VOP2_DPP_Pseudo <opName, P>;
       }
 
       def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -194,6 +206,9 @@ multiclass VOP2eInst <string opName,
       def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
        let AsmMatchConverter = "cvtSdwaVOP2b";
       }
+
+      foreach _ = BoolToList<P.HasExtDPP>.ret in
+        def _dpp  : VOP2_DPP_Pseudo <opName, P>;
     }
 
     def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -233,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
                        0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins DstRCDPP:$old,
-                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
+                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -778,13 +793,8 @@ defm V_CVT_PK_I16_I32     : VOP2_Real_e32e64_si <0x31>;
 // VI
 //===----------------------------------------------------------------------===//
 
-class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
-  VOP_DPP <OpName, P> {
-  let Defs = ps.Defs;
-  let Uses = ps.Uses;
-  let SchedRW = ps.SchedRW;
-  let hasSideEffects = ps.hasSideEffects;
-
+class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPPe <P> {
   bits<8> vdst;
   bits<8> src1;
   let Inst{8-0}   = 0xfa; //dpp
@@ -865,8 +875,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName>
       VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
       let AsmString = AsmName # ps.AsmOperands;
     }
-  def _dpp :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+      }
 }
 
 }
@@ -893,10 +908,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
      VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
      let AsmString = AsmName # ps.AsmOperands;
    }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
+        VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp");
+        let AsmString = AsmName # ps.AsmOperands;
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
@@ -914,19 +933,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
    VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
    }
-  def _dpp_gfx9 :
-    VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
-      let DecoderNamespace = "SDWA9";
-    }
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_gfx9 :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
+        let DecoderNamespace = "SDWA9";
+      }
 }
 
 } // AssemblerPredicates = [isGFX9]
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
   Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
-  // For now left dpp only for asm/dasm
-  // TODO: add corresponding pseudo
-  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+
+  foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+    def _dpp_vi :
+      VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
+      VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
 defm V_CNDMASK_B32        : VOP2_Real_e32e64_vi <0x0>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index e177b2fd081..7de7d90d27b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 {
   let Inst{63-60} = row_mask;
 }
 
-class VOP_DPP <string OpName, VOPProfile P> :
-  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
-  VOP_DPPe<P> {
+class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
+  VOP <OpName>,
+  SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
+  MnemonicAlias <OpName#"_dpp", OpName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
 
   let mayLoad = 0;
   let mayStore = 0;
@@ -517,6 +522,11 @@
   let VALU = 1;
   let DPP = 1;
   let Size = 8;
+  let Uses = [EXEC];
+  let isConvergent = 1;
+
+  string Mnemonic = OpName;
+  string AsmOperands = P.AsmDPP;
 
   let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
   let SubtargetPredicate = HasDPP;
@@ -526,6 +536,36 @@
   let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
   let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
+
+  VOPProfile Pfl = P;
+}
+
+class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let isConvergent         = ps.isConvergent;
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
 }
 
 class getNumNodeArgs<SDPatternOperator Op> {
```
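
A note on the `BoolToList` idiom that recurs in the TableGen changes: a `foreach` over an empty list instantiates nothing, so `foreach _ = BoolToList<P.HasExtDPP>.ret in def ...` acts as a compile-time conditional that only creates the `_dpp` pseudo when the profile supports a DPP encoding. A minimal standalone sketch of the idiom (the `MyInst` record is hypothetical; only `BoolToList` itself is from the patch):

```tablegen
// BoolToList maps 1 -> [1] and 0 -> [], as defined in AMDGPU.td above.
class BoolToList<bit Value> {
  list<int> ret = !if(Value, [1]<int>, []<int>);
}

class MyInst<string name> { string Name = name; } // hypothetical record

// The foreach body runs once when the bit is 1 and not at all when it
// is 0, so the def below is only instantiated for "enabled" profiles.
foreach _ = BoolToList<1>.ret in
  def MyInst_dpp : MyInst<"my_inst_dpp">;
```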

