diff options
| author | Connor Abbott <cwabbott0@gmail.com> | 2017-08-04 18:36:49 +0000 | 
|---|---|---|
| committer | Connor Abbott <cwabbott0@gmail.com> | 2017-08-04 18:36:49 +0000 | 
| commit | 8c217d0a295999583de52377afa1aa4ef4a3ebb4 (patch) | |
| tree | 8850d9668c02ecb9b197aee948860d3606f69a59 /llvm/lib/Target/AMDGPU | |
| parent | 3bab91332fcb139d8c15c6d6c5a81a06cde7a7d2 (diff) | |
| download | bcm5719-llvm-8c217d0a295999583de52377afa1aa4ef4a3ebb4.tar.gz bcm5719-llvm-8c217d0a295999583de52377afa1aa4ef4a3ebb4.zip | |
[AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM
Summary:
Previously, we assumed that certain types of instructions needed WQM in
pixel shaders, particularly DS instructions and image sampling
instructions. This was OK because, with OpenGL, that assumption was
correct. But we want to start using DPP instructions for derivatives as
well as other things, so the assumption that we can infer whether to use
WQM based on the instruction won't continue to hold. This intrinsic lets
frontends like Mesa indicate what things need WQM based on their
knowledge of the API, rather than second-guessing them in the backend.
We need to keep around the old method of enabling WQM, but eventually we
should remove it once Mesa catches up. For now, this will let us use DPP
instructions for computing derivatives correctly.
Reviewers: arsenm, tpr, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D35167
llvm-svn: 310085
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 15 | 
5 files changed, 32 insertions, 1 deletion
| diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 32e83cb385f..ee2b415099b 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -338,6 +338,9 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,                                      unsigned &SMovOp,                                      int64_t &Imm) { +  if (Copy->getOpcode() != AMDGPU::COPY) +    return false; +    if (!MoveImm->isMoveImmediate())      return false; @@ -564,7 +567,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {        switch (MI.getOpcode()) {        default:          continue; -      case AMDGPU::COPY: { +      case AMDGPU::COPY: +      case AMDGPU::WQM: {          // If the destination register is a physical register there isn't really          // much we can do to fix this.          if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7abb4636d72..fecad1e1646 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3942,6 +3942,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,                                 Op.getOperand(1), Op.getOperand(2));      return DAG.getNode(ISD::BITCAST, DL, VT, Node);    } +  case Intrinsic::amdgcn_wqm: { +    SDValue Src = Op.getOperand(1); +    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), +                   0); +  }    default:      return Op;    } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 1d884524bcd..dc4b9998786 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2666,6 +2666,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {    case AMDGPU::COPY: return AMDGPU::COPY;    case 
AMDGPU::PHI: return AMDGPU::PHI;    case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; +  case AMDGPU::WQM: return AMDGPU::WQM;    case AMDGPU::S_MOV_B32:      return MI.getOperand(1).isReg() ?             AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; @@ -3970,6 +3971,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(    case AMDGPU::PHI:    case AMDGPU::REG_SEQUENCE:    case AMDGPU::INSERT_SUBREG: +  case AMDGPU::WQM:      if (RI.hasVGPRs(NewDstRC))        return nullptr; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 50e806188a9..f20ce4d4b28 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -116,6 +116,11 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),  // pass to enable folding of inline immediates.  def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),                                        (ins VSrc_b64:$src0)>; + +// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy +// after the WQM pass processes them. 
+def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +  } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]  let usesCustomInserter = 1, SALU = 1 in { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index a613a220e29..62e1b7e84c4 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -136,6 +136,7 @@ private:    DenseMap<const MachineInstr *, InstrInfo> Instructions;    DenseMap<MachineBasicBlock *, BlockInfo> Blocks;    SmallVector<MachineInstr *, 1> LiveMaskQueries; +  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;    void printInfo(); @@ -162,6 +163,7 @@ private:    void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);    void lowerLiveMaskQueries(unsigned LiveMaskReg); +  void lowerCopyInstrs();  public:    static char ID; @@ -294,6 +296,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,          markUsesWQM(MI, Worklist);          GlobalFlags |= StateWQM;          continue; +      } else if (Opcode == AMDGPU::WQM) { +        // The WQM intrinsic requires its output to have all the helper lanes +        // correct, so we need it to be in WQM. 
+        Flags = StateWQM; +        LowerToCopyInstrs.push_back(&MI);        } else if (TII->isDisableWQM(MI)) {          Flags = StateExact;        } else { @@ -666,6 +673,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {    }  } +void SIWholeQuadMode::lowerCopyInstrs() { +  for (MachineInstr *MI : LowerToCopyInstrs) +    MI->setDesc(TII->get(AMDGPU::COPY)); +} +  bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {    if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)      return false; @@ -673,6 +685,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {    Instructions.clear();    Blocks.clear();    LiveMaskQueries.clear(); +  LowerToCopyInstrs.clear();    const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -708,6 +721,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {            .addReg(AMDGPU::EXEC);        lowerLiveMaskQueries(LiveMaskReg); +      lowerCopyInstrs();        // EntryMI may become invalid here        return true;      } @@ -716,6 +730,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {    DEBUG(printInfo());    lowerLiveMaskQueries(LiveMaskReg); +  lowerCopyInstrs();    // Handle the general case    for (auto BII : Blocks) | 

