diff options
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 166 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 91 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 3 |
7 files changed, 176 insertions, 111 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 94b1e636c7b..8ad7a52c92b 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -302,52 +302,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, return true; } -static bool phiHasVGPROperands(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *TRI, - const SIInstrInfo *TII) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) - return true; - } - return false; -} - -static bool phiHasBreakDef(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - SmallSet<unsigned, 8> &Visited) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - unsigned Reg = PHI.getOperand(i).getReg(); - if (Visited.count(Reg)) - continue; - - Visited.insert(Reg); - - MachineInstr *DefInstr = MRI.getVRegDef(Reg); - switch (DefInstr->getOpcode()) { - default: - break; - case AMDGPU::SI_IF_BREAK: - return true; - case AMDGPU::PHI: - if (phiHasBreakDef(*DefInstr, MRI, Visited)) - return true; - } - } - return false; -} - -static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, - const TargetRegisterInfo &TRI) { - for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), - E = MBB.end(); I != E; ++I) { - if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) - return true; - } - return false; -} - static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, const MachineInstr *MoveImm, const SIInstrInfo *TII, @@ -409,12 +363,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { - return hasTerminatorThatModifiesExec(*MBB, *TRI); }); -} - // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -621,63 +569,73 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { break; } case AMDGPU::PHI: { - unsigned Reg = MI.getOperand(0).getReg(); - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. - bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { - LLVM_DEBUG(dbgs() - << "Not fixing PHI for uniform branch: " << MI << '\n'); + unsigned hasVGPRUses = 0; + SetVector<const MachineInstr *> worklist; + worklist.insert(&MI); + while (!worklist.empty()) { + const MachineInstr *Instr = worklist.pop_back_val(); + unsigned Reg = Instr->getOperand(0).getReg(); + for (const auto &Use : MRI.use_operands(Reg)) { + const MachineInstr *UseMI = Use.getParent(); + if (UseMI->isCopy() || UseMI->isRegSequence()) { + if (UseMI->isCopy() && + TRI->isPhysicalRegister(UseMI->getOperand(0).getReg()) && + !TRI->isSGPRReg(MRI, UseMI->getOperand(0).getReg())) { + hasVGPRUses++; + } + worklist.insert(UseMI); + continue; + } + + if (UseMI->isPHI()) { + if (!TRI->isSGPRReg(MRI, Use.getReg())) + hasVGPRUses++; + continue; + } + + unsigned OpNo = UseMI->getOperandNo(&Use); + const MCInstrDesc &Desc = TII->get(UseMI->getOpcode()); + if (Desc.OpInfo && Desc.OpInfo[OpNo].RegClass != -1) { + const TargetRegisterClass *OpRC = + TRI->getRegClass(Desc.OpInfo[OpNo].RegClass); + if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass && + OpRC != &AMDGPU::VS_64RegClass) { + hasVGPRUses++; + } + } + } + } + bool hasVGPRInput = false; + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + unsigned InputReg = MI.getOperand(i).getReg(); + MachineInstr *Def = MRI.getVRegDef(InputReg); + if (TRI->isVGPR(MRI, InputReg)) { + if (Def->isCopy()) { + unsigned SrcReg = Def->getOperand(1).getReg(); + const TargetRegisterClass *RC = + TRI->isVirtualRegister(SrcReg) ? MRI.getRegClass(SrcReg) + : TRI->getPhysRegClass(SrcReg); + if (TRI->isSGPRClass(RC)) + continue; + } + hasVGPRInput = true; + break; + } else if (Def->isCopy() && + TRI->isVGPR(MRI, Def->getOperand(1).getReg())) { + hasVGPRInput = true; break; } } + unsigned PHIRes = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC0 = MRI.getRegClass(PHIRes); - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. - // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - SmallSet<unsigned, 8> Visited; - if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI, MDT); + if ((!TRI->isVGPR(MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) && + (hasVGPRInput || hasVGPRUses > 1)) { + TII->moveToVALU(MI); + } else { + TII->legalizeOperands(MI, MDT); } + break; } case AMDGPU::REG_SEQUENCE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2cda5ef4d7..8f93c63046c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9637,7 +9637,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, break; MVT VT = Src0.getValueType().getSimpleVT(); - const TargetRegisterClass *RC = getRegClassFor(VT); + const TargetRegisterClass *RC = + getRegClassFor(VT, Src0.getNode()->isDivergent()); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); @@ -10171,3 +10172,91 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); } + +const TargetRegisterClass * +SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) + return &AMDGPU::SReg_64RegClass; + if (!TRI->isSGPRClass(RC) && !isDivergent) + return TRI->getEquivalentSGPRClass(RC); + else if (TRI->isSGPRClass(RC) && isDivergent) + return TRI->getEquivalentVGPRClass(RC); + + return RC; +} + +static bool hasIfBreakUser(const Value *V, SetVector<const Value *> &Visited) { + if (Visited.count(V)) + return false; + Visited.insert(V); + bool Result = false; + for (auto U : V->users()) { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) { + if ((Intrinsic->getIntrinsicID() == Intrinsic::amdgcn_if_break) && + (V == U->getOperand(1))) + Result = true; + } else { + Result = hasIfBreakUser(U, Visited); + } + if (Result) + break; + } + return Result; +} + +bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, + const Value *V) const { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if_break: + return true; + } + } + if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) { + if (const IntrinsicInst *Intrinsic = + dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: { + ArrayRef<unsigned> Indices = ExtValue->getIndices(); + if (Indices.size() == 1 && Indices[0] == 1) { + return true; + } + } + } + } + } + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (isa<InlineAsm>(CI->getCalledValue())) { + const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); + ImmutableCallSite CS(CI); + TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( + MF.getDataLayout(), Subtarget->getRegisterInfo(), CS); + for (auto &TC : TargetConstraints) { + if (TC.Type == InlineAsm::isOutput) { + ComputeConstraintToUse(TC, SDValue()); + unsigned AssignedReg; + const TargetRegisterClass *RC; + std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint( + SIRI, TC.ConstraintCode, + getSimpleValueType(MF.getDataLayout(), CS.getType())); + if (RC) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg)) + return true; + else if (SIRI->isSGPRClass(RC)) + return true; + } + } + } + } + } + SetVector<const Value *> Visited; + return hasIfBreakUser(V, Visited); +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 60a474f51e5..094a0b054e2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -367,7 +367,10 @@ public: bool SNaN = false, unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - + virtual const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent) const override; + virtual bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; unsigned getPrefLoopAlignment(MachineLoop *ML) const override; }; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e42ed3505cf..14f5dbe6ad4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2219,6 +2219,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // These come before src2. removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); + // It might happen that UseMI was commuted + // and we now have SGPR as SRC1. If so 2 inlined + // constant and SGPR are illegal. + legalizeOperands(UseMI); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -3913,7 +3917,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, return; // Try to eliminate the copy if it is copying an immediate value. - if (Def->isMoveImmediate()) + if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) FoldImmediate(*Copy, *Def, OpReg, &MRI); } @@ -4147,7 +4151,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); + if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { + VRC = &AMDGPU::VReg_1RegClass; + } else + VRC = RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { @@ -5309,7 +5316,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: case AMDGPU::WWM: - if (RI.hasVGPRs(NewDstRC)) + if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index bfdc1ef9645..e2df3ae5ea7 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -195,6 +195,11 @@ public: unsigned Reg) const; bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + virtual bool + isDivergentRegClass(const TargetRegisterClass *RC) const override { + return !isSGPRClass(RC); + } + bool isSGPRPressureSet(unsigned SetID) const { return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 677e4d5b2e8..88d318e7bb3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1447,7 +1447,9 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, /// getRegClassFor - Return the register class that should be used for the /// specified value type. -const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { +const TargetRegisterClass * +ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + (void)isDivergent; // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive D registers. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 3b94cb0dcb0..8e254d75b1c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -456,7 +456,8 @@ class VectorType; /// getRegClassFor - Return the register class that should be used for the /// specified value type. - const TargetRegisterClass *getRegClassFor(MVT VT) const override; + const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent = false) const override; /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { |