Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 14
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 33
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h | 2
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 4
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 166
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 91
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 5
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 4
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.h | 3
13 files changed, 213 insertions, 135 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 117654bc7a3..d53ee3134d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13919,9 +13919,11 @@ struct LoadedSlice {
     assert(DAG && "Missing context");
     const TargetLowering &TLI = DAG->getTargetLoweringInfo();
     EVT ResVT = Use->getValueType(0);
-    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+    const TargetRegisterClass *ResRC =
+        TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
     const TargetRegisterClass *ArgRC =
-        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
+                           Use->getOperand(0)->isDivergent());
     if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
       return false;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d8ef10f58aa..8b405562904 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,6 +85,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
   RegInfo = &MF->getRegInfo();
   const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
   unsigned StackAlign = TFI->getStackAlignment();
+  DA = DAG->getDivergenceAnalysis();
 
   // Check whether the function can return without sret-demotion.
   SmallVector<ISD::OutputArg, 4> Outs;
@@ -345,9 +346,9 @@ void FunctionLoweringInfo::clear() {
 }
 
 /// CreateReg - Allocate a single virtual register for the given type.
-unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
+unsigned FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
   return RegInfo->createVirtualRegister(
-      MF->getSubtarget().getTargetLowering()->getRegClassFor(VT));
+      MF->getSubtarget().getTargetLowering()->getRegClassFor(VT, isDivergent));
 }
 
 /// CreateRegs - Allocate the appropriate number of virtual registers of
@@ -357,7 +358,7 @@ unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
 /// In the case that the given value has struct or array type, this function
 /// will assign registers for each member or element.
 ///
-unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
+unsigned FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
   const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
 
   SmallVector<EVT, 4> ValueVTs;
@@ -370,13 +371,18 @@ unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
 
     unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
     for (unsigned i = 0; i != NumRegs; ++i) {
-      unsigned R = CreateReg(RegisterVT);
+      unsigned R = CreateReg(RegisterVT, isDivergent);
       if (!FirstReg) FirstReg = R;
     }
   }
   return FirstReg;
 }
 
+unsigned FunctionLoweringInfo::CreateRegs(const Value *V) {
+  return CreateRegs(V->getType(), DA && !TLI->requiresUniformRegister(*MF, V) &&
+                                      DA->isDivergent(V));
+}
+
 /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
 /// register is a PHI destination and the PHI's LiveOutInfo is not valid. If
 /// the register's LiveOutInfo is for a smaller bit width, it is extended to
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 059e5f7c8dd..4b78d1bb6b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -105,7 +105,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
 
   // Stick to the preferred register classes for legal types.
   if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT);
+    UseRC = TLI->getRegClassFor(VT, Node->isDivergent());
 
   if (!IsClone && !IsCloned)
     for (SDNode *User : Node->uses()) {
@@ -164,7 +164,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
              "Incompatible phys register def and uses!");
       DstRC = UseRC;
     } else {
-      DstRC = TLI->getRegClassFor(VT);
+      DstRC = TLI->getRegClassFor(VT, Node->isDivergent());
     }
 
   // If all uses are reading from the src physical register and copying the
@@ -225,8 +225,9 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
     // type correctly. For example, a 64-bit float (X86::FR64) can't live in
     // the 32-bit float super-class (X86::FR32).
     if (i < NumResults && TLI->isTypeLegal(Node->getSimpleValueType(i))) {
-      const TargetRegisterClass *VTRC =
-          TLI->getRegClassFor(Node->getSimpleValueType(i));
+      const TargetRegisterClass *VTRC = TLI->getRegClassFor(
+          Node->getSimpleValueType(i),
+          (Node->isDivergent() || (RC && TRI->isDivergentRegClass(RC))));
       if (RC)
         VTRC = TRI->getCommonSubClass(RC, VTRC);
       if (VTRC)
@@ -289,8 +290,8 @@ unsigned InstrEmitter::getVR(SDValue Op,
     // IMPLICIT_DEF can produce any type of result so its MCInstrDesc
     // does not include operand register class info.
     if (!VReg) {
-      const TargetRegisterClass *RC =
-          TLI->getRegClassFor(Op.getSimpleValueType());
+      const TargetRegisterClass *RC = TLI->getRegClassFor(
+          Op.getSimpleValueType(), Op.getNode()->isDivergent());
       VReg = MRI->createVirtualRegister(RC);
     }
     BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
@@ -395,11 +396,15 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
   } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
     unsigned VReg = R->getReg();
     MVT OpVT = Op.getSimpleValueType();
-    const TargetRegisterClass *OpRC =
-        TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT) : nullptr;
     const TargetRegisterClass *IIRC =
         II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
            : nullptr;
+    const TargetRegisterClass *OpRC =
+        TLI->isTypeLegal(OpVT)
+            ? TLI->getRegClassFor(OpVT,
+                                  Op.getNode()->isDivergent() ||
+                                      (IIRC && TRI->isDivergentRegClass(IIRC)))
+            : nullptr;
 
     if (OpRC && IIRC && OpRC != IIRC &&
         TargetRegisterInfo::isVirtualRegister(VReg)) {
@@ -464,7 +469,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
 }
 
 unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
-                                          MVT VT, const DebugLoc &DL) {
+                                          MVT VT, bool isDivergent, const DebugLoc &DL) {
   const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
   const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
 
@@ -479,7 +484,7 @@ unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
 
   // VReg couldn't be reasonably constrained. Emit a COPY to a new virtual
   // register instead.
-  RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT), SubIdx);
+  RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx);
   assert(RC && "No legal register class for VT supports that SubIdx");
   unsigned NewReg = MRI->createVirtualRegister(RC);
   BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
@@ -514,7 +519,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
     // classes.
     unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
     const TargetRegisterClass *TRC =
-      TLI->getRegClassFor(Node->getSimpleValueType(0));
+        TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
 
     unsigned Reg;
     MachineInstr *DefMI;
@@ -548,8 +553,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
     if (TargetRegisterInfo::isVirtualRegister(Reg))
       Reg = ConstrainForSubReg(Reg, SubIdx,
                                Node->getOperand(0).getSimpleValueType(),
-                               Node->getDebugLoc());
-
+                               Node->isDivergent(), Node->getDebugLoc());
     // Create the destreg if it is missing.
     if (VRBase == 0)
       VRBase = MRI->createVirtualRegister(TRC);
@@ -584,7 +588,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
     //
     // There is no constraint on the %src register class.
     //
-    const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0));
+    const TargetRegisterClass *SRC =
+        TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
     SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
     assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 3188c2678f1..42f7846fe7c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -83,7 +83,7 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
   /// supports SubIdx sub-registers. Emit a copy if that isn't possible.
   /// Return the virtual register to use.
   unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, MVT VT,
-                              const DebugLoc &DL);
+                              bool isDivergent, const DebugLoc &DL);
 
   /// EmitSubregNode - Generate machine code for subreg nodes.
   ///
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 76e5847ba11..a5274877ece 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -9844,7 +9844,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
         if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
           unsigned &RegOut = ConstantsOut[C];
           if (RegOut == 0) {
-            RegOut = FuncInfo.CreateRegs(C->getType());
+            RegOut = FuncInfo.CreateRegs(C);
             CopyValueToVirtualRegister(C, RegOut);
           }
           Reg = RegOut;
@@ -9857,7 +9857,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
             assert(isa<AllocaInst>(PHIOp) &&
                    FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
                    "Didn't codegen value into a register!??");
-            Reg = FuncInfo.CreateRegs(PHIOp->getType());
+            Reg = FuncInfo.CreateRegs(PHIOp);
             CopyValueToVirtualRegister(PHIOp, Reg);
           }
         }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 6c9a1cd646e..6f55f98c51f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1485,7 +1485,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
             !Inst->use_empty()) {
           unsigned &R = FuncInfo->ValueMap[Inst];
           if (!R)
-            R = FuncInfo->CreateRegs(Inst->getType());
+            R = FuncInfo->CreateRegs(Inst);
         }
 
       bool HadTailCall = false;
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 94b1e636c7b..8ad7a52c92b 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -302,52 +302,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
   return true;
 }
 
-static bool phiHasVGPROperands(const MachineInstr &PHI,
-                               const MachineRegisterInfo &MRI,
-                               const SIRegisterInfo *TRI,
-                               const SIInstrInfo *TII) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    unsigned Reg = PHI.getOperand(i).getReg();
-    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
-      return true;
-  }
-  return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
-                           const MachineRegisterInfo &MRI,
-                           SmallSet<unsigned, 8> &Visited) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    unsigned Reg = PHI.getOperand(i).getReg();
-    if (Visited.count(Reg))
-      continue;
-
-    Visited.insert(Reg);
-
-    MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-    switch (DefInstr->getOpcode()) {
-    default:
-      break;
-    case AMDGPU::SI_IF_BREAK:
-      return true;
-    case AMDGPU::PHI:
-      if (phiHasBreakDef(*DefInstr, MRI, Visited))
-        return true;
-    }
-  }
-  return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
-                                          const TargetRegisterInfo &TRI) {
-  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
-       E = MBB.end(); I != E; ++I) {
-    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
-      return true;
-  }
-  return false;
-}
-
 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                     const MachineInstr *MoveImm,
                                     const SIInstrInfo *TII,
@@ -409,12 +363,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
   return false;
 }
 
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
-                                        const TargetRegisterInfo *TRI) {
-  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
-    return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
 // Checks if there is potential path From instruction To instruction.
 // If CutOff is specified and it sits in between of that path we ignore
 // a higher portion of the path and report it is not reachable.
@@ -621,63 +569,73 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
       break;
     }
     case AMDGPU::PHI: {
-      unsigned Reg = MI.getOperand(0).getReg();
-      if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
-        break;
-
-      // We don't need to fix the PHI if the common dominator of the
-      // two incoming blocks terminates with a uniform branch.
-      bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
-      if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
-        MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
-        MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
-        if (!predsHasDivergentTerminator(MBB0, TRI) &&
-            !predsHasDivergentTerminator(MBB1, TRI)) {
-          LLVM_DEBUG(dbgs()
-                     << "Not fixing PHI for uniform branch: " << MI << '\n');
+      unsigned hasVGPRUses = 0;
+      SetVector<const MachineInstr *> worklist;
+      worklist.insert(&MI);
+      while (!worklist.empty()) {
+        const MachineInstr *Instr = worklist.pop_back_val();
+        unsigned Reg = Instr->getOperand(0).getReg();
+        for (const auto &Use : MRI.use_operands(Reg)) {
+          const MachineInstr *UseMI = Use.getParent();
+          if (UseMI->isCopy() || UseMI->isRegSequence()) {
+            if (UseMI->isCopy() &&
+                TRI->isPhysicalRegister(UseMI->getOperand(0).getReg()) &&
+                !TRI->isSGPRReg(MRI, UseMI->getOperand(0).getReg())) {
+              hasVGPRUses++;
+            }
+            worklist.insert(UseMI);
+            continue;
+          }
+
+          if (UseMI->isPHI()) {
+            if (!TRI->isSGPRReg(MRI, Use.getReg()))
+              hasVGPRUses++;
+            continue;
+          }
+
+          unsigned OpNo = UseMI->getOperandNo(&Use);
+          const MCInstrDesc &Desc = TII->get(UseMI->getOpcode());
+          if (Desc.OpInfo && Desc.OpInfo[OpNo].RegClass != -1) {
+            const TargetRegisterClass *OpRC =
+                TRI->getRegClass(Desc.OpInfo[OpNo].RegClass);
+            if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+                OpRC != &AMDGPU::VS_64RegClass) {
+              hasVGPRUses++;
+            }
+          }
+        }
+      }
+      bool hasVGPRInput = false;
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        unsigned InputReg = MI.getOperand(i).getReg();
+        MachineInstr *Def = MRI.getVRegDef(InputReg);
+        if (TRI->isVGPR(MRI, InputReg)) {
+          if (Def->isCopy()) {
+            unsigned SrcReg = Def->getOperand(1).getReg();
+            const TargetRegisterClass *RC =
+                TRI->isVirtualRegister(SrcReg) ? MRI.getRegClass(SrcReg)
+                                               : TRI->getPhysRegClass(SrcReg);
+            if (TRI->isSGPRClass(RC))
+              continue;
+          }
+          hasVGPRInput = true;
+          break;
+        } else if (Def->isCopy() &&
+                   TRI->isVGPR(MRI, Def->getOperand(1).getReg())) {
+          hasVGPRInput = true;
           break;
         }
       }
+      unsigned PHIRes = MI.getOperand(0).getReg();
+      const TargetRegisterClass *RC0 = MRI.getRegClass(PHIRes);
 
-      // If a PHI node defines an SGPR and any of its operands are VGPRs,
-      // then we need to move it to the VALU.
-      //
-      // Also, if a PHI node defines an SGPR and has all SGPR operands
-      // we must move it to the VALU, because the SGPR operands will
-      // all end up being assigned the same register, which means
-      // there is a potential for a conflict if different threads take
-      // different control flow paths.
-      //
-      // For Example:
-      //
-      // sgpr0 = def;
-      // ...
-      // sgpr1 = def;
-      // ...
-      // sgpr2 = PHI sgpr0, sgpr1
-      // use sgpr2;
-      //
-      // Will Become:
-      //
-      // sgpr2 = def;
-      // ...
-      // sgpr2 = def;
-      // ...
-      // use sgpr2
-      //
-      // The one exception to this rule is when one of the operands
-      // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
-      // instruction. In this case, there we know the program will
-      // never enter the second block (the loop) without entering
-      // the first block (where the condition is computed), so there
-      // is no chance for values to be over-written.
-
-      SmallSet<unsigned, 8> Visited;
-      if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
-        LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-        TII->moveToVALU(MI, MDT);
+      if ((!TRI->isVGPR(MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+          (hasVGPRInput || hasVGPRUses > 1)) {
+        TII->moveToVALU(MI);
+      } else {
+        TII->legalizeOperands(MI, MDT);
       }
+
       break;
     }
     case AMDGPU::REG_SEQUENCE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c2cda5ef4d7..8f93c63046c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9637,7 +9637,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
       break;
 
     MVT VT = Src0.getValueType().getSimpleVT();
-    const TargetRegisterClass *RC = getRegClassFor(VT);
+    const TargetRegisterClass *RC =
+        getRegClassFor(VT, Src0.getNode()->isDivergent());
 
     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
@@ -10171,3 +10172,91 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+    return &AMDGPU::SReg_64RegClass;
+  if (!TRI->isSGPRClass(RC) && !isDivergent)
+    return TRI->getEquivalentSGPRClass(RC);
+  else if (TRI->isSGPRClass(RC) && isDivergent)
+    return TRI->getEquivalentVGPRClass(RC);
+
+  return RC;
+}
+
+static bool hasIfBreakUser(const Value *V, SetVector<const Value *> &Visited) {
+  if (Visited.count(V))
+    return false;
+  Visited.insert(V);
+  bool Result = false;
+  for (auto U : V->users()) {
+    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+      if ((Intrinsic->getIntrinsicID() == Intrinsic::amdgcn_if_break) &&
+          (V == U->getOperand(1)))
+        Result = true;
+    } else {
+      Result = hasIfBreakUser(U, Visited);
+    }
+    if (Result)
+      break;
+  }
+  return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+                                               const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if_break:
+      return true;
+    }
+  }
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1) {
+          return true;
+        }
+      }
+      }
+    }
+  }
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue())) {
+      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+      ImmutableCallSite CS(CI);
+      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+      for (auto &TC : TargetConstraints) {
+        if (TC.Type == InlineAsm::isOutput) {
+          ComputeConstraintToUse(TC, SDValue());
+          unsigned AssignedReg;
+          const TargetRegisterClass *RC;
+          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+              SIRI, TC.ConstraintCode,
+              getSimpleValueType(MF.getDataLayout(), CS.getType()));
+          if (RC) {
+            MachineRegisterInfo &MRI = MF.getRegInfo();
+            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+              return true;
+            else if (SIRI->isSGPRClass(RC))
+              return true;
+          }
+        }
+      }
+    }
+  }
+  SetVector<const Value *> Visited;
+  return hasIfBreakUser(V, Visited);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 60a474f51e5..094a0b054e2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -367,7 +367,10 @@ public:
                              bool SNaN = false,
                              unsigned Depth = 0) const override;
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
-
+  virtual const TargetRegisterClass *
+  getRegClassFor(MVT VT, bool isDivergent) const override;
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *V) const override;
   unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
 };
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e42ed3505cf..14f5dbe6ad4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2219,6 +2219,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       // These come before src2.
       removeModOperands(UseMI);
       UseMI.setDesc(get(NewOpc));
+      // It might happen that UseMI was commuted
+      // and we now have SGPR as SRC1. If so 2 inlined
+      // constant and SGPR are illegal.
+      legalizeOperands(UseMI);
 
       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
@@ -3913,7 +3917,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
     return;
 
   // Try to eliminate the copy if it is copying an immediate value.
-  if (Def->isMoveImmediate())
+  if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
     FoldImmediate(*Copy, *Def, OpReg, &MRI);
 }
 
@@ -4147,7 +4151,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
       if (!VRC) {
         assert(SRC);
-        VRC = RI.getEquivalentVGPRClass(SRC);
+        if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
+          VRC = &AMDGPU::VReg_1RegClass;
+        } else
+          VRC = RI.getEquivalentVGPRClass(SRC);
       }
       RC = VRC;
     } else {
@@ -5309,7 +5316,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
   case AMDGPU::WWM:
-    if (RI.hasVGPRs(NewDstRC))
+    if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
       return nullptr;
 
     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index bfdc1ef9645..e2df3ae5ea7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -195,6 +195,11 @@ public:
               unsigned Reg) const;
   bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
 
+  virtual bool
+  isDivergentRegClass(const TargetRegisterClass *RC) const override {
+    return !isSGPRClass(RC);
+  }
+
   bool isSGPRPressureSet(unsigned SetID) const {
     return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
   }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 677e4d5b2e8..88d318e7bb3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1447,7 +1447,9 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
 
 /// getRegClassFor - Return the register class that should be used for the
 /// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+const TargetRegisterClass *
+ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  (void)isDivergent;
   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
   // load / store 4 to 8 consecutive D registers.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 3b94cb0dcb0..8e254d75b1c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -456,7 +456,8 @@ class VectorType;
 
     /// getRegClassFor - Return the register class that should be used for the
     /// specified value type.
-    const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+    const TargetRegisterClass *
+    getRegClassFor(MVT VT, bool isDivergent = false) const override;
 
     /// Returns true if a cast between SrcAS and DestAS is a noop.
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {