2 files changed, 139 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6be94ba5cbf..1b69cbdfcad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3631,7 +3631,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     // 3rd parameter required to be a constant.
     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
     if (!Param)
-      return DAG.getUNDEF(VT);
+      return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
 
     // Translate to the operands expected by the machine instruction. The
     // first parameter must be the same as the first instruction.
@@ -6005,6 +6005,60 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
     legalizeTargetIndependentNode(Node, DAG);
     return Node;
   }
+
+  switch (Opcode) {
+  case AMDGPU::V_DIV_SCALE_F32:
+  case AMDGPU::V_DIV_SCALE_F64: {
+    // The hardware requires src0 to be the same register as src1 or src2.
+    // Every use of an undef value normally gets its own IMPLICIT_DEF vreg, so
+    // when src0 is undefined the operands have to be rewritten to share one.
+    auto IsUndef = [](SDValue V) {
+      return V.isMachineOpcode() &&
+             V.getMachineOpcode() == AMDGPU::IMPLICIT_DEF;
+    };
+
+    SDValue Src0 = Node->getOperand(0);
+    SDValue Src1 = Node->getOperand(1);
+    SDValue Src2 = Node->getOperand(2);
+
+    // A defined src0 needs no rewriting. Bail out before creating the
+    // virtual register and copy below so they are never left unused.
+    if (!IsUndef(Src0))
+      break;
+
+    MVT VT = Src0.getValueType().getSimpleVT();
+    const TargetRegisterClass *RC = getRegClassFor(VT);
+
+    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
+
+    // Copy undef into the shared vreg; result 1 of the copy is appended below.
+    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
+                                      UndefReg, Src0, SDValue());
+
+    // Prefer reusing a defined source for src0. A non-machine node is defined
+    // too: never call getMachineOpcode() on it or replace it with UndefReg.
+    if (!IsUndef(Src1))
+      Src0 = Src1;
+    else if (!IsUndef(Src2))
+      Src0 = Src2;
+    else {
+      // All three sources are undefined: src0 and src1 share UndefReg.
+      Src0 = UndefReg;
+      Src1 = UndefReg;
+    }
+
+    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
+    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
+      Ops.push_back(Node->getOperand(I));
+
+    Ops.push_back(ImpDef.getValue(1));
+    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+  }
+  default:
+    break;
+  }
+
   return Node;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 0b4f09ac651..70b01d6977f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone
-declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone
-declare float @llvm.fabs.f32(float) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
+declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
+declare float @llvm.fabs.f32(float) #1
 
 ; SI-LABEL: {{^}}test_div_scale_f32_1:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
@@ -364,3 +364,82 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
   store float %result0, float addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val:
+; SI-NOT: v0
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; A non-constant (undef) selector lowers the whole intrinsic to undef.
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_undef_undef(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_undef(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_val_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_val_undef(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  ; Both value operands are loaded (volatile); only the selector is undef.
+  %a = load volatile float, float addrspace(1)* %gep.0, align 4
+  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000
+; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, v[0:1], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
+  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }