AMDGPU: Fix handling of div_scale with undef inputs

The src0 register must match src1 or src2, but if these were undefined they could end up using different implicit_defed virtual registers. Force these to use one undef vreg or pick the defined other register. Also fixes producing invalid nodes without the right number of inputs when src2 is undef. llvm-svn: 309743
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-01 20:49:41 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-01 20:49:41 +0000
commit: 206f8263489543a65361230e1e7ab7ea2d9c3ac8 (patch)
tree: 99855c0e2afd7be967ffb69e26b94876082ac820 /llvm
parent: 6886061dab1a53e361bddc1d480c2fdb7980d37d (diff)
download: bcm5719-llvm-206f8263489543a65361230e1e7ab7ea2d9c3ac8.tar.gz
bcm5719-llvm-206f8263489543a65361230e1e7ab7ea2d9c3ac8.zip
2 files changed, 139 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6be94ba5cbf..1b69cbdfcad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3631,7 +3631,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     // 3rd parameter required to be a constant.
     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
     if (!Param)
-      return DAG.getUNDEF(VT);
+      return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
 
     // Translate to the operands expected by the machine instruction. The
     // first parameter must be the same as the first instruction.
@@ -6005,6 +6005,60 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
     legalizeTargetIndependentNode(Node, DAG);
     return Node;
   }
+
+  switch (Opcode) {
+  case AMDGPU::V_DIV_SCALE_F32:
+  case AMDGPU::V_DIV_SCALE_F64: {
+    // Satisfy the operand register constraint when one of the inputs is
+    // undefined. Ordinarily each undef value will have its own implicit_def of
+    // a vreg, so force these to use a single register.
+    SDValue Src0 = Node->getOperand(0);
+    SDValue Src1 = Node->getOperand(1);
+    SDValue Src2 = Node->getOperand(2);
+
+    if ((Src0.isMachineOpcode() &&
+         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
+        (Src0 == Src1 || Src0 == Src2))
+      break;
+
+    MVT VT = Src0.getValueType().getSimpleVT();
+    const TargetRegisterClass *RC = getRegClassFor(VT);
+
+    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
+
+    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
+                                      UndefReg, Src0, SDValue());
+
+    // src0 must be the same register as src1 or src2, even if the value is
+    // undefined, so make sure we don't violate this constraint.
+    if (Src0.isMachineOpcode() &&
+        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
+      if (Src1.isMachineOpcode() &&
+          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+        Src0 = Src1;
+      else if (Src2.isMachineOpcode() &&
+               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+        Src0 = Src2;
+      else {
+        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
+        Src0 = UndefReg;
+        Src1 = UndefReg;
+      }
+    } else
+      break;
+
+    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
+    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
+      Ops.push_back(Node->getOperand(I));
+
+    Ops.push_back(ImpDef.getValue(1));
+    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+  }
+  default:
+    break;
+  }
+
   return Node;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 0b4f09ac651..70b01d6977f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone
-declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone
-declare float @llvm.fabs.f32(float) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
+declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
+declare float @llvm.fabs.f32(float) #1
 
 ; SI-LABEL: {{^}}test_div_scale_f32_1:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
@@ -364,3 +364,82 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
   store float %result0, float addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val:
+; SI-NOT: v0
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Undefined selector gets deleted
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_undef_undef(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_undef(float addrspace(1)* %out) #0 {
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_val_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_val_undef(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+  %a = load volatile float, float addrspace(1)* %gep.0, align 4
+  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+
+  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 undef)
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000
+; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, v[0:1], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
+  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2017-08-01 20:49:41 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2017-08-01 20:49:41 +0000
commit	206f8263489543a65361230e1e7ab7ea2d9c3ac8 (patch)
tree	99855c0e2afd7be967ffb69e26b94876082ac820 /llvm
parent	6886061dab1a53e361bddc1d480c2fdb7980d37d (diff)
download	bcm5719-llvm-206f8263489543a65361230e1e7ab7ea2d9c3ac8.tar.gz bcm5719-llvm-206f8263489543a65361230e1e7ab7ea2d9c3ac8.zip