summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll  89
2 files changed, 139 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6be94ba5cbf..1b69cbdfcad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3631,7 +3631,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// 3rd parameter required to be a constant.
const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
if (!Param)
- return DAG.getUNDEF(VT);
+ return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
// Translate to the operands expected by the machine instruction. The
// first parameter must be the same as the first instruction.
@@ -6005,6 +6005,60 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
+
+ switch (Opcode) {
+ case AMDGPU::V_DIV_SCALE_F32:
+ case AMDGPU::V_DIV_SCALE_F64: {
+ // Satisfy the operand register constraint when one of the inputs is
+ // undefined. Ordinarily each undef value will have its own implicit_def of
+ // a vreg, so force these to use a single register.
+ SDValue Src0 = Node->getOperand(0);
+ SDValue Src1 = Node->getOperand(1);
+ SDValue Src2 = Node->getOperand(2);
+
+ if ((Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
+ (Src0 == Src1 || Src0 == Src2))
+ break;
+
+ MVT VT = Src0.getValueType().getSimpleVT();
+ const TargetRegisterClass *RC = getRegClassFor(VT);
+
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
+
+ SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
+ UndefReg, Src0, SDValue());
+
+ // src0 must be the same register as src1 or src2, even if the value is
+ // undefined, so make sure we don't violate this constraint.
+ if (Src0.isMachineOpcode() &&
+ Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
+ if (Src1.isMachineOpcode() &&
+ Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src1;
+ else if (Src2.isMachineOpcode() &&
+ Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
+ Src0 = Src2;
+ else {
+ assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
+ Src0 = UndefReg;
+ Src1 = UndefReg;
+ }
+ } else
+ break;
+
+ SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
+ for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
+ Ops.push_back(Node->getOperand(I));
+
+ Ops.push_back(ImpDef.getValue(1));
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ default:
+ break;
+ }
+
return Node;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 0b4f09ac651..70b01d6977f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) nounwind readnone
-declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) nounwind readnone
-declare float @llvm.fabs.f32(float) nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
+declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
+declare float @llvm.fabs.f32(float) #1
; SI-LABEL: {{^}}test_div_scale_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
@@ -364,3 +364,82 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
store float %result0, float addrspace(1)* %out, align 4
ret void
}
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val:
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x41000000
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], [[K]]
+define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val:
+; SI-NOT: v0
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; Undefined selector gets deleted
+; SI-LABEL: {{^}}test_div_scale_f32_val_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_undef_undef(float addrspace(1)* %out) #0 {
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 undef)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_undef_undef_undef(float addrspace(1)* %out) #0 {
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 undef)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_val_val_undef:
+; SI-NOT: v_div_scale
+define amdgpu_kernel void @test_div_scale_f32_val_val_undef(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load volatile float, float addrspace(1)* %gep.0, align 4
+ %b = load volatile float, float addrspace(1)* %gep.1, align 4
+
+ %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 undef)
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val:
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000
+; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, v[0:1], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
+ %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
OpenPOWER on IntegriCloud