summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td5
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp79
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td2
6 files changed, 88 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index acdedab7e13..485927c8e44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3993,6 +3993,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 1e027dd6712..30967d30fdf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -363,6 +363,7 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FDOT2,
URECIP,
DIV_SCALE,
DIV_FMAS,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index f7ce519b291..96b7568eec1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -341,6 +341,11 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", // dot product of two vectors plus an accumulator
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, // 1 result, 3 operands; result type == operand 3, operand 1 type == operand 2
+ SDTCisFP<0>, SDTCisVec<1>]>, // result is floating point; operand 1 is a vector
+ []>;
+
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5caf03e909b..db1f2b3a3c3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -623,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
@@ -4945,6 +4946,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -7476,6 +7480,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ // Try to fold a pair of nested f32 FMAs over v2f16 lanes into one FDOT2 node.
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+ // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+ // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
+ // regardless of the denorm mode setting. Therefore,
+ // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same: the two FMAs must read different lanes.
+ Idx1 == Idx2)
+ return SDValue();
+ // Reject a multiply whose two factors are extracted from the same vector.
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+ // Both FMAs must multiply the same two vectors, in either operand order.
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -7660,6 +7737,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 3e4ff84ab47..ad049f2a71c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -136,6 +136,7 @@ private:
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 8e2eff13d6d..5c78ada3211 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -167,7 +167,7 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
let SubtargetPredicate = HasDLInsts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
OpenPOWER on IntegriCloud