diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-10-22 16:27:27 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-10-22 16:27:27 +0000 |
commit | 687ec75d10bd860edb194d88d5438dcb1bc6eb92 (patch) | |
tree | 89cfd5a61dee8ea2ffaadea0623d22eb28a86278 /llvm/lib | |
parent | b96181c2bf1d068824c6fd635c0921d0ffd20187 (diff) | |
download | bcm5719-llvm-687ec75d10bd860edb194d88d5438dcb1bc6eb92.tar.gz bcm5719-llvm-687ec75d10bd860edb194d88d5438dcb1bc6eb92.zip |
DAG: Change behavior of fminnum/fmaxnum nodes
Introduce new versions that follow the IEEE semantics
to help with legalization that may need quieted inputs.
There are some regressions from inserting unnecessary
canonicalizes when these are matched from fast math
fcmp + select which should be fixed in a future commit.
llvm-svn: 344914
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 12 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 3 | ||||
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 29 | ||||
-rw-r--r-- | llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 28 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 166 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 13 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP2Instructions.td | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 |
17 files changed, 280 insertions, 50 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 381efb9cb94..f560f0e1a9c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7097,6 +7097,13 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: { + // Since it's known never nan to get here already, either fminnum or + // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is + // expanded in terms of it. + unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) + return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); + unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); @@ -7108,6 +7115,10 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: { + unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE; + if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT)) + return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS); + unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; if (TLI.isOperationLegalOrCustom(Opcode, TransformVT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 71d124c74ce..b73fc106a6b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3247,7 +3247,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Tmp1); break; } - + case ISD::FMINNUM: + case ISD::FMAXNUM: { + if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG)) + Results.push_back(Expanded); + break; + } case ISD::FSIN: case ISD::FCOS: { EVT VT = Node->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2c1a4942f68..e7edc0ef860 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -130,6 +130,7 @@ class VectorLegalizer { SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ(SDValue Op); + SDValue ExpandFMINNUM_FMAXNUM(SDValue Op); SDValue ExpandStrictFPOp(SDValue Op); /// Implements vector promotion. @@ -353,6 +354,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FABS: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::FCOPYSIGN: @@ -721,6 +724,9 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return ExpandCTTZ(Op); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return ExpandFMINNUM_FMAXNUM(Op); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -1120,6 +1126,12 @@ SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) { return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) { + if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG)) + return Expanded; + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) { EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 8d00b3249d1..2b5fd8d75f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -113,6 +113,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMUL: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FMINNAN: case ISD::FMAXNAN: case ISD::SMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0f8bd080867..1f0f7325c9d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3712,9 +3712,31 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const // TODO: Refine on operand return false; } + case ISD::FMINNUM: + case ISD::FMAXNUM: { + // Only one needs to be known not-nan, since it will be returned if the + // other ends up being one. + return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) || + isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); + } + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: { + if (SNaN) + return true; + // This can return a NaN if either operand is an sNaN, or if both operands + // are NaN. + return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) && + isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) || + (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) && + isKnownNeverSNaN(Op.getOperand(0), Depth + 1)); + } + case ISD::FMINNAN: + case ISD::FMAXNAN: { + // TODO: Does this quiet or return the origina NaN as-is? + return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && + isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); - // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on - // what they should do. + } case ISD::EXTRACT_VECTOR_ELT: { return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 9967f0eba10..64a9764cce2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -176,6 +176,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FABS: return "fabs"; case ISD::FMINNUM: return "fminnum"; case ISD::FMAXNUM: return "fmaxnum"; + case ISD::FMINNUM_IEEE: return "fminnum_ieee"; + case ISD::FMAXNUM_IEEE: return "fmaxnum_ieee"; + case ISD::FMINNAN: return "fminnan"; case ISD::FMAXNAN: return "fmaxnan"; case ISD::FNEG: return "fneg"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b9b99b386af..ceedd06da1d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4113,6 +4113,35 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result, return true; } +SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, + SelectionDAG &DAG) const { + SDLoc dl(Node); + unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ? + ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + EVT VT = Node->getValueType(0); + if (isOperationLegalOrCustom(NewOp, VT)) { + SDValue Quiet0 = Node->getOperand(0); + SDValue Quiet1 = Node->getOperand(1); + + if (!Node->getFlags().hasNoNaNs()) { + // Insert canonicalizes if it's possible we need to quiet to get correct + // sNaN behavior. + if (!DAG.isKnownNeverSNaN(Quiet0)) { + Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0, + Node->getFlags()); + } + if (!DAG.isKnownNeverSNaN(Quiet1)) { + Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1, + Node->getFlags()); + } + } + + return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags()); + } + + return SDValue(); +} + SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const { SDLoc SL(LD); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 03a29a3edf6..ddd5fc1df75 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -600,6 +600,8 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::CONCAT_VECTORS, VT, Expand); setOperationAction(ISD::FMINNUM, VT, Expand); setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FMINNUM_IEEE, VT, Expand); + setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand); setOperationAction(ISD::FMINNAN, VT, Expand); setOperationAction(ISD::FMAXNAN, VT, Expand); setOperationAction(ISD::FMAD, VT, Expand); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ae6b925800b..a1b9198f945 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -552,6 +552,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FMAD: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -3512,6 +3514,10 @@ static unsigned inverseMinMax(unsigned Opc) { return ISD::FMINNUM; case ISD::FMINNUM: return ISD::FMAXNUM; + case ISD::FMAXNUM_IEEE: + return ISD::FMINNUM_IEEE; + case ISD::FMINNUM_IEEE: + return ISD::FMAXNUM_IEEE; case AMDGPUISD::FMAX_LEGACY: return AMDGPUISD::FMIN_LEGACY; case AMDGPUISD::FMIN_LEGACY: @@ -3617,6 +3623,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, } case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case AMDGPUISD::FMAX_LEGACY: case AMDGPUISD::FMIN_LEGACY: { // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 92d8991e582..0d22cb2e3e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -360,6 +360,7 @@ enum NodeType : unsigned { SIN_HW, FMAX_LEGACY, FMIN_LEGACY, + FMAX3, SMAX3, UMAX3, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index ab00b1d6326..b7d1575ca89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -152,8 +152,14 @@ def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; + def fminnum_oneuse : HasOneUseBinOp<fminnum>; def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>; + +def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>; +def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>; + + def and_oneuse : HasOneUseBinOp<and>; def or_oneuse : HasOneUseBinOp<or>; def xor_oneuse : HasOneUseBinOp<xor>; @@ -837,3 +843,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; + +// Instructions which select to the same v_min_f* +def fminnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee node:$src0, node:$src1), + (fminnum node:$src0, node:$src1)] +>; + +// Instructions which select to the same v_max_f* +def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee node:$src0, node:$src1), + (fmaxnum node:$src0, node:$src1)] +>; + +def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee_oneuse node:$src0, node:$src1), + (fminnum_oneuse node:$src0, node:$src1)] +>; + +def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee_oneuse node:$src0, node:$src1), + (fmaxnum_oneuse node:$src0, node:$src1)] +>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 81ff640f704..3ba04831d15 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -384,8 +384,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); + setOperationAction(ISD::FMINNUM, MVT::f64, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); + + + // These are really only legal for ieee_mode functions. We should be avoiding + // them for functions that don't have ieee_mode enabled, so just say they are + // legal. + setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -474,8 +486,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP2 Actions. setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. @@ -558,6 +569,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // This isn't really legal, but this avoids the legalizer unrolling it (and // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); + + setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); } if (Subtarget->hasVOP3PInsts()) { @@ -575,8 +597,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); @@ -596,6 +620,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + + setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); @@ -634,6 +662,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMINNUM_IEEE); + setTargetDAGCombine(ISD::FMAXNUM_IEEE); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); @@ -3580,6 +3610,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FNEG: case ISD::FCANONICALIZE: return splitUnaryVectorOp(Op, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -3590,10 +3623,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - case ISD::FMINNUM: - case ISD::FMAXNUM: case ISD::FADD: case ISD::FMUL: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: return splitBinaryVectorOp(Op, DAG); } return SDValue(); @@ -4048,6 +4081,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } +SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + + // FIXME: Assert during eslection that this is only selected for + // ieee_mode. Currently a combine can produce the ieee version for non-ieee + // mode functions, but this happens to be OK since it's only done in cases + // where there is known no sNaN. + if (IsIEEEMode) + return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); + + if (VT == MVT::v4f16) + return splitBinaryVectorOp(Op, DAG); + return Op; +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); @@ -7521,37 +7571,32 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case AMDGPUISD::CLAMP: case AMDGPUISD::FMED3: case AMDGPUISD::FMAX3: case AMDGPUISD::FMIN3: { // FIXME: Shouldn't treat the generic operations different based these. - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); - if (IsIEEEMode) { - // snans will be quieted, so we only need to worry about denormals. - if (Subtarget->supportsMinMaxDenormModes() || - denormalsEnabledForType(Op.getValueType())) - return true; - - // Flushing may be required. - // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such - // targets need to check their input recursively. - return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); - } + // However, we aren't really required to flush the result from + // minnum/maxnum.. + // snans will be quieted, so we only need to worry about denormals. if (Subtarget->supportsMinMaxDenormModes() || - denormalsEnabledForType(Op.getValueType())) { - // Only quieting may be necessary. - return DAG.isKnownNeverSNaN(Op.getOperand(0)) && - DAG.isKnownNeverSNaN(Op.getOperand(1)); + denormalsEnabledForType(Op.getValueType())) + return true; + + // Flushing may be required. + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such + // targets need to check their input recursively. + + // FIXME: Does this apply with clamp? It's implemented with max. + for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { + if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) + return false; } - // Flushing and quieting may be necessary - // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it - // needs to be quieted. - return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); + return true; } case ISD::SELECT: { return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && @@ -7578,6 +7623,21 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, // Could be anything. return false; + case ISD::BITCAST: { + // Hack round the mess we make when legalizing extract_vector_elt + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i16 && + Src.getOpcode() == ISD::TRUNCATE) { + SDValue TruncSrc = Src.getOperand(0); + if (TruncSrc.getValueType() == MVT::i32 && + TruncSrc.getOpcode() == ISD::BITCAST && + TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { + return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); + } + } + + return false; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -7603,7 +7663,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, } // Constant fold canonicalize. - SDValue SITargetLowering::getCanonicalConstantFP( SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { // Flush denormals to 0 if not enabled. @@ -7699,18 +7758,40 @@ SDValue SITargetLowering::performFCanonicalizeCombine( } } + unsigned SrcOpc = N0.getOpcode(); + + // If it's free to do so, push canonicalizes further up the source, which may + // find a canonical source. + // + // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for + // sNaNs. + if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) { + auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); + if (CRHS && N0.hasOneUse()) { + SDLoc SL(N); + SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT, + N0.getOperand(0)); + SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF()); + DCI.AddToWorklist(Canon0.getNode()); + + return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1); + } + } + return isCanonicalized(DAG, N0) ? N0 : SDValue(); } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: return AMDGPUISD::FMAX3; case ISD::SMAX: return AMDGPUISD::SMAX3; case ISD::UMAX: return AMDGPUISD::UMAX3; case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: return AMDGPUISD::FMIN3; case ISD::SMIN: return AMDGPUISD::SMIN3; @@ -7877,6 +7958,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || @@ -7995,7 +8077,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine( case ISD::SMIN: case ISD::SMAX: case ISD::FMAXNUM: - case ISD::FMINNUM: { + case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: { SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, @@ -8595,13 +8679,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performSetCCCombine(N, DCI); case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG && getTargetMachine().getOptLevel() > CodeGenOpt::None) return performMinMaxCombine(N, DCI); break; @@ -9320,3 +9406,17 @@ bool SITargetLowering::denormalsEnabledForType(EVT VT) const { return false; } } + +bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN, + unsigned Depth) const { + if (Op.getOpcode() == AMDGPUISD::CLAMP) { + if (Subtarget->enableDX10Clamp()) + return true; // Clamped to 0. + return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); + } + + return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, + SNaN, Depth); +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1b0cb06a9b0..bcb46ec41d1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -110,6 +110,7 @@ private: /// Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -346,6 +347,11 @@ public: bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool denormalsEnabledForType(EVT VT) const; + + bool isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN = false, + unsigned Depth = 0) const override; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1336a576e84..67aea73d1ca 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1645,10 +1645,11 @@ def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat<ValueType vt, + //SDPatternOperator max, SDPatternOperator min, Instruction med3Inst> : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) @@ -1656,10 +1657,10 @@ class FPMed3Pat<ValueType vt, class FP16Med3Pat<ValueType vt, Instruction med3Inst> : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) >; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index e9d12ba83f3..db031be7e55 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -393,8 +393,8 @@ defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>; defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>; defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>; -defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>; -defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>; +defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; +defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>; defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>; defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>; @@ -556,8 +556,8 @@ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 96b233b5a38..51bee3efeb2 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -295,8 +295,8 @@ let SchedRW = [WriteDoubleAdd] in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; -def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>; -def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>; +def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>; +def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteQuarterRate32] in { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 41e21c116a9..c91d911a283 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -48,8 +48,8 @@ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_ def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; -def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; -def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>; +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; |