3 files changed, 47 insertions, 27 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0eb8d35cafa..f03deb1c66a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17454,19 +17454,34 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
             : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
 
       if (!Reciprocal) {
-        // Unfortunately, Est is now NaN if the input was exactly 0.0.
-        // Select out this case and force the answer to 0.0.
+        // The estimate is now completely wrong if the input was exactly 0.0 or
+        // possibly a denormal. Force the answer to 0.0 for those cases.
         EVT VT = Op.getValueType();
         SDLoc DL(Op);
-
-        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
         EVT CCVT = getSetCCResultType(VT);
-        SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
-        AddToWorklist(ZeroCmp.getNode());
-
-        Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
-                          ZeroCmp, FPZero, Est);
-        AddToWorklist(Est.getNode());
+        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+        const Function &F = DAG.getMachineFunction().getFunction();
+        Attribute Denorms = F.getFnAttribute("denormal-fp-math");
+        if (Denorms.getValueAsString().equals("ieee")) {
+          // fabs(X) < SmallestNormal ? 0.0 : Est
+          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
+          AddToWorklist(Fabs.getNode());
+          AddToWorklist(IsDenorm.getNode());
+          AddToWorklist(Est.getNode());
+        } else {
+          // X == 0.0 ? 0.0 : Est
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+          AddToWorklist(IsZero.getNode());
+          AddToWorklist(Est.getNode());
+        }
       }
     }
     return Est;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6e808919512..9d57e46bba5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5007,7 +5007,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
       }
-
+      // FIXME: This does not detect denorm inputs, so we might produce INF
+      // when we should produce 0.0. Try to refactor the code in DAGCombiner,
+      // so we don't have to duplicate it here.
       if (!Reciprocal) {
         EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                       VT);
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index d458994a4e8..288879febb1 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -121,8 +121,8 @@ define float @sqrtf_check_denorms(float %x) #3 {
 ; SSE-NEXT:    mulss %xmm1, %xmm2
 ; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
 ; SSE-NEXT:    mulss %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpeqss %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    andnps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -134,8 +134,8 @@ define float @sqrtf_check_denorms(float %x) #3 {
 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
 ; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call float @__sqrtf_finite(float %x) #2
@@ -145,17 +145,19 @@ define float @sqrtf_check_denorms(float %x) #3 {
 define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    rsqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
+; SSE-NEXT:    rsqrtps %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01]
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
-; SSE-NEXT:    mulps %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpneqps %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm1, %xmm3
+; SSE-NEXT:    mulps %xmm2, %xmm1
+; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
+; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; SSE-NEXT:    cmpleps %xmm0, %xmm2
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_v4f32_check_denorms:
@@ -166,8 +168,9 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; AVX-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2