author    Craig Topper <craig.topper@intel.com>  2018-05-06 17:48:21 +0000
committer Craig Topper <craig.topper@intel.com>  2018-05-06 17:48:21 +0000
commit    cb2abc79778029d881daecc5bab00b920e955f46 (patch)
tree      db4ba3c33b6bc53a4ad8168d61a605f60c5cd225
parent    b02e3dec4b34f89411816f71aa1772a59c9e5e4a (diff)
[X86] Enable reciprocal estimates for v16f32 vectors by using VRCP14PS/VRSQRT14PS
Summary:
The legacy VRCPPS/VRSQRTPS instructions aren't available in 512-bit
versions, but the new increased-precision versions are, so we can use
those to implement v16f32 reciprocal estimates. For KNL CPUs we can
probably use VRCP28PS/VRSQRT28PS and avoid the NR step altogether, but
I leave that for a future patch.

Reviewers: spatel

Reviewed By: spatel

Subscribers: RKSimon, llvm-commits, mehdi_amini

Differential Revision: https://reviews.llvm.org/D46498

llvm-svn: 331606
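For context, the NR (Newton-Raphson) step mentioned above refines a hardware
reciprocal estimate r0 ~= 1/x via r1 = r0 * (2 - x*r0), which the backend
emits as an FNMADD/FMADD pair. A minimal scalar sketch of that identity
(illustrative code, not part of the patch):

    // One Newton-Raphson step for r ~= 1/x, given a hardware estimate r0.
    float refine_recip(float x, float r0) {
      float e = 1.0f - x * r0;  // maps to vfnmadd213ps: -(x*r0) + 1
      return r0 + r0 * e;       // maps to vfmadd132ps:  r0*e + r0
    }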
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp16
-rw-r--r--llvm/test/CodeGen/X86/recip-fastmath.ll28
-rw-r--r--llvm/test/CodeGen/X86/recip-fastmath2.ll56
-rw-r--r--llvm/test/CodeGen/X86/sqrt-fastmath.ll8
4 files changed, 71 insertions, 37 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 91ef663844c..b7bd22f8f39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17803,7 +17803,6 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
@@ -17814,12 +17813,15 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
- return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ // There is no 512-bit FRSQRT, but there is RSQRT14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
@@ -17832,7 +17834,6 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
- // TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
@@ -17841,7 +17842,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+ (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
@@ -17851,7 +17853,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
- return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+ // There is no 512-bit FRCP, but there is RCP14.
+ unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
+ return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
}
return SDValue();
}
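The one-step v16f32 reciprocal sequence this change produces can be written
directly with AVX512F intrinsics; a hedged sketch (the function name is
hypothetical, the intrinsics are standard immintrin.h):

    #include <immintrin.h>

    // ~14-bit estimate plus one NR step, mirroring vrcp14ps + vfnmadd/vfmadd.
    __m512 recip_one_step(__m512 x) {
      __m512 r0  = _mm512_rcp14_ps(x);            // vrcp14ps estimate
      __m512 one = _mm512_set1_ps(1.0f);
      __m512 e   = _mm512_fnmadd_ps(x, r0, one);  // e  = 1 - x*r0
      return _mm512_fmadd_ps(r0, e, r0);          // r1 = r0 + r0*e
    }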
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 14f3255e041..1d041c3f259 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1024,14 +1024,16 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
;
; KNL-LABEL: v16f32_one_step:
; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step:
; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <16 x float> %div
@@ -1222,14 +1224,24 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
;
; KNL-LABEL: v16f32_two_step:
; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_two_step:
; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <16 x float> %div
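The two-step variant iterates the same FNMADD/FMADD pair, which is why the
checked asm above contains two such pairs plus a vmovaps to keep the
intermediate estimate live. An equivalent intrinsics sketch (illustrative,
assuming AVX512F):

    #include <immintrin.h>

    __m512 recip_two_steps(__m512 x) {
      __m512 one = _mm512_set1_ps(1.0f);
      __m512 r   = _mm512_rcp14_ps(x);
      for (int i = 0; i < 2; ++i)  // two refinement steps
        r = _mm512_fmadd_ps(_mm512_fnmadd_ps(x, r, one), r, r);
      return r;
    }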
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 48672c18ce3..27a07f09095 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -1323,14 +1323,18 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
;
; KNL-LABEL: v16f32_one_step2:
; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
ret <16 x float> %div
@@ -1485,16 +1489,18 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
;
; KNL-LABEL: v16f32_one_step_2_divs:
; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step_2_divs:
; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -1703,14 +1709,26 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
;
; KNL-LABEL: v16f32_two_step2:
; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [5:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_two_step2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [9:2.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
ret <16 x float> %div
@@ -1763,14 +1781,12 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
;
; KNL-LABEL: v16f32_no_step:
; KNL: # %bb.0:
-; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_no_step:
; SKX: # %bb.0:
-; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <16 x float> %div
@@ -1839,14 +1855,14 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
;
; KNL-LABEL: v16f32_no_step2:
; KNL: # %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [5:0.50]
-; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [5:1.00]
+; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_no_step2:
; SKX: # %bb.0:
-; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01,1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] sched: [8:0.50]
-; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
+; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [9:2.00]
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
ret <16 x float> %div
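For a non-unit numerator c, the tests above show c/x lowering to the (possibly
refined) reciprocal followed by a single vmulps. A sketch of the
estimate-only-plus-multiply shape checked in v16f32_no_step2 (names are
illustrative):

    #include <immintrin.h>

    // c / x via the estimate alone, matching the v16f32_no_step2 pattern.
    __m512 div_estimate(__m512 c, __m512 x) {
      __m512 r = _mm512_rcp14_ps(x);   // vrcp14ps, no NR step
      return _mm512_mul_ps(c, r);      // final vmulps by the numerator
    }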
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 91ce663ccef..33319641ed1 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -515,9 +515,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
;
; AVX512-LABEL: v16f32_estimate:
; AVX512: # %bb.0:
-; AVX512-NEXT: vsqrtps %zmm0, %zmm0
-; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vrsqrt14ps %zmm0, %zmm1
+; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
+; AVX512-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
%div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
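The rsqrt NR step checked above uses the factoring
r1 = (-0.5*r0) * (x*r0*r0 - 3), algebraically equal to the textbook
r0 * (1.5 - 0.5*x*r0*r0). A hedged intrinsics sketch of that sequence
(assuming AVX512F; the function name is illustrative):

    #include <immintrin.h>

    __m512 rsqrt_one_step(__m512 x) {
      __m512 r0 = _mm512_rsqrt14_ps(x);                          // vrsqrt14ps
      __m512 t  = _mm512_mul_ps(x, r0);                          // x*r0
      __m512 u  = _mm512_fmadd_ps(r0, t, _mm512_set1_ps(-3.0f)); // x*r0*r0 - 3
      __m512 h  = _mm512_mul_ps(r0, _mm512_set1_ps(-0.5f));      // -0.5*r0
      return _mm512_mul_ps(h, u);                                // refined 1/sqrt(x)
    }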