summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Qiu Chaofan <qiucofan@cn.ibm.com> 2019-09-12 07:51:24 +0000
committer: Qiu Chaofan <qiucofan@cn.ibm.com> 2019-09-12 07:51:24 +0000
commit: b7fb5d0f6f2abf07f0ce6edc3397b4c291c1d25e (patch)
tree: b5fb0cbe9b96f54540d31e4c268e1cf633b249d2
parent: 75f65fe8d316407c5dd7cd9bfdfc760203947aa1 (diff)
download: bcm5719-llvm-b7fb5d0f6f2abf07f0ce6edc3397b4c291c1d25e.tar.gz
download: bcm5719-llvm-b7fb5d0f6f2abf07f0ce6edc3397b4c291c1d25e.zip
[DAGCombiner] Improve division estimation of floating points.
The current implementation of division estimation loses precision because it first estimates the reciprocal and then multiplies by the numerator. This patch reorders the arithmetic operations in the last Newton iteration in DAGCombiner, folding the numerator into that iteration, to improve accuracy. Reviewed By: Sanjay Patel, Jinsong Ji Differential Revision: https://reviews.llvm.org/D66050 llvm-svn: 371713
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fdiv.ll 30
-rw-r--r-- llvm/test/CodeGen/AMDGPU/rsq.ll 2
-rw-r--r-- llvm/test/CodeGen/PowerPC/qpx-recipest.ll 17
-rw-r--r-- llvm/test/CodeGen/PowerPC/recipest.ll 14
-rw-r--r-- llvm/test/CodeGen/X86/recip-fastmath.ll 180
-rw-r--r-- llvm/test/CodeGen/X86/recip-fastmath2.ll 1139
7 files changed, 737 insertions, 689 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9d3063c0e13..eec3e0848be 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -529,7 +529,7 @@ namespace {
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
- SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
@@ -12682,10 +12682,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
// Fold into a reciprocal estimate and multiply instead of a real divide.
- if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
- AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
- }
+ if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
+ return RV;
}
// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
@@ -20329,7 +20327,10 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
-SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
+/// In the last iteration, fold the numerator N into the step to gain precision:
+/// Result = N X_i + X_i (N - N A X_i)
+SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
+ SDNodeFlags Flags) {
if (Level >= AfterLegalizeDAG)
return SDValue();
@@ -20350,18 +20351,39 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
AddToWorklist(Est.getNode());
+ SDLoc DL(Op);
if (Iterations) {
- SDLoc DL(Op);
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
- // Newton iterations: Est = Est + Est (1 - Arg * Est)
+ // Newton iterations on the reciprocal: Est = Est + Est (1 - Arg * Est)
+ // Last iteration folds in N: Est = N Est + Est (N - Arg * N Est)
for (int i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
- NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags);
+ SDValue MulEst = Est;
+
+ if (i == Iterations - 1) {
+ MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
+ AddToWorklist(MulEst.getNode());
+ }
+
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT,
+ (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
- Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
+ AddToWorklist(Est.getNode());
}
+ } else {
+ // If no iterations are available, multiply with N.
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
+ AddToWorklist(Est.getNode());
}
+
return Est;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index a9b938c3afa..bd4deb14aad 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -114,7 +114,7 @@ entry:
; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
@@ -129,7 +129,7 @@ entry:
; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
@@ -144,7 +144,7 @@ entry:
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
@@ -188,8 +188,8 @@ entry:
; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
; GCN: v_rcp_f32
; GCN: v_rcp_f32
@@ -203,8 +203,8 @@ entry:
; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
; GCN: v_rcp_f32
; GCN: v_rcp_f32
@@ -243,10 +243,10 @@ define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; GCN: v_rcp_f32
; GCN: v_rcp_f32
@@ -266,10 +266,10 @@ define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out,
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; GCN: v_rcp_f32
; GCN: v_rcp_f32
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll
index 204eeb99838..40c3c94246e 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.ll
@@ -48,7 +48,7 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
-; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-UNSAFE: buffer_store_dword [[RESULT]]
diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll
index f841cb1c14c..3bfd92a2e5b 100644
--- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll
@@ -229,11 +229,11 @@ define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind {
; CHECK-NEXT: qvfre 3, 2
; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l
; CHECK-NEXT: qvlfdx 0, 0, 3
-; CHECK-NEXT: qvfnmsub 4, 2, 3, 0
-; CHECK-NEXT: qvfmadd 3, 3, 4, 3
; CHECK-NEXT: qvfnmsub 0, 2, 3, 0
; CHECK-NEXT: qvfmadd 0, 3, 0, 3
-; CHECK-NEXT: qvfmul 1, 1, 0
+; CHECK-NEXT: qvfmul 3, 1, 0
+; CHECK-NEXT: qvfnmsub 1, 2, 3, 1
+; CHECK-NEXT: qvfmadd 1, 0, 1, 3
; CHECK-NEXT: blr
entry:
%r = fdiv fast <4 x double> %a, %b
@@ -266,13 +266,10 @@ define <4 x double> @foo2_safe(<4 x double> %a, <4 x double> %b) nounwind {
define <4 x float> @goo2_fmf(<4 x float> %a, <4 x float> %b) nounwind {
; CHECK-LABEL: goo2_fmf:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addis 3, 2, .LCPI10_0@toc@ha
-; CHECK-NEXT: qvfres 3, 2
-; CHECK-NEXT: addi 3, 3, .LCPI10_0@toc@l
-; CHECK-NEXT: qvlfsx 0, 0, 3
-; CHECK-NEXT: qvfnmsubs 0, 2, 3, 0
-; CHECK-NEXT: qvfmadds 0, 3, 0, 3
-; CHECK-NEXT: qvfmuls 1, 1, 0
+; CHECK-NEXT: qvfres 0, 2
+; CHECK-NEXT: qvfmuls 3, 1, 0
+; CHECK-NEXT: qvfnmsubs 1, 2, 3, 1
+; CHECK-NEXT: qvfmadds 1, 0, 1, 3
; CHECK-NEXT: blr
entry:
%r = fdiv fast <4 x float> %a, %b
diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll
index d1bf1c9e502..bc33617662e 100644
--- a/llvm/test/CodeGen/PowerPC/recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/recipest.ll
@@ -145,13 +145,13 @@ define float @no_estimate_refinement_f32(float %a, float %b) #0 {
define float @rsqrt_fmul_fmf(float %a, float %b, float %c) {
; CHECK: @rsqrt_fmul_fmf
; CHECK-DAG: frsqrtes
-; CHECK-DAG: fres
-; CHECK-DAG: fnmsubs
-; CHECK-DAG: fmuls
-; CHECK-DAG: fmadds
-; CHECK-DAG: fmadds
; CHECK: fmuls
+; CHECK-NEXT: fmadds
; CHECK-NEXT: fmuls
+; CHECK-DAG: fres
+; CHECK-COUNT-3: fmuls
+; CHECK-NEXT: fmsubs
+; CHECK-NEXT: fmadds
; CHECK-NEXT: fmuls
; CHECK-NEXT: blr
%x = call fast float @llvm.sqrt.f32(float %a)
@@ -196,9 +196,9 @@ define double @foo2_fmf(double %a, double %b) nounwind {
; CHECK-DAG: fre
; CHECK-DAG: fnmsub
; CHECK: fmadd
+; CHECK-NEXT: fmul
; CHECK-NEXT: fnmsub
; CHECK-NEXT: fmadd
-; CHECK-NEXT: fmul
; CHECK-NEXT: blr
%r = fdiv fast double %a, %b
ret double %r
@@ -215,9 +215,9 @@ define double @foo2_safe(double %a, double %b) nounwind {
define float @goo2_fmf(float %a, float %b) nounwind {
; CHECK: @goo2_fmf
; CHECK-DAG: fres
+; CHECK-NEXT: fmuls
; CHECK-DAG: fnmsubs
; CHECK: fmadds
-; CHECK-NEXT: fmuls
; CHECK-NEXT: blr
%r = fdiv fast float %a, %b
ret float %r
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index c7442f2bc20..c618c37e4fe 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -122,88 +122,87 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
; SSE-LABEL: f32_one_step_variables:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm1, %xmm2
-; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: subss %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
-; SSE-NEXT: addss %xmm2, %xmm3
-; SSE-NEXT: mulss %xmm3, %xmm0
+; SSE-NEXT: mulss %xmm3, %xmm1
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm2, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_variables:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_variables:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; FMA-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_variables:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm2, %xmm1, %xmm1
-; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm2, %xmm1
-; BDVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; BDVER2-NEXT: vfnmaddss %xmm0, %xmm3, %xmm1, %xmm0
+; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm2, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_variables:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step_variables:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SANDY-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; HASWELL-NEXT: vmovaps %xmm2, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step_variables:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%div = fdiv fast float %x, %y
ret float %div
@@ -484,99 +483,88 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
; SSE-LABEL: v4f32_one_step_variables:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: mulps %xmm3, %xmm1
+; SSE-NEXT: subps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_variables:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2
-; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_variables:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; FMA-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step_variables:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm1, %xmm2
-; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm2, %xmm1, %xmm1
-; BDVER2-NEXT: vfmaddps %xmm2, %xmm1, %xmm2, %xmm1
-; BDVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; BDVER2-NEXT: vfnmaddps %xmm0, %xmm3, %xmm1, %xmm0
+; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm2, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_variables:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm1, %xmm2
-; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step_variables:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm1, %xmm2
-; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step_variables:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm1, %xmm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; HASWELL-NEXT: vmovaps %xmm2, %xmm0
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; KNL-LABEL: v4f32_one_step_variables:
-; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm1, %xmm2
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3
-; KNL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; KNL-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: v4f32_one_step_variables:
-; SKX: # %bb.0:
-; SKX-NEXT: vrcpps %xmm1, %xmm2
-; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; SKX-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; SKX-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: v4f32_one_step_variables:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrcpps %xmm1, %xmm2
+; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: retq
%div = fdiv fast <4 x float> %x, %y
ret <4 x float> %div
}
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index de511b411b3..a2bd6c2081c 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -32,89 +32,94 @@ define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
-; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: mulss %xmm1, %xmm3
+; SSE-NEXT: mulss %xmm3, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: addss %xmm2, %xmm1
-; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
+; SSE-NEXT: addss %xmm3, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_one_step_2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; BDVER2-NEXT: vfnmaddss %xmm2, %xmm3, %xmm0, %xmm0
+; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_one_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_one_step_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
; AVX512-NEXT: retq
%div = fdiv fast float 3456.0, %x
ret float %div
@@ -225,19 +230,20 @@ define float @f32_one_step_2_divs(float %x) #1 {
define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
; SSE: # %bb.0:
-; SSE-NEXT: rcpss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: mulss %xmm2, %xmm3
+; SSE-NEXT: rcpss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: mulss %xmm1, %xmm2
+; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: subss %xmm2, %xmm3
+; SSE-NEXT: mulss %xmm1, %xmm3
+; SSE-NEXT: addss %xmm1, %xmm3
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: subss %xmm3, %xmm4
-; SSE-NEXT: mulss %xmm2, %xmm4
-; SSE-NEXT: addss %xmm2, %xmm4
-; SSE-NEXT: mulss %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: mulss %xmm1, %xmm2
+; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
-; SSE-NEXT: mulss %xmm4, %xmm1
-; SSE-NEXT: addss %xmm4, %xmm1
-; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
+; SSE-NEXT: mulss %xmm3, %xmm1
+; SSE-NEXT: addss %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -249,49 +255,51 @@ define float @f32_two_step_2(float %x) #2 {
; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: f32_two_step_2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3
-; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1
-; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2
+; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vfmaddss %xmm1, %xmm2, %xmm1, %xmm1
+; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
+; BDVER2-NEXT: vfnmaddss %xmm4, %xmm3, %xmm0, %xmm0
+; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
+; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: f32_two_step_2:
@@ -302,23 +310,24 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HASWELL-NEXT: vmovaps %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
@@ -329,23 +338,24 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: f32_two_step_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512-NEXT: vmovaps %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
%div = fdiv fast float 6789.0, %x
ret float %div
@@ -355,100 +365,95 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm0
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: mulps %xmm1, %xmm3
+; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: addps %xmm2, %xmm1
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
+; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_one_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; BDVER2-NEXT: vfnmaddps %xmm2, %xmm3, %xmm0, %xmm0
+; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; KNL-LABEL: v4f32_one_step2:
-; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm0, %xmm1
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: v4f32_one_step2:
-; SKX: # %bb.0:
-; SKX-NEXT: vrcpps %xmm0, %xmm1
-; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
-; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: v4f32_one_step2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrcpps %xmm0, %xmm1
+; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
+; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3
+; AVX512-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
@@ -569,19 +574,20 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step2:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: subps %xmm3, %xmm4
-; SSE-NEXT: mulps %xmm2, %xmm4
-; SSE-NEXT: addps %xmm2, %xmm4
-; SSE-NEXT: mulps %xmm4, %xmm0
+; SSE-NEXT: rcpps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: mulps %xmm1, %xmm2
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE-NEXT: subps %xmm2, %xmm3
+; SSE-NEXT: mulps %xmm1, %xmm3
+; SSE-NEXT: addps %xmm1, %xmm3
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: mulps %xmm1, %xmm2
+; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
-; SSE-NEXT: mulps %xmm4, %xmm1
-; SSE-NEXT: addps %xmm4, %xmm1
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
+; SSE-NEXT: mulps %xmm3, %xmm1
+; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -593,49 +599,51 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v4f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %xmm0, %xmm1
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3
-; BDVER2-NEXT: vfmaddps %xmm1, %xmm3, %xmm1, %xmm1
-; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm2
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; BDVER2-NEXT: vfmaddps %xmm1, %xmm2, %xmm1, %xmm1
+; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
+; BDVER2-NEXT: vfnmaddps %xmm4, %xmm3, %xmm0, %xmm0
+; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2
; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
+; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0
+; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v4f32_two_step2:
@@ -646,23 +654,24 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0
+; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vmovaps %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
@@ -673,23 +682,24 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v4f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vmovaps %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
-; AVX512-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
+; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1
+; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3
; AVX512-NEXT: retq
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
@@ -698,20 +708,22 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step2:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm1, %xmm4
+; SSE-NEXT: rcpps %xmm0, %xmm3
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm3, %xmm4
+; SSE-NEXT: mulps %xmm2, %xmm4
+; SSE-NEXT: mulps %xmm4, %xmm0
+; SSE-NEXT: subps %xmm0, %xmm2
+; SSE-NEXT: mulps %xmm3, %xmm2
+; SSE-NEXT: addps %xmm4, %xmm2
+; SSE-NEXT: rcpps %xmm1, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: mulps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm4, %xmm3
+; SSE-NEXT: mulps %xmm0, %xmm3
; SSE-NEXT: addps %xmm4, %xmm3
-; SSE-NEXT: rcpps %xmm0, %xmm1
-; SSE-NEXT: mulps %xmm1, %xmm0
-; SSE-NEXT: subps %xmm0, %xmm2
-; SSE-NEXT: mulps %xmm1, %xmm2
-; SSE-NEXT: addps %xmm1, %xmm2
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
@@ -719,88 +731,82 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; AVX-RECIP-LABEL: v8f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_one_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; BDVER2-NEXT: vfnmaddps %ymm2, %ymm3, %ymm0, %ymm0
+; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0
; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1
-; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
-; KNL-LABEL: v8f32_one_step2:
-; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
-; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: v8f32_one_step2:
-; SKX: # %bb.0:
-; SKX-NEXT: vrcpps %ymm0, %ymm1
-; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
-; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: retq
+; AVX512-LABEL: v8f32_one_step2:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrcpps %ymm0, %ymm1
+; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
+; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3
+; AVX512-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
@@ -930,33 +936,35 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step2:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: rcpps %xmm1, %xmm3
-; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: mulps %xmm3, %xmm4
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: subps %xmm4, %xmm5
-; SSE-NEXT: mulps %xmm3, %xmm5
-; SSE-NEXT: addps %xmm3, %xmm5
-; SSE-NEXT: mulps %xmm5, %xmm1
+; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm5, %xmm3
-; SSE-NEXT: addps %xmm5, %xmm3
-; SSE-NEXT: rcpps %xmm2, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm4
-; SSE-NEXT: mulps %xmm1, %xmm4
-; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: subps %xmm4, %xmm5
-; SSE-NEXT: mulps %xmm1, %xmm5
-; SSE-NEXT: addps %xmm1, %xmm5
+; SSE-NEXT: mulps %xmm2, %xmm3
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: subps %xmm3, %xmm5
+; SSE-NEXT: mulps %xmm2, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm5, %xmm3
+; SSE-NEXT: mulps %xmm2, %xmm3
+; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm5, %xmm2
-; SSE-NEXT: subps %xmm2, %xmm0
-; SSE-NEXT: mulps %xmm5, %xmm0
-; SSE-NEXT: addps %xmm5, %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
+; SSE-NEXT: addps %xmm3, %xmm2
+; SSE-NEXT: rcpps %xmm1, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: mulps %xmm0, %xmm3
+; SSE-NEXT: subps %xmm3, %xmm4
+; SSE-NEXT: mulps %xmm0, %xmm4
+; SSE-NEXT: addps %xmm0, %xmm4
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SSE-NEXT: movaps %xmm4, %xmm0
+; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: mulps %xmm0, %xmm1
+; SSE-NEXT: subps %xmm1, %xmm3
+; SSE-NEXT: mulps %xmm4, %xmm3
+; SSE-NEXT: addps %xmm0, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
;
@@ -968,49 +976,51 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0
+; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
+; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v8f32_two_step2:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vrcpps %ymm0, %ymm1
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3
-; BDVER2-NEXT: vfmaddps %ymm1, %ymm3, %ymm1, %ymm1
-; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm2
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BDVER2-NEXT: vfmaddps %ymm1, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
+; BDVER2-NEXT: vfnmaddps %ymm4, %ymm3, %ymm0, %ymm0
+; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2
; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
+; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0
+; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v8f32_two_step2:
@@ -1021,23 +1031,24 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0
+; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vmovaps %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
+; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
@@ -1048,23 +1059,24 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v8f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcpps %ymm0, %ymm1
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vmovaps %ymm1, %ymm3
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
-; AVX512-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
+; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1
+; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1
+; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3
; AVX512-NEXT: retq
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
@@ -1088,9 +1100,9 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step2:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: retq
;
@@ -1106,153 +1118,165 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step2:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: rcpps %xmm3, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm4
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subps %xmm4, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
-; SSE-NEXT: rcpps %xmm5, %xmm4
-; SSE-NEXT: mulps %xmm4, %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subps %xmm5, %xmm2
-; SSE-NEXT: mulps %xmm4, %xmm2
-; SSE-NEXT: addps %xmm4, %xmm2
-; SSE-NEXT: rcpps %xmm1, %xmm5
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: rcpps %xmm0, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm0, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm1
+; SSE-NEXT: subps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm5, %xmm0
+; SSE-NEXT: addps %xmm6, %xmm0
+; SSE-NEXT: rcpps %xmm4, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm1, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm4
+; SSE-NEXT: subps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm5, %xmm1
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: subps %xmm1, %xmm4
+; SSE-NEXT: addps %xmm6, %xmm1
+; SSE-NEXT: rcpps %xmm2, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm4, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm2
+; SSE-NEXT: subps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm5, %xmm4
-; SSE-NEXT: addps %xmm5, %xmm4
-; SSE-NEXT: rcpps %xmm6, %xmm1
-; SSE-NEXT: mulps %xmm1, %xmm6
-; SSE-NEXT: subps %xmm6, %xmm0
-; SSE-NEXT: mulps %xmm1, %xmm0
-; SSE-NEXT: addps %xmm1, %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm4
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: addps %xmm6, %xmm4
+; SSE-NEXT: rcpps %xmm3, %xmm2
+; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; SSE-NEXT: movaps %xmm2, %xmm6
+; SSE-NEXT: mulps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm6, %xmm3
+; SSE-NEXT: subps %xmm3, %xmm5
+; SSE-NEXT: mulps %xmm2, %xmm5
+; SSE-NEXT: addps %xmm6, %xmm5
+; SSE-NEXT: movaps %xmm4, %xmm2
+; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_one_step2:
; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0
; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0
+; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_one_step2:
; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
+; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_one_step2:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vrcpps %ymm1, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vrcpps %ymm0, %ymm4
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
+; BDVER2-NEXT: vrcpps %ymm0, %ymm2
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BDVER2-NEXT: vrcpps %ymm1, %ymm5
+; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0
-; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
-; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm4, %ymm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4
+; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1
+; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm5, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_one_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vrcpps %ymm1, %ymm2
-; BTVER2-NEXT: vrcpps %ymm0, %ymm4
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm2
+; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0
-; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
-; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0
-; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT: vrcpps %ymm1, %ymm2
+; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5
; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm1
+; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT: vaddps %ymm1, %ymm5, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_one_step2:
; SANDY: # %bb.0:
-; SANDY-NEXT: vrcpps %ymm1, %ymm2
-; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vrcpps %ymm0, %ymm2
-; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT: vrcpps %ymm1, %ymm3
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4
+; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT: vmulps %ymm1, %ymm3, %ymm1
+; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_one_step2:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm1, %ymm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vrcpps %ymm0, %ymm4
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
+; HASWELL-NEXT: vrcpps %ymm0, %ymm2
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
+; HASWELL-NEXT: vrcpps %ymm1, %ymm2
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4
+; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm3, %ymm1
+; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_one_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem
-; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1
-; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2
+; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm3
; AVX512-NEXT: retq
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
ret <16 x float> %div
@@ -1436,228 +1460,245 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step2:
; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: movaps %xmm2, %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: rcpps %xmm3, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: subps %xmm3, %xmm7
-; SSE-NEXT: mulps %xmm2, %xmm7
-; SSE-NEXT: addps %xmm2, %xmm7
-; SSE-NEXT: mulps %xmm7, %xmm6
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subps %xmm6, %xmm3
-; SSE-NEXT: mulps %xmm7, %xmm3
-; SSE-NEXT: addps %xmm7, %xmm3
-; SSE-NEXT: rcpps %xmm5, %xmm2
-; SSE-NEXT: movaps %xmm5, %xmm6
-; SSE-NEXT: mulps %xmm2, %xmm6
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: subps %xmm6, %xmm7
-; SSE-NEXT: mulps %xmm2, %xmm7
-; SSE-NEXT: addps %xmm2, %xmm7
-; SSE-NEXT: mulps %xmm7, %xmm5
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subps %xmm5, %xmm2
-; SSE-NEXT: mulps %xmm7, %xmm2
-; SSE-NEXT: addps %xmm7, %xmm2
-; SSE-NEXT: rcpps %xmm1, %xmm5
-; SSE-NEXT: movaps %xmm1, %xmm6
-; SSE-NEXT: mulps %xmm5, %xmm6
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: subps %xmm6, %xmm7
-; SSE-NEXT: mulps %xmm5, %xmm7
-; SSE-NEXT: addps %xmm5, %xmm7
-; SSE-NEXT: mulps %xmm7, %xmm1
-; SSE-NEXT: movaps %xmm0, %xmm5
-; SSE-NEXT: subps %xmm1, %xmm5
-; SSE-NEXT: mulps %xmm7, %xmm5
-; SSE-NEXT: addps %xmm7, %xmm5
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: mulps %xmm0, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: subps %xmm5, %xmm7
+; SSE-NEXT: mulps %xmm0, %xmm7
+; SSE-NEXT: addps %xmm0, %xmm7
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; SSE-NEXT: movaps %xmm7, %xmm5
+; SSE-NEXT: mulps %xmm0, %xmm5
+; SSE-NEXT: mulps %xmm5, %xmm1
+; SSE-NEXT: subps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm7, %xmm0
+; SSE-NEXT: addps %xmm5, %xmm0
; SSE-NEXT: rcpps %xmm4, %xmm1
-; SSE-NEXT: movaps %xmm4, %xmm6
-; SSE-NEXT: mulps %xmm1, %xmm6
-; SSE-NEXT: movaps %xmm0, %xmm7
-; SSE-NEXT: subps %xmm6, %xmm7
+; SSE-NEXT: movaps %xmm4, %xmm5
+; SSE-NEXT: mulps %xmm1, %xmm5
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: subps %xmm5, %xmm7
; SSE-NEXT: mulps %xmm1, %xmm7
; SSE-NEXT: addps %xmm1, %xmm7
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SSE-NEXT: movaps %xmm7, %xmm5
+; SSE-NEXT: mulps %xmm1, %xmm5
+; SSE-NEXT: mulps %xmm5, %xmm4
+; SSE-NEXT: subps %xmm4, %xmm1
+; SSE-NEXT: mulps %xmm7, %xmm1
+; SSE-NEXT: addps %xmm5, %xmm1
+; SSE-NEXT: rcpps %xmm2, %xmm4
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: mulps %xmm4, %xmm5
+; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: subps %xmm5, %xmm7
+; SSE-NEXT: mulps %xmm4, %xmm7
+; SSE-NEXT: addps %xmm4, %xmm7
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
+; SSE-NEXT: movaps %xmm7, %xmm5
+; SSE-NEXT: mulps %xmm4, %xmm5
+; SSE-NEXT: mulps %xmm5, %xmm2
+; SSE-NEXT: subps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm7, %xmm4
-; SSE-NEXT: subps %xmm4, %xmm0
-; SSE-NEXT: mulps %xmm7, %xmm0
-; SSE-NEXT: addps %xmm7, %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm5
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
-; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
-; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: addps %xmm5, %xmm4
+; SSE-NEXT: rcpps %xmm3, %xmm2
+; SSE-NEXT: movaps %xmm3, %xmm5
+; SSE-NEXT: mulps %xmm2, %xmm5
+; SSE-NEXT: subps %xmm5, %xmm6
+; SSE-NEXT: mulps %xmm2, %xmm6
+; SSE-NEXT: addps %xmm2, %xmm6
+; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; SSE-NEXT: movaps %xmm6, %xmm2
+; SSE-NEXT: mulps %xmm5, %xmm2
+; SSE-NEXT: mulps %xmm2, %xmm3
+; SSE-NEXT: subps %xmm3, %xmm5
+; SSE-NEXT: mulps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: movaps %xmm4, %xmm2
+; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_two_step2:
; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
+; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1
-; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
-; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5
+; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0
+; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
+; AVX-RECIP-NEXT: vaddps %ymm0, %ymm5, %ymm0
+; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
+; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0
-; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5
+; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
+; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm2) + ymm3
+; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm2) + ymm2
+; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vrcpps %ymm1, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4
-; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1
-; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: vrcpps %ymm0, %ymm2
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4
; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2
-; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0
-; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0
-; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5
+; BDVER2-NEXT: vfnmaddps %ymm4, %ymm5, %ymm0, %ymm0
+; BDVER2-NEXT: vfmaddps %ymm5, %ymm0, %ymm2, %ymm0
+; BDVER2-NEXT: vrcpps %ymm1, %ymm2
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm3
+; BDVER2-NEXT: vfmaddps %ymm2, %ymm3, %ymm2, %ymm2
+; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
+; BDVER2-NEXT: vfnmaddps %ymm5, %ymm4, %ymm1, %ymm1
+; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm2, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BTVER2-NEXT: vrcpps %ymm1, %ymm2
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
-; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
-; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
-; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1
-; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT: vrcpps %ymm0, %ymm2
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5
+; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0
+; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT: vrcpps %ymm1, %ymm2
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
+; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
+; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
+; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
+; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1
+; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_two_step2:
; SANDY: # %bb.0:
-; SANDY-NEXT: vrcpps %ymm1, %ymm2
-; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3
-; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
-; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
-; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1
-; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1
; SANDY-NEXT: vrcpps %ymm0, %ymm2
; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3
+; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5
+; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0
+; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT: vrcpps %ymm1, %ymm3
; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0
+; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2
+; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2
+; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_two_step2:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm1, %ymm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vmovaps %ymm2, %ymm4
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
; HASWELL-NEXT: vrcpps %ymm0, %ymm2
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT: vmovaps %ymm2, %ymm4
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5
+; HASWELL-NEXT: vrcpps %ymm1, %ymm6
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5
+; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm6) + ymm3
+; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm6) + ymm6
+; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
-; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
-; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5
+; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm5, %ymm0
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2
+; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2
+; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT: retq
;
; AVX512-LABEL: v16f32_two_step2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrcp14ps %zmm0, %zmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT: vmovaps %zmm1, %zmm3
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2
-; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3
-; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
+; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm2 = (zmm2 * zmm1) + zmm1
+; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm1
+; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm3
; AVX512-NEXT: retq
%div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
ret <16 x float> %div
@@ -1725,68 +1766,68 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step2:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm3, %xmm3
-; SSE-NEXT: rcpps %xmm2, %xmm2
-; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
+; SSE-NEXT: rcpps %xmm2, %xmm2
; SSE-NEXT: mulps {{.*}}(%rip), %xmm2
+; SSE-NEXT: rcpps %xmm3, %xmm3
; SSE-NEXT: mulps {{.*}}(%rip), %xmm3
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v16f32_no_step2:
; AVX-RECIP: # %bb.0:
-; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v16f32_no_step2:
; FMA-RECIP: # %bb.0:
-; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT: retq
;
; BDVER2-LABEL: v16f32_no_step2:
; BDVER2: # %bb.0:
-; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: vrcpps %ymm0, %ymm0
+; BDVER2-NEXT: vrcpps %ymm1, %ymm1
; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: v16f32_no_step2:
; BTVER2: # %bb.0:
-; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: vrcpps %ymm0, %ymm0
+; BTVER2-NEXT: vrcpps %ymm1, %ymm1
; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; BTVER2-NEXT: retq
;
; SANDY-LABEL: v16f32_no_step2:
; SANDY: # %bb.0:
-; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: vrcpps %ymm0, %ymm0
+; SANDY-NEXT: vrcpps %ymm1, %ymm1
; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; SANDY-NEXT: retq
;
; HASWELL-LABEL: v16f32_no_step2:
; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: vrcpps %ymm0, %ymm0
+; HASWELL-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; HASWELL-NEXT: retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
; HASWELL-NO-FMA: # %bb.0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT: retq
OpenPOWER on IntegriCloud