diff options
Diffstat (limited to 'llvm/test/CodeGen/X86')
| -rw-r--r-- | llvm/test/CodeGen/X86/recip-fastmath.ll | 180 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/recip-fastmath2.ll | 1139 |
2 files changed, 674 insertions, 645 deletions
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll index c7442f2bc20..c618c37e4fe 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -122,88 +122,87 @@ define float @f32_one_step_variables(float %x, float %y) #1 { ; SSE-LABEL: f32_one_step_variables: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm1, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-NEXT: subss %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 -; SSE-NEXT: addss %xmm2, %xmm3 -; SSE-NEXT: mulss %xmm3, %xmm0 +; SSE-NEXT: mulss %xmm3, %xmm1 +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: mulss %xmm2, %xmm0 +; SSE-NEXT: addss %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_variables: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX-RECIP-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_variables: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; FMA-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_one_step_variables: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm2, %xmm1, %xmm1 -; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm2, %xmm1 -; BDVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm0, %xmm3, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm2, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_variables: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_variables: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SANDY-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_variables: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_variables: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm3, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_variables: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; AVX512-NEXT: vmovaps %xmm2, %xmm0 ; AVX512-NEXT: retq %div = fdiv fast float %x, %y ret float %div @@ -484,99 +483,88 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1 ; SSE-LABEL: v4f32_one_step_variables: ; SSE: # %bb.0: ; SSE-NEXT: rcpps %xmm1, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: addps %xmm2, %xmm3 -; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step_variables: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2 -; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step_variables: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; FMA-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_one_step_variables: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm1, %xmm2 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm2, %xmm1, %xmm1 -; BDVER2-NEXT: vfmaddps %xmm2, %xmm1, %xmm2, %xmm1 -; BDVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm0, %xmm3, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm2, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step_variables: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm1, %xmm2 -; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step_variables: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm1, %xmm2 -; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step_variables: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm1, %xmm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2 -; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm3, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v4f32_one_step_variables: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %xmm1, %xmm2 -; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3 -; KNL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; KNL-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v4f32_one_step_variables: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %xmm1, %xmm2 -; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2 -; SKX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; AVX512-LABEL: v4f32_one_step_variables: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm1, %xmm2 +; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3 +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq %div = fdiv fast <4 x float> %x, %y ret <4 x float> %div } diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll index de511b411b3..a2bd6c2081c 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -32,89 +32,94 @@ define float @f32_one_step_2(float %x) #1 { ; SSE-LABEL: f32_one_step_2: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: mulss %xmm1, %xmm3 +; SSE-NEXT: mulss %xmm3, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: mulss %xmm2, %xmm1 -; SSE-NEXT: addss %xmm2, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 +; SSE-NEXT: addss %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: f32_one_step_2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_one_step_2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_one_step_2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm2, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_one_step_2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_one_step_2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast float 3456.0, %x ret float %div @@ -225,19 +230,20 @@ define float @f32_one_step_2_divs(float %x) #1 { define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # %bb.0: -; SSE-NEXT: rcpss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulss %xmm2, %xmm3 +; SSE-NEXT: rcpss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulss %xmm1, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm2, %xmm3 +; SSE-NEXT: mulss %xmm1, %xmm3 +; SSE-NEXT: addss %xmm1, %xmm3 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subss %xmm3, %xmm4 -; SSE-NEXT: mulss %xmm2, %xmm4 -; SSE-NEXT: addss %xmm2, %xmm4 -; SSE-NEXT: mulss %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: mulss %xmm1, %xmm2 +; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: subss %xmm0, %xmm1 -; SSE-NEXT: mulss %xmm4, %xmm1 -; SSE-NEXT: addss %xmm4, %xmm1 -; SSE-NEXT: mulss {{.*}}(%rip), %xmm1 +; SSE-NEXT: mulss %xmm3, %xmm1 +; SSE-NEXT: addss %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -249,49 +255,51 @@ define float @f32_two_step_2(float %x) #2 { ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: f32_two_step_2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_two_step_2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 -; BDVER2-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; BDVER2-NEXT: vfmaddss %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddss %xmm4, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_two_step_2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 +; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: f32_two_step_2: @@ -302,23 +310,24 @@ define float @f32_two_step_2(float %x) #2 { ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: @@ -329,23 +338,24 @@ define float @f32_two_step_2(float %x) #2 { ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast float 6789.0, %x ret float %div @@ -355,100 +365,95 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; SSE-LABEL: v4f32_one_step2: ; SSE: # %bb.0: ; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm3, %xmm0 ; SSE-NEXT: subps %xmm0, %xmm1 ; SSE-NEXT: mulps %xmm2, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v4f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm2, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v4f32_one_step2: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %xmm0, %xmm1 -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v4f32_one_step2: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %xmm0, %xmm1 -; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: retq +; AVX512-LABEL: v4f32_one_step2: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %xmm0, %xmm1 +; AVX512-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm3 +; AVX512-NEXT: retq %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x ret <4 x float> %div } @@ -569,19 +574,20 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SSE-LABEL: v4f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: subps %xmm3, %xmm4 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: rcpps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: subps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm1, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: subps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: addps %xmm4, %xmm1 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: mulps %xmm3, %xmm1 +; SSE-NEXT: addps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -593,49 +599,51 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; AVX-RECIP-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v4f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 -; BDVER2-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; BDVER2-NEXT: vfmaddps %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 +; BDVER2-NEXT: vfnmaddps %xmm4, %xmm3, %xmm0, %xmm0 +; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 +; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 +; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v4f32_two_step2: @@ -646,23 +654,24 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; SANDY-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: @@ -673,23 +682,24 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm3 +; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v4f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %xmm0, %xmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 ; AVX512-NEXT: retq %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x ret <4 x float> %div @@ -698,20 +708,22 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; SSE-LABEL: v8f32_one_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm1, %xmm4 +; SSE-NEXT: rcpps %xmm0, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm3, %xmm2 +; SSE-NEXT: addps %xmm4, %xmm2 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: mulps %xmm4, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm4, %xmm3 -; SSE-NEXT: rcpps %xmm0, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: subps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq @@ -719,88 +731,82 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; AVX-RECIP-LABEL: v8f32_one_step2: ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_one_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v8f32_one_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BDVER2-NEXT: vfnmaddps %ymm2, %ymm3, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; -; KNL-LABEL: v8f32_one_step2: -; KNL: # %bb.0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 -; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: retq -; -; SKX-LABEL: v8f32_one_step2: -; SKX: # %bb.0: -; SKX-NEXT: vrcpps %ymm0, %ymm1 -; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem -; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: v8f32_one_step2: +; AVX512: # %bb.0: +; AVX512-NEXT: vrcpps %ymm0, %ymm1 +; AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm3 +; AVX512-NEXT: retq %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x ret <8 x float> %div } @@ -930,33 +936,35 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SSE-LABEL: v8f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: mulps %xmm3, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm3, %xmm5 -; SSE-NEXT: addps %xmm3, %xmm5 -; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm5, %xmm3 -; SSE-NEXT: addps %xmm5, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm4, %xmm5 -; SSE-NEXT: mulps %xmm1, %xmm5 -; SSE-NEXT: addps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: subps %xmm0, %xmm2 ; SSE-NEXT: mulps %xmm5, %xmm2 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: mulps %xmm5, %xmm0 -; SSE-NEXT: addps %xmm5, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: rcpps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm4 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: addps %xmm0, %xmm4 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm4, %xmm3 +; SSE-NEXT: addps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: retq ; @@ -968,49 +976,51 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v8f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 -; BDVER2-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vfmaddps %ymm1, %ymm2, %ymm1, %ymm1 +; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 +; BDVER2-NEXT: vfnmaddps %ymm4, %ymm3, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v8f32_two_step2: @@ -1021,23 +1031,24 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; SANDY-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; SANDY-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: @@ -1048,23 +1059,24 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm3, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v8f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %ymm0, %ymm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %ymm1, %ymm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm3 ; AVX512-NEXT: retq %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x ret <8 x float> %div @@ -1088,9 +1100,9 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; SSE-LABEL: v8f32_no_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq ; @@ -1106,153 +1118,165 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { ; SSE-LABEL: v16f32_one_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: rcpps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm4, %xmm3 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: addps %xmm2, %xmm3 -; SSE-NEXT: rcpps %xmm5, %xmm4 -; SSE-NEXT: mulps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm4, %xmm2 -; SSE-NEXT: addps %xmm4, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm0, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: addps %xmm6, %xmm0 +; SSE-NEXT: rcpps %xmm4, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm1, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm4 +; SSE-NEXT: subps %xmm4, %xmm1 ; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subps %xmm1, %xmm4 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm4, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm2 +; SSE-NEXT: subps %xmm2, %xmm4 ; SSE-NEXT: mulps %xmm5, %xmm4 -; SSE-NEXT: addps %xmm5, %xmm4 -; SSE-NEXT: rcpps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: subps %xmm6, %xmm0 -; SSE-NEXT: mulps %xmm1, %xmm0 -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm4 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm4 +; SSE-NEXT: rcpps %xmm3, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: mulps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm6, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_one_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_one_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_one_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vrcpps %ymm0, %ymm4 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 +; BDVER2-NEXT: vrcpps %ymm0, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vrcpps %ymm1, %ymm5 +; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 -; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vfmaddps %ymm4, %ymm0, %ymm2, %ymm0 +; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4 +; BDVER2-NEXT: vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 +; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm5, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_one_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vrcpps %ymm0, %ymm4 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 -; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0 -; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm2 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 ; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm1 +; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vaddps %ymm1, %ymm5, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_one_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; SANDY-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_one_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vrcpps %ymm0, %ymm4 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 +; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4 +; HASWELL-NEXT: vrcpps %ymm1, %ymm2 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + ymm4 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_one_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm4, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm2, %ymm1 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v16f32_one_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 -; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm3 ; AVX512-NEXT: retq %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x ret <16 x float> %div @@ -1436,228 +1460,245 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; SSE-LABEL: v16f32_two_step2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: rcpps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm3, %xmm7 -; SSE-NEXT: mulps %xmm2, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subps %xmm6, %xmm3 -; SSE-NEXT: mulps %xmm7, %xmm3 -; SSE-NEXT: addps %xmm7, %xmm3 -; SSE-NEXT: rcpps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: mulps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm2, %xmm7 -; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subps %xmm5, %xmm2 -; SSE-NEXT: mulps %xmm7, %xmm2 -; SSE-NEXT: addps %xmm7, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: mulps %xmm5, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 -; SSE-NEXT: mulps %xmm5, %xmm7 -; SSE-NEXT: addps %xmm5, %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: subps %xmm1, %xmm5 -; SSE-NEXT: mulps %xmm7, %xmm5 -; SSE-NEXT: addps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm0, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: mulps %xmm7, %xmm0 +; SSE-NEXT: addps %xmm5, %xmm0 ; SSE-NEXT: rcpps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: mulps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: subps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: mulps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 ; SSE-NEXT: mulps %xmm1, %xmm7 ; SSE-NEXT: addps %xmm1, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm4 +; SSE-NEXT: subps %xmm4, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: mulps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: subps %xmm5, %xmm7 +; SSE-NEXT: mulps %xmm4, %xmm7 +; SSE-NEXT: addps %xmm4, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm4, %xmm5 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: subps %xmm2, %xmm4 ; SSE-NEXT: mulps %xmm7, %xmm4 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: mulps %xmm7, %xmm0 -; SSE-NEXT: addps %xmm7, %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm5 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 -; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: addps %xmm5, %xmm4 +; SSE-NEXT: rcpps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: subps %xmm5, %xmm6 +; SSE-NEXT: mulps %xmm2, %xmm6 +; SSE-NEXT: addps %xmm2, %xmm6 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm3 +; SSE-NEXT: subps %xmm3, %xmm5 +; SSE-NEXT: mulps %xmm6, %xmm5 +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_two_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; AVX-RECIP-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; AVX-RECIP-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; AVX-RECIP-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX-RECIP-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 +; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm2) + ymm3 +; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm2) + ymm2 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 ; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 -; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5 +; BDVER2-NEXT: vfnmaddps %ymm4, %ymm5, %ymm0, %ymm0 +; BDVER2-NEXT: vfmaddps %ymm5, %ymm0, %ymm2, %ymm0 +; BDVER2-NEXT: vrcpps %ymm1, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm3 +; BDVER2-NEXT: vfmaddps %ymm2, %ymm3, %ymm2, %ymm2 +; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 +; BDVER2-NEXT: vfnmaddps %ymm5, %ymm4, %ymm1, %ymm1 +; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_two_step2: ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm2 +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 +; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 +; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 +; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 +; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1 +; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_two_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; SANDY-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm3 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; SANDY-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; SANDY-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; SANDY-NEXT: vsubps %ymm2, %ymm4, %ymm2 +; SANDY-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; SANDY-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_two_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vmovaps %ymm2, %ymm4 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5 +; HASWELL-NEXT: vrcpps %ymm1, %ymm6 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 +; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm6) + ymm3 +; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm6) + ymm6 +; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4 +; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm4 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_two_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; HASWELL-NO-FMA-NEXT: vmulps %ymm5, %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm3 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm1, %ymm2 +; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm4, %ymm2 +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm3, %ymm2 +; HASWELL-NO-FMA-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm4 +; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v16f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 ; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vmovaps %zmm1, %zmm3 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 -; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 -; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 +; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm2 = (zmm2 * zmm1) + zmm1 +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm3 ; AVX512-NEXT: retq %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x ret <16 x float> %div @@ -1725,68 +1766,68 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 { define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 { ; SSE-LABEL: v16f32_no_step2: ; SSE: # %bb.0: -; SSE-NEXT: rcpps %xmm3, %xmm3 -; SSE-NEXT: rcpps %xmm2, %xmm2 -; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: rcpps %xmm0, %xmm0 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 ; SSE-NEXT: retq ; ; AVX-RECIP-LABEL: v16f32_no_step2: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_step2: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 +; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_no_step2: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vrcpps %ymm1, %ymm1 ; BDVER2-NEXT: vrcpps %ymm0, %ymm0 +; BDVER2-NEXT: vrcpps %ymm1, %ymm1 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_no_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vrcpps %ymm1, %ymm1 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 +; BTVER2-NEXT: vrcpps %ymm1, %ymm1 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_no_step2: ; SANDY: # %bb.0: -; SANDY-NEXT: vrcpps %ymm1, %ymm1 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 +; SANDY-NEXT: vrcpps %ymm1, %ymm1 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; SANDY-NEXT: retq ; ; HASWELL-LABEL: v16f32_no_step2: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v16f32_no_step2: ; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 +; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 ; HASWELL-NO-FMA-NEXT: retq |

