author:    Qiu Chaofan <qiucofan@cn.ibm.com>  2019-09-12 07:51:24 +0000
committer: Qiu Chaofan <qiucofan@cn.ibm.com>  2019-09-12 07:51:24 +0000
commit:    b7fb5d0f6f2abf07f0ce6edc3397b4c291c1d25e (patch)
tree:      b5fb0cbe9b96f54540d31e4c268e1cf633b249d2 /llvm/test/CodeGen/X86/recip-fastmath.ll
parent:    75f65fe8d316407c5dd7cd9bfdfc760203947aa1 (diff)
[DAGCombiner] Improve division estimation of floating points.
The current implementation of division estimation loses precision because it
refines a reciprocal estimate first and then multiplies by the dividend. This
patch re-orders the arithmetic operations of the last refinement iteration in
DAGCombiner to improve the accuracy.
Reviewed By: Sanjay Patel, Jinsong Ji
Differential Revision: https://reviews.llvm.org/D66050
llvm-svn: 371713
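In scalar form, the re-ordering looks like the following. This is a minimal C sketch for illustration, not code from the patch; `rcp_estimate` is a hypothetical placeholder for the hardware estimate that `rcpss`/`rcpps` provide.

```c
#include <stdio.h>

/* Hypothetical stand-in for the ~12-bit rcpss/rcpps hardware estimate;
 * a real test would use _mm_rcp_ss() from <xmmintrin.h>. */
static float rcp_estimate(float y) {
    return 1.0f / y;
}

/* Old order: refine the reciprocal with one Newton-Raphson step,
 * then multiply by x. The final x * r rounds on top of the refined 1/y. */
static float fdiv_old(float x, float y) {
    float e = rcp_estimate(y);        /* e ~= 1/y           */
    float r = e + e * (1.0f - y * e); /* refined reciprocal */
    return x * r;
}

/* New order: form a quotient estimate first, then correct it with its
 * own residual, x - q*y. Algebraically equal to fdiv_old, since
 * q + e*(x - q*y) = x*e*(2 - e*y), but the rounding behaves better. */
static float fdiv_new(float x, float y) {
    float e = rcp_estimate(y); /* e ~= 1/y          */
    float q = x * e;           /* quotient estimate */
    return q + e * (x - q * y);
}

int main(void) {
    printf("old: %a\nnew: %a\n", fdiv_old(1.0f, 3.0f), fdiv_new(1.0f, 3.0f));
    return 0;
}
```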
Diffstat (limited to 'llvm/test/CodeGen/X86/recip-fastmath.ll')
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath.ll  180
1 file changed, 84 insertions(+), 96 deletions(-)
```diff
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index c7442f2bc20..c618c37e4fe 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -122,88 +122,87 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; SSE-LABEL: f32_one_step_variables:
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm1, %xmm2
-; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: subss %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: mulss %xmm2, %xmm3
-; SSE-NEXT: addss %xmm2, %xmm3
-; SSE-NEXT: mulss %xmm3, %xmm0
+; SSE-NEXT: mulss %xmm3, %xmm1
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm2, %xmm0
+; SSE-NEXT: addss %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-RECIP-LABEL: f32_one_step_variables:
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-RECIP-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
 ; AVX-RECIP-NEXT: retq
 ;
 ; FMA-RECIP-LABEL: f32_one_step_variables:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; FMA-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: f32_one_step_variables:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm2, %xmm1, %xmm1
-; BDVER2-NEXT: vfmaddss %xmm2, %xmm1, %xmm2, %xmm1
-; BDVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; BDVER2-NEXT: vfnmaddss %xmm0, %xmm3, %xmm1, %xmm0
+; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm2, %xmm0
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: f32_one_step_variables:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
 ; BTVER2-NEXT: retq
 ;
 ; SANDY-LABEL: f32_one_step_variables:
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SANDY-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; SANDY-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT: vaddss %xmm0, %xmm3, %xmm0
 ; SANDY-NEXT: retq
 ;
 ; HASWELL-LABEL: f32_one_step_variables:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; HASWELL-NEXT: vmovaps %xmm2, %xmm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm3, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm3, %xmm0
 ; HASWELL-NO-FMA-NEXT: retq
 ;
 ; AVX512-LABEL: f32_one_step_variables:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512-NEXT: retq
 %div = fdiv fast float %x, %y
 ret float %div
@@ -484,99 +483,88 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; SSE-LABEL: v4f32_one_step_variables:
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: mulps %xmm3, %xmm1
+; SSE-NEXT: subps %xmm1, %xmm0
+; SSE-NEXT: mulps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-RECIP-LABEL: v4f32_one_step_variables:
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2
-; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-RECIP-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; AVX-RECIP-NEXT: retq
 ;
 ; FMA-RECIP-LABEL: v4f32_one_step_variables:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; FMA-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: v4f32_one_step_variables:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpps %xmm1, %xmm2
-; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm2, %xmm1, %xmm1
-; BDVER2-NEXT: vfmaddps %xmm2, %xmm1, %xmm2, %xmm1
-; BDVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; BDVER2-NEXT: vfnmaddps %xmm0, %xmm3, %xmm1, %xmm0
+; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm2, %xmm0
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: v4f32_one_step_variables:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; BTVER2-NEXT: vrcpps %xmm1, %xmm2
-; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; BTVER2-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; BTVER2-NEXT: retq
 ;
 ; SANDY-LABEL: v4f32_one_step_variables:
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpps %xmm1, %xmm2
-; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SANDY-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; SANDY-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; SANDY-NEXT: retq
 ;
 ; HASWELL-LABEL: v4f32_one_step_variables:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpps %xmm1, %xmm2
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3
-; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; HASWELL-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; HASWELL-NEXT: vmovaps %xmm2, %xmm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm1, %xmm2
-; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; HASWELL-NO-FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; HASWELL-NO-FMA-NEXT: retq
 ;
-; KNL-LABEL: v4f32_one_step_variables:
-; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm1, %xmm2
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm3
-; KNL-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; KNL-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: v4f32_one_step_variables:
-; SKX: # %bb.0:
-; SKX-NEXT: vrcpps %xmm1, %xmm2
-; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + mem
-; SKX-NEXT: vfmadd132ps {{.*#+}} xmm1 = (xmm1 * xmm2) + xmm2
-; SKX-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: v4f32_one_step_variables:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vrcpps %xmm1, %xmm2
+; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: retq
 %div = fdiv fast <4 x float> %x, %y
 ret <4 x float> %div
 }
```
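For orientation when reading the new check lines: in the updated SSE sequences, %xmm0 holds x, %xmm1 holds y, %xmm2 holds the estimate e = rcp(y), and %xmm3 holds the quotient estimate q = x*e; the trailing subtract/multiply/add triple then computes q + e*(x - q*y). Note also that the patch lets the KNL and SKX check prefixes merge into a single AVX512 prefix, since both targets now produce identical code.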

