path: root/llvm/test/CodeGen/X86/recip-fastmath.ll
author     Sanjay Patel <spatel@rotateright.com>  2014-11-11 20:51:00 +0000
committer  Sanjay Patel <spatel@rotateright.com>  2014-11-11 20:51:00 +0000
commit     e2e589288fcac4fe8b5262eecfc7e3a47d256022 (patch)
tree       42f5eae1ee7ed781f7aac0ff26f1579900e7778f  /llvm/test/CodeGen/X86/recip-fastmath.ll
parent     07e694d29367ea411cd1c2f45c19298df9aec181 (diff)
download   bcm5719-llvm-e2e589288fcac4fe8b5262eecfc7e3a47d256022.tar.gz
           bcm5719-llvm-e2e589288fcac4fe8b5262eecfc7e3a47d256022.zip
Use rcpss/rcpps (X86) to speed up reciprocal calcs (PR21385).
This is a first step for generating SSE rcp instructions for reciprocal calcs when fast-math allows it. This is very similar to the rsqrt optimization enabled in D5658 ( http://reviews.llvm.org/rL220570 ).

For now, be conservative and only enable this for AMD btver2, where performance improves significantly in terms of both latency and throughput.

We may never enable this codegen for Intel Core* chips because the divider circuits are just too fast. On SandyBridge, divss can be as fast as 10 cycles versus the 21-cycle critical path for the rcp + mul + sub + mul + add estimate.

Follow-on patches may allow configuration of the number of Newton-Raphson refinement steps, add AVX512 support, and enable the optimization for more chips.

More background here: http://llvm.org/bugs/show_bug.cgi?id=21385

Differential Revision: http://reviews.llvm.org/D6175

llvm-svn: 221706
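For reference, a minimal sketch in C with SSE intrinsics of the estimate sequence described above (an illustration only, not the patch's actual DAG-combine code): one Newton-Raphson step refines the rcpss estimate e0 as e1 = e0 + e0*(1 - x*e0), which is the rcp + mul + sub + mul + add chain the new test checks for on btver2.

```c
#include <immintrin.h>

/* Illustrative sketch (not the LLVM codegen itself): hardware reciprocal
   estimate plus one Newton-Raphson refinement step for 1/x.
   e1 = e0 + e0*(1 - x*e0)  ==  e0*(2 - x*e0)                              */
static inline float recip_nr(float x) {
  __m128 vx  = _mm_set_ss(x);
  __m128 e0  = _mm_rcp_ss(vx);                    /* rcpss: ~12-bit estimate */
  __m128 xe  = _mm_mul_ss(vx, e0);                /* mulss: x * e0           */
  __m128 r   = _mm_sub_ss(_mm_set_ss(1.0f), xe);  /* subss: 1 - x * e0       */
  __m128 cor = _mm_mul_ss(e0, r);                 /* mulss: e0 * (1 - x*e0)  */
  __m128 e1  = _mm_add_ss(e0, cor);               /* addss: refined estimate */
  return _mm_cvtss_f32(e1);
}
```

The 21-cycle SandyBridge figure quoted above corresponds to this dependent five-instruction chain, which is why the transform is only enabled where divss/divps are substantially slower.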
Diffstat (limited to 'llvm/test/CodeGen/X86/recip-fastmath.ll')
-rw-r--r--    llvm/test/CodeGen/X86/recip-fastmath.ll    72
1 files changed, 72 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
new file mode 100644
index 00000000000..dd5563c965f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+ %div = fdiv fast float 1.0, %x
+ ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vsubss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+ %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+ %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }