summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
authorSanjay Patel <spatel@rotateright.com>2015-04-15 15:22:55 +0000
committerSanjay Patel <spatel@rotateright.com>2015-04-15 15:22:55 +0000
commit7024b8121a9e51d468302e43ed41aeb6e1fb7274 (patch)
tree48ff3f56bac06c236228c51546fe471acba71c48 /llvm/test/CodeGen
parent280d8dc9f06989dea6b304d780f43e522146a6eb (diff)
downloadbcm5719-llvm-7024b8121a9e51d468302e43ed41aeb6e1fb7274.tar.gz
bcm5719-llvm-7024b8121a9e51d468302e43ed41aeb6e1fb7274.zip
[x86] Implement combineRepeatedFPDivisors
Set the transform bar at 2 divisions because the fastest current x86 FP divider circuit is in SandyBridge / Haswell at 10 cycle latency (best case) relative to a 5 cycle multiplier. So that's the worst case for this transform (no latency win), but multiplies are obviously pipelined while divisions are not, so there's still a big throughput win which we would expect to show up in typical FP code. These are the sequences I'm comparing: divss %xmm2, %xmm0 mulss %xmm1, %xmm0 divss %xmm2, %xmm0 Becomes: movss LCPI0_0(%rip), %xmm3 ## xmm3 = mem[0],zero,zero,zero divss %xmm2, %xmm3 mulss %xmm3, %xmm0 mulss %xmm1, %xmm0 mulss %xmm3, %xmm0 [Ignore for the moment that we don't optimize the chain of 3 multiplies into 2 independent fmuls followed by 1 dependent fmul...this is the DAG version of: https://llvm.org/bugs/show_bug.cgi?id=21768 ...if we fix that, then the transform becomes even more profitable on all targets.] Differential Revision: http://reviews.llvm.org/D8941 llvm-svn: 235012
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/X86/fdiv-combine.ll31
1 files changed, 31 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/fdiv-combine.ll b/llvm/test/CodeGen/X86/fdiv-combine.ll
new file mode 100644
index 00000000000..279bb0624ac
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fdiv-combine.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Anything more than one division using a single divisor operand
+; should be converted into a reciprocal and multiplication.
+
+define float @div1_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div1_arcp:
+; CHECK: # BB#0:
+; CHECK-NEXT: divss %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %div1 = fdiv arcp float %x, %y
+ ret float %div1
+}
+
+define float @div2_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div2_arcp:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: divss %xmm2, %xmm3
+; CHECK-NEXT: mulss %xmm3, %xmm0
+; CHECK-NEXT: mulss %xmm1, %xmm0
+; CHECK-NEXT: mulss %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %div1 = fdiv arcp float %x, %z
+ %mul = fmul arcp float %div1, %y
+ %div2 = fdiv arcp float %mul, %z
+ ret float %div2
+}
+
+; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
+attributes #0 = { "unsafe-fp-math"="true" }
OpenPOWER on IntegriCloud