transform fadd chains to increase parallelism

This is a compromise: with this simple patch, we should always handle a chain of exactly 3 operations optimally, but we're not generating the optimal balanced binary tree for a longer sequence. In general, this transform will reduce the dependency chain for a sequence of instructions using N operands from a worst case N-1 dependent operations to N/2 dependent operations. The optimal balanced binary tree would reduce the chain to log2(N). The trade-off for not dealing with longer sequences is: (1) we have less complexity in the compiler, (2) we avoid unknown compile-time blowup calculating a balanced tree, and (3) we don't need to worry about the increased register pressure required to parallelize longer sequences. It also seems unlikely that we would ever encounter really long strings of dependent ops like that in the wild, but I'm not sure how to verify that speculation. FWIW, I see no perf difference for test-suite running on btver2 (x86-64) with -ffast-math and this patch. We can extend this patch to cover other associative operations such as fmul, fmax, fmin, integer add, integer mul. This is a partial fix for: https://llvm.org/bugs/show_bug.cgi?id=17305 and if extended: https://llvm.org/bugs/show_bug.cgi?id=21768 https://llvm.org/bugs/show_bug.cgi?id=23116 The issue also came up in: http://reviews.llvm.org/D8941 Differential Revision: http://reviews.llvm.org/D9232 llvm-svn: 236031
author: Sanjay Patel <spatel@rotateright.com> 2015-04-28 21:03:22 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2015-04-28 21:03:22 +0000
commit: 2fbc4e5c4942a3a816c2d13913db0ee65990f448 (patch)
tree: 64ec8eecd85d44b264ab61fc3a73e32b692fbfff /llvm/test/CodeGen/X86/fp-fast.ll
parent: 659ece9ddbb6bb8a351bfd6d9337e093af2248be (diff)
download: bcm5719-llvm-2fbc4e5c4942a3a816c2d13913db0ee65990f448.tar.gz
bcm5719-llvm-2fbc4e5c4942a3a816c2d13913db0ee65990f448.zip
1 file changed, 43 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/fp-fast.ll b/llvm/test/CodeGen/X86/fp-fast.ll
index 479f60d91f1..eb2ebd9119a 100644
--- a/llvm/test/CodeGen/X86/fp-fast.ll
+++ b/llvm/test/CodeGen/X86/fp-fast.ll
@@ -113,3 +113,46 @@ define float @test11(float %a) {
   %t2 = fadd float %a, %t1
   ret float %t2
 }
+
+; Verify that the first two adds are independent (neither reads the other's
+; result); their destination registers are the source registers of the third add.
+
+define float @reassociate_adds1(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add0 = fadd float %a, %b
+  %add1 = fadd float %add0, %c ; depends on %add0
+  %add2 = fadd float %add1, %d ; depends on %add1, so the IR is a serial chain of three adds
+  ret float %add2
+}
+
+define float @reassociate_adds2(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add0 = fadd float %a, %b
+  %add1 = fadd float %c, %add0 ; commuted: %add0 is the second operand here (first in reassociate_adds1); same expected asm
+  %add2 = fadd float %add1, %d ; depends on %add1, so the IR is still a serial chain
+  ret float %add2
+}
+
+define float @reassociate_adds3(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add0 = fadd float %a, %b
+  %add1 = fadd float %add0, %c ; depends on %add0
+  %add2 = fadd float %d, %add1 ; commuted: %add1 is the second operand here (first in reassociate_adds1); same expected asm
+  ret float %add2
+}
+
author	Sanjay Patel <spatel@rotateright.com>	2015-04-28 21:03:22 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2015-04-28 21:03:22 +0000
commit	2fbc4e5c4942a3a816c2d13913db0ee65990f448 (patch)
tree	64ec8eecd85d44b264ab61fc3a73e32b692fbfff /llvm/test/CodeGen/X86/fp-fast.ll
parent	659ece9ddbb6bb8a351bfd6d9337e093af2248be (diff)
download	bcm5719-llvm-2fbc4e5c4942a3a816c2d13913db0ee65990f448.tar.gz bcm5719-llvm-2fbc4e5c4942a3a816c2d13913db0ee65990f448.zip