CodeGen: BlockPlacement: Increase tail duplication size for O3.

At O3 we are more willing to increase size if we believe it will improve performance. The current threshold for tail-duplication of 2 instructions is conservative, and can be relaxed at O3. Benchmark results: llvm test-suite: 6% improvement in aha, due to duplication of loop latch 3% improvement in hexxagon 2% slowdown in lpbench. Seems related, but couldn't completely diagnose. Internal google benchmark: Produces 4% improvement on internal google protocol buffer serialization benchmarks. Differential-Revision: https://reviews.llvm.org/D32324 llvm-svn: 303084
author: Kyle Butt <kyle+llvm@iteratee.net> 2017-05-15 17:30:47 +0000
committer: Kyle Butt <kyle+llvm@iteratee.net> 2017-05-15 17:30:47 +0000
commit: 7d531daecec9470a4540b5303f23786b28c3c6f9 (patch)
tree: 05eaaf45a01643a7b567bd6c53da1aa09e0d4de5 /llvm
parent: 886d2e6ef03eb1f429d193480941e50f48bf9a1d (diff)
download: bcm5719-llvm-7d531daecec9470a4540b5303f23786b28c3c6f9.tar.gz
bcm5719-llvm-7d531daecec9470a4540b5303f23786b28c3c6f9.zip
3 files changed, 131 insertions, 12 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1..5003115a770 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
              "that won't conflict."), cl::init(2),
     cl::Hidden);
 
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+    "tail-dup-placement-aggressive-threshold",
+    cl::desc("Instruction cutoff for aggressive tail duplication during "
+             "layout. Used at -O3. Tail merging during layout is forced to "
+             "have a threshold that won't conflict."), cl::init(3),
+    cl::Hidden);
+
 // Heuristic for tail duplication.
 static cl::opt<unsigned> TailDupPlacementPenalty(
     "tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   assert(BlockToChain.empty());
   assert(ComputedEdges.empty());
 
+  unsigned TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For agressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication. This
+    // increases size pressure, so we only do it at O3
+    // Do this unless only the regular threshold is explicitly set.
+    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+      TailDupSize = TailDupPlacementAggressiveThreshold;
+  }
+
   if (TailDupPlacement) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
-  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
   // TailMerge can create jump into if branches that make CFG irreducible for
   // HW that requires structured CFG.
   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
index c9b5bf8c9ee..9665901e874 100644
--- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
@@ -99,11 +100,9 @@ exit:
 ; test1
 ; test2
 ; test3
-; test4
 ; optional1
 ; optional2
 ; optional3
-; optional4
 ; exit
 ; even for 50/50 branches.
 ; Tail duplication puts test n+1 at the end of optional n
@@ -163,6 +162,98 @@ exit:
 }
 
 ; Intended layout:
+; The chain-of-triangles based duplicating produces the layout when 3
+; instructions are allowed for tail-duplication.
+; test1
+; test2
+; test3
+; optional1
+; optional2
+; optional3
+; exit
+;
+; Otherwise it produces the layout:
+; test1
+; optional1
+; test2
+; optional2
+; test3
+; optional3
+; exit
+
+;CHECK-LABEL: straight_test_3_instr_test:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30
+;CHECK-NEXT: cmplwi {{[0-9]+}}, 2
+
+;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: # %test2
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK-O3: blr
+;CHECK-O3-NEXT: .[[OPT1LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-O3-NEXT: .[[OPT2LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-O3-NEXT: .[[OPT3LABEL]]:
+;CHECK-O3: b .[[EXITLABEL]]
+
+;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional1
+;CHECK-O2: .[[TEST2LABEL]]: # %test2
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional2
+;CHECK-O2: .[[TEST3LABEL]]: # %test3
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional3
+;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK-O2: blr
+
+
+define void @straight_test_3_instr_test(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 3
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 2
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
+optional1:
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 12
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 8
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
+optional2:
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 48
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 32
+  br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
+optional3:
+  call void @c()
+  br label %exit
+exit:
+  ret void
+}
+
+; Intended layout:
 ; The chain-based outlining produces the layout
 ; entry
 ; --- Begin loop ---
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 8f4c8442db6..68ab3f9f320 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X32-NEXT:    jne .LBB1_8
 ; X32-NEXT:  .LBB1_7:
 ; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT:    jmp .LBB1_9
+; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    je .LBB1_10
+; X32-NEXT:    jmp .LBB1_11
 ; X32-NEXT:  .LBB1_1:
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
@@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X32-NEXT:    je .LBB1_7
 ; X32-NEXT:  .LBB1_8: # %entry
 ; X32-NEXT:    xorps %xmm3, %xmm3
-; X32-NEXT:  .LBB1_9: # %entry
 ; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    jne .LBB1_11
-; X32-NEXT:  # BB#10:
+; X32-NEXT:  .LBB1_10:
 ; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-NEXT:  .LBB1_11: # %entry
 ; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X64-NEXT:    jne .LBB1_8
 ; X64-NEXT:  .LBB1_7:
 ; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT:    jmp .LBB1_9
+; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT:    testl %esi, %esi
+; X64-NEXT:    je .LBB1_10
+; X64-NEXT:    jmp .LBB1_11
 ; X64-NEXT:  .LBB1_1:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    testl %edx, %edx
@@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X64-NEXT:    je .LBB1_7
 ; X64-NEXT:  .LBB1_8: # %entry
 ; X64-NEXT:    xorps %xmm3, %xmm3
-; X64-NEXT:  .LBB1_9: # %entry
 ; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X64-NEXT:    testl %esi, %esi
 ; X64-NEXT:    jne .LBB1_11
-; X64-NEXT:  # BB#10:
+; X64-NEXT:  .LBB1_10:
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:  .LBB1_11: # %entry
 ; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
author	Kyle Butt <kyle+llvm@iteratee.net>	2017-05-15 17:30:47 +0000
committer	Kyle Butt <kyle+llvm@iteratee.net>	2017-05-15 17:30:47 +0000
commit	7d531daecec9470a4540b5303f23786b28c3c6f9 (patch)
tree	05eaaf45a01643a7b567bd6c53da1aa09e0d4de5 /llvm
parent	886d2e6ef03eb1f429d193480941e50f48bf9a1d (diff)
download	bcm5719-llvm-7d531daecec9470a4540b5303f23786b28c3c6f9.tar.gz bcm5719-llvm-7d531daecec9470a4540b5303f23786b28c3c6f9.zip