path: root/llvm/test
author    Michael Kuperstein <mkuper@google.com>    2017-02-24 18:41:32 +0000
committer Michael Kuperstein <mkuper@google.com>    2017-02-24 18:41:32 +0000
commit    46b131e3f85b448151b717f1d240e0a28a10f579 (patch)
tree      51985b23bf9e4331e2b9de64f1fb76d3b849f5fd /llvm/test
parent    cdf2bd656a68b614a2224d69072f43690b5141d2 (diff)
[CGP] Split some critical edges coming out of indirect branches
Splitting a critical edge whose source block ends in an indirectbr is hard in general, because it requires changing the memory the indirectbr reads. But if a block has only a single indirectbr predecessor (which is the common case), we can simulate splitting that edge by splitting the destination block instead and retargeting the *direct* branches.

This is motivated by the use of computed gotos in python 2.7: PyEval_EvalFrame() ends up using an indirect branch with ~100 successors and passes a constant to each of them. Since MachineSink can't break indirect critical edges on demand (and doing this in MIR doesn't look feasible), we end up emitting ~100 defs of constant-holding registers in the predecessor block, even though each successor uses only one of those constants. So, at each computed goto, we needlessly spill about 100 constants to the stack.

The end result is that a clang-compiled python interpreter can be ~2.5x slower on a simple python reduction loop than a gcc-compiled interpreter.

Differential Revision: https://reviews.llvm.org/D29916

llvm-svn: 296149
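For readers skimming the diff, here is a minimal hand-written sketch of the input shape the pass now handles and of the CFG it produces. It is illustrative only: the names below (such as @sketch and %dispatch) are made up and do not appear in the patch.

  @sketch.targets = constant [2 x i8*] [i8* blockaddress(@sketch, %target),
                                        i8* blockaddress(@sketch, %done)]

  define void @sketch(i64* %p) {
  entry:
    br label %target                    ; direct edge into %target

  target:
    ; %target has two predecessors and %dispatch has two successors, so
    ; %dispatch -> %target is a critical edge. It cannot be split the usual
    ; way because the predecessor's terminator is an indirectbr.
    %i = phi i64 [ 0, %entry ], [ %i.next, %dispatch ]
    store i64 %i, i64* %p
    %i.next = add i64 %i, 1
    br label %dispatch

  dispatch:
    %idx = and i64 %i, 1
    %slot = getelementptr inbounds [2 x i8*], [2 x i8*]* @sketch.targets, i64 0, i64 %idx
    %addr = load i8*, i8** %slot
    indirectbr i8* %addr, [label %target, label %done]

  done:
    ret void
  }

  ; After CodeGenPrepare (roughly the shape the new computedgoto.ll test
  ; checks): the direct branch from %entry is retargeted to a clone, so
  ; %target keeps only its indirectbr predecessor, and the phi moves into a
  ; common split block:
  ;
  ;   entry          -> %target.clone
  ;   dispatch       -> %target          (indirectbr, unchanged)
  ;   %target        -> %.split
  ;   %target.clone  -> %.split
  ;   %.split:  %i = phi i64 [ %i.next, %target ], [ 0, %target.clone ]

The @simple and @loop functions in the new test below exercise exactly this clone/.split structure.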
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/ARM/indirectbr.ll                        1
-rw-r--r--  llvm/test/CodeGen/MSP430/indirectbr2.ll                    2
-rw-r--r--  llvm/test/CodeGen/PowerPC/indirectbr.ll                   36
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll  254
4 files changed, 280 insertions, 13 deletions
diff --git a/llvm/test/CodeGen/ARM/indirectbr.ll b/llvm/test/CodeGen/ARM/indirectbr.ll
index d15ef14b449..90defad43a7 100644
--- a/llvm/test/CodeGen/ARM/indirectbr.ll
+++ b/llvm/test/CodeGen/ARM/indirectbr.ll
@@ -47,6 +47,7 @@ L3: ; preds = %L4, %bb2
br label %L2
L2: ; preds = %L3, %bb2
+; THUMB-LABEL: %L1.clone
; THUMB: muls
%res.2 = phi i32 [ %res.1, %L3 ], [ 1, %bb2 ] ; <i32> [#uses=1]
%phitmp = mul i32 %res.2, 6 ; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/MSP430/indirectbr2.ll b/llvm/test/CodeGen/MSP430/indirectbr2.ll
index b0b4f1cbfd2..55598b97d91 100644
--- a/llvm/test/CodeGen/MSP430/indirectbr2.ll
+++ b/llvm/test/CodeGen/MSP430/indirectbr2.ll
@@ -5,7 +5,7 @@ define internal i16 @foo(i16 %i) nounwind {
entry:
%tmp1 = getelementptr inbounds [5 x i8*], [5 x i8*]* @C.0.2070, i16 0, i16 %i ; <i8**> [#uses=1]
%gotovar.4.0 = load i8*, i8** %tmp1, align 4 ; <i8*> [#uses=1]
-; CHECK: br .LC.0.2070(r12)
+; CHECK: br .LC.0.2070(r15)
indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
L5: ; preds = %bb2
diff --git a/llvm/test/CodeGen/PowerPC/indirectbr.ll b/llvm/test/CodeGen/PowerPC/indirectbr.ll
index d1e03ca7773..c040d7859a8 100644
--- a/llvm/test/CodeGen/PowerPC/indirectbr.ll
+++ b/llvm/test/CodeGen/PowerPC/indirectbr.ll
@@ -17,23 +17,35 @@ entry:
bb2: ; preds = %entry, %bb3
%gotovar.4.0 = phi i8* [ %gotovar.4.0.pre, %bb3 ], [ %0, %entry ] ; <i8*> [#uses=1]
; PIC: mtctr
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
; PIC-NEXT: bctr
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
; STATIC: mtctr
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
; STATIC-NEXT: bctr
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
; PPC64: mtctr
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
; PPC64-NEXT: bctr
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
bb3: ; preds = %entry
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll b/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
new file mode 100644
index 00000000000..06621e3b1df
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(i32) local_unnamed_addr
+declare void @useptr([2 x i8*]*) local_unnamed_addr
+
+; CHECK: @simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+@simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+
+; CHECK: @multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+@multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+
+; CHECK: @loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+@loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+
+; CHECK: @nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+@nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+
+; Check that we break the critical edge when a jump table has only one use.
+define void @simple(i32* nocapture readonly %p) {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0_CLONE:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1_CLONE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: .split:
+; CHECK-NEXT: [[MERGE:%.*]] = phi i32* [ [[PTR:%.*]], [[BB0:%.*]] ], [ [[INCDEC_PTR]], [[BB0_CLONE]] ]
+; CHECK-NEXT: [[MERGE2:%.*]] = phi i32 [ 0, [[BB0]] ], [ [[INITVAL]], [[BB0_CLONE]] ]
+; CHECK-NEXT: tail call void @use(i32 [[MERGE2]])
+; CHECK-NEXT: br label [[INDIRECTGOTO:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[DOTSPLIT3:%.*]]
+; CHECK: .split3:
+; CHECK-NEXT: [[MERGE5:%.*]] = phi i32* [ [[PTR]], [[BB1:%.*]] ], [ [[INCDEC_PTR]], [[BB1_CLONE]] ]
+; CHECK-NEXT: [[MERGE7:%.*]] = phi i32 [ 1, [[BB1]] ], [ [[INITVAL]], [[BB1_CLONE]] ]
+; CHECK-NEXT: tail call void @use(i32 [[MERGE7]])
+; CHECK-NEXT: br label [[INDIRECTGOTO]]
+; CHECK: indirectgoto:
+; CHECK-NEXT: [[P_ADDR_SINK:%.*]] = phi i32* [ [[MERGE5]], [[DOTSPLIT3]] ], [ [[MERGE]], [[DOTSPLIT]] ]
+; CHECK-NEXT: [[PTR]] = getelementptr inbounds i32, i32* [[P_ADDR_SINK]], i64 1
+; CHECK-NEXT: [[NEWP:%.*]] = load i32, i32* [[P_ADDR_SINK]], align 4
+; CHECK-NEXT: [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+; CHECK: bb0.clone:
+; CHECK-NEXT: br label [[DOTSPLIT]]
+; CHECK: bb1.clone:
+; CHECK-NEXT: br label [[DOTSPLIT3]]
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %initval = load i32, i32* %p, align 4
+ %initop = load i32, i32* %incdec.ptr, align 4
+ switch i32 %initop, label %exit [
+ i32 0, label %bb0
+ i32 1, label %bb1
+ ]
+
+bb0:
+ %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+ %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %indirectgoto ]
+ tail call void @use(i32 %opcode.0)
+ br label %indirectgoto
+
+bb1:
+ %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+ %opcode.1 = phi i32 [ %initval, %entry ], [ 1, %indirectgoto ]
+ tail call void @use(i32 %opcode.1)
+ br label %indirectgoto
+
+indirectgoto:
+ %p.addr.sink = phi i32* [ %p.addr.1, %bb1 ], [ %p.addr.0, %bb0 ]
+ %ptr = getelementptr inbounds i32, i32* %p.addr.sink, i64 1
+ %newp = load i32, i32* %p.addr.sink, align 4
+ %idx = sext i32 %newp to i64
+ %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 %idx
+ %newop = load i8*, i8** %arrayidx, align 8
+ indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+ ret void
+}
+
+; Don't try to break critical edges when several indirectbr instructions point to a single block.
+define void @multi(i32* nocapture readonly %p) {
+; CHECK-LABEL: @multi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: [[P_ADDR_0:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY:%.*]] ], [ [[NEXT0:%.*]], [[BB0]] ], [ [[NEXT1:%.*]], [[BB1]] ]
+; CHECK-NEXT: [[OPCODE_0:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT: tail call void @use(i32 [[OPCODE_0]])
+; CHECK-NEXT: [[NEXT0]] = getelementptr inbounds i32, i32* [[P_ADDR_0]], i64 1
+; CHECK-NEXT: [[NEWP0:%.*]] = load i32, i32* [[P_ADDR_0]], align 4
+; CHECK-NEXT: [[IDX0:%.*]] = sext i32 [[NEWP0]] to i64
+; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX0]]
+; CHECK-NEXT: [[NEWOP0:%.*]] = load i8*, i8** [[ARRAYIDX0]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP0]], [label [[BB0]], label %bb1]
+; CHECK: bb1:
+; CHECK-NEXT: [[P_ADDR_1:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY]] ], [ [[NEXT0]], [[BB0]] ], [ [[NEXT1]], [[BB1]] ]
+; CHECK-NEXT: [[OPCODE_1:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT: tail call void @use(i32 [[OPCODE_1]])
+; CHECK-NEXT: [[NEXT1]] = getelementptr inbounds i32, i32* [[P_ADDR_1]], i64 1
+; CHECK-NEXT: [[NEWP1:%.*]] = load i32, i32* [[P_ADDR_1]], align 4
+; CHECK-NEXT: [[IDX1:%.*]] = sext i32 [[NEWP1]] to i64
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX1]]
+; CHECK-NEXT: [[NEWOP1:%.*]] = load i8*, i8** [[ARRAYIDX1]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP1]], [label [[BB0]], label %bb1]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %initval = load i32, i32* %p, align 4
+ %initop = load i32, i32* %incdec.ptr, align 4
+ switch i32 %initop, label %exit [
+ i32 0, label %bb0
+ i32 1, label %bb1
+ ]
+
+bb0:
+ %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+ %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+ tail call void @use(i32 %opcode.0)
+ %next0 = getelementptr inbounds i32, i32* %p.addr.0, i64 1
+ %newp0 = load i32, i32* %p.addr.0, align 4
+ %idx0 = sext i32 %newp0 to i64
+ %arrayidx0 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx0
+ %newop0 = load i8*, i8** %arrayidx0, align 8
+ indirectbr i8* %newop0, [label %bb0, label %bb1]
+
+bb1:
+ %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+ %opcode.1 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+ tail call void @use(i32 %opcode.1)
+ %next1 = getelementptr inbounds i32, i32* %p.addr.1, i64 1
+ %newp1 = load i32, i32* %p.addr.1, align 4
+ %idx1 = sext i32 %newp1 to i64
+ %arrayidx1 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx1
+ %newop1 = load i8*, i8** %arrayidx1, align 8
+ indirectbr i8* %newop1, [label %bb0, label %bb1]
+
+exit:
+ ret void
+}
+
+; Make sure we do the right thing for cases where the indirectbr branches to
+; the block it terminates.
+define void @loop(i64* nocapture readonly %p) {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT: bb0.clone:
+; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
+; CHECK: bb0:
+; CHECK-NEXT: br label [[DOTSPLIT]]
+; CHECK: .split:
+; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[BB0:%.*]] ], [ 0, [[BB0_CLONE:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[MERGE]]
+; CHECK-NEXT: store i64 [[MERGE]], i64* [[TMP0]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[MERGE]], 1
+; CHECK-NEXT: [[IDX:%.*]] = srem i64 [[MERGE]], 2
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[TARGET:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[TARGET]], [label [[BB0]], label %bb1]
+; CHECK: bb1:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+ %i = phi i64 [ %i.next, %bb0 ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i64, i64* %p, i64 %i
+ store i64 %i, i64* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %idx = srem i64 %i, 2
+ %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 %idx
+ %target = load i8*, i8** %arrayidx, align 8
+ indirectbr i8* %target, [label %bb0, label %bb1]
+
+bb1:
+ ret void
+}
+
+; Don't do anything for cases that contain no phis.
+define void @nophi(i32* %p) {
+; CHECK-LABEL: @nophi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT: i32 0, label [[BB0:%.*]]
+; CHECK-NEXT: i32 1, label [[BB1:%.*]]
+; CHECK-NEXT: ]
+; CHECK: bb0:
+; CHECK-NEXT: tail call void @use(i32 0)
+; CHECK-NEXT: br label [[INDIRECTGOTO:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: tail call void @use(i32 1)
+; CHECK-NEXT: br label [[INDIRECTGOTO]]
+; CHECK: indirectgoto:
+; CHECK-NEXT: [[SUNKADDR:%.*]] = ptrtoint i32* [[P]] to i64
+; CHECK-NEXT: [[SUNKADDR1:%.*]] = add i64 [[SUNKADDR]], 4
+; CHECK-NEXT: [[SUNKADDR2:%.*]] = inttoptr i64 [[SUNKADDR1]] to i32*
+; CHECK-NEXT: [[NEWP:%.*]] = load i32, i32* [[SUNKADDR2]], align 4
+; CHECK-NEXT: [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT: [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT: indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+ %initop = load i32, i32* %incdec.ptr, align 4
+ switch i32 %initop, label %exit [
+ i32 0, label %bb0
+ i32 1, label %bb1
+ ]
+
+bb0:
+ tail call void @use(i32 0)
+ br label %indirectgoto
+
+bb1:
+ tail call void @use(i32 1)
+ br label %indirectgoto
+
+indirectgoto:
+ %newp = load i32, i32* %incdec.ptr, align 4
+ %idx = sext i32 %newp to i64
+ %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 %idx
+ %newop = load i8*, i8** %arrayidx, align 8
+ indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+ ret void
+}