author     Philip Reames <listmail@philipreames.com>  2019-11-08 07:04:02 -0800
committer  Philip Reames <listmail@philipreames.com>  2019-11-08 08:19:48 -0800
commit     8d22100f66c4170510c6ff028c60672acfe1cff9 (patch)
tree       8c252fd202f2b6889bac1708fa573ab22b3b87ec
parent     787dba7aae1d01f3fcf1e471f733f00a6ba66e33 (diff)
[LICM] Support hoisting of dynamic allocas out of loops
This patch implements a correct, but not terribly useful, transform. In particular, if we have a dynamic alloca in a loop which is guaranteed to execute, and provably not captured, we hoist the alloca out of the loop. The capture tracking is needed so that we can prove that each previous stack region dies before the next one is allocated. The transform decreases the amount of stack allocation needed by a linear factor (e.g. the iteration count of the loop).

Now, I really hope no one is actually using dynamic allocas. As such, why this patch? Well, the actual problem I'm hoping to make progress on is allocation hoisting. There's a large draft patch out for review (https://reviews.llvm.org/D60056), and this patch was the smallest chunk of testable functionality I could come up with which takes a step vaguely in that direction. Once this is in, the changes to capture tracking mentioned in the TODOs become motivated and testable. After that, I hope to extend this to trivial malloc/free regions (i.e. free dominating all loop exits) and to allocation functions for GCed languages.

Differential Revision: https://reviews.llvm.org/D69227
-rw-r--r--  llvm/lib/Transforms/Scalar/LICM.cpp         45
-rw-r--r--  llvm/test/Transforms/LICM/hoist-alloca.ll  168
2 files changed, 213 insertions(+), 0 deletions(-)
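For illustration, the kind of source this transform targets looks roughly like the C++ sketch below. The sketch is not taken from the patch: the function name, the global G, and the use of the Clang/GCC __builtin_alloca builtin are assumptions, and whether the conservative use check added below actually fires depends on the exact IR the frontend emits. The point is that each iteration performs a dynamic stack allocation whose size is loop-invariant but not a compile-time constant, and the pointer never escapes the iteration, so a single allocation hoisted to the entry block would suffice.

// Illustrative only: per-iteration dynamic stack allocation that could be
// hoisted out of the loop, making stack usage independent of `iters`.
extern long G;

void repeated_scratch(unsigned iters, unsigned n) {
  for (unsigned i = 0; i < iters; ++i) {
    // Dynamic alloca: size is loop-invariant but not a compile-time constant.
    long *scratch =
        static_cast<long *>(__builtin_alloca(n * sizeof(long)));
    for (unsigned j = 0; j < n; ++j)
      scratch[j] = static_cast<long>(i + j); // stores through GEPs only
    G = scratch[0];                          // non-escaping load
  }
}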
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index c239edaf666..48fc9553fdd 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -789,6 +789,41 @@ public:
};
} // namespace
+
+/// Return true if we know how to rewrite all uses of the given alloca after
+/// hoisting it out of the loop. The main concerns are a) potential captures
+/// and b) invariant.start markers which don't capture, but are no longer
+/// valid w/o a corresponding invariant.end.
+static bool canRewriteUsesOfAlloca(AllocaInst &AI) {
+ // TODO: This looks a lot like capture tracking, but we need to remove any
+ // invariant starts if we extend the lifetime of the alloca by hoisting it.
+ // We should probably refactor capture tracking into a form which allows us
+ // to reuse the relevant bits and remove the duplicated logic here.
+
+ SmallVector<Use *, 16> Worklist;
+ for (Use &U : AI.uses())
+ Worklist.push_back(&U);
+
+ unsigned NumUsesExplored = 0;
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+ NumUsesExplored++;
+ if (NumUsesExplored > DefaultMaxUsesToExplore)
+ return false;
+ // Non capturing, terminating uses
+ if (isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) && U->getOperandNo() == 1))
+ continue;
+ // Non capturing, non-terminating
+ if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I))
+ return false;
+ for (Use &U : I->uses())
+ Worklist.push_back(&U);
+ }
+ return true;
+}
+
/// Walk the specified region of the CFG (defined by all blocks dominated by
/// the specified block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
@@ -909,6 +944,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
+ if (isa<AllocaInst>(&I) &&
+ SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+ canRewriteUsesOfAlloca(cast<AllocaInst>(I))) {
+ hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ HoistedInstructions.push_back(&I);
+ Changed = true;
+ continue;
+ }
+
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
if (CFH.canHoistPHI(PN)) {
// Redirect incoming blocks first to ensure that we create hoisted
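For reference, here is a minimal sketch of driving LICM (and therefore the new alloca hoisting) from C++ via the new pass manager, roughly what the opt -licm RUN line in the test below does. The runLICM helper is illustrative, not part of the patch; it only wires up the standard analysis managers and runs LICM over every loop in the module.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"

// Illustrative helper: set up the analysis managers and run LICM on a module.
static void runLICM(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // LICM is a loop pass; adapt it up to the function and module levels.
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::LICMPass()));
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}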
diff --git a/llvm/test/Transforms/LICM/hoist-alloca.ll b/llvm/test/Transforms/LICM/hoist-alloca.ll
new file mode 100644
index 00000000000..8e4debac283
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-alloca.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -licm < %s | FileCheck %s
+
+@G = external global i64
+
+define void @test(i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i64
+; CHECK-NEXT: [[VAL:%.*]] = load i64, i64* [[A]]
+; CHECK-NEXT: store i64 [[VAL]], i64* @G
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %a = alloca i64
+ %val = load i64, i64* %a
+ store i64 %val, i64* @G
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ult i64 %iv, %n
+ br i1 %exitcond, label %for.body, label %exit
+exit:
+ ret void
+}
+
+define void @test2(i64 %n) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], [[FOR_BODY]] ]
+; CHECK-NEXT: store i64 [[IV_LCSSA]], i64* [[A]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %a = alloca i64
+ store i64 %iv, i64* %a
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ult i64 %iv, %n
+ br i1 %exitcond, label %for.body, label %exit
+exit:
+ ret void
+}
+
+
+define void @test3(i64 %n) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i64
+; CHECK-NEXT: [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
+; CHECK-NEXT: [[A_OFFSET:%.*]] = getelementptr i8, i8* [[A_I8]], i64 4
+; CHECK-NEXT: store i8 0, i8* [[A_OFFSET]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %a = alloca i64
+ %a.i8 = bitcast i64* %a to i8*
+ %a.offset = getelementptr i8, i8* %a.i8, i64 4
+ store i8 0, i8* %a.offset
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ult i64 %iv, %n
+ br i1 %exitcond, label %for.body, label %exit
+exit:
+ ret void
+}
+
+; This example is subtle. Because a dynamic alloca isn't reclaimed until the
+; end of the function's scope, the captured pointer can legally refer to the
+; stack region allocated by a previous iteration, so the per-iteration
+; allocations cannot safely be merged into a single hoisted alloca.
+define void @test4(i64 %n) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[A:%.*]] = alloca i64
+; CHECK-NEXT: store i64 [[IV]], i64* [[A]]
+; CHECK-NEXT: call void @capture(i64* [[A]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %a = alloca i64
+ store i64 %iv, i64* %a
+ %a.i8 = bitcast i64* %a to i8*
+ call void @capture(i64* %a)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ult i64 %iv, %n
+ br i1 %exitcond, label %for.body, label %exit
+exit:
+ ret void
+}
+declare void @capture(i64* %a)
+
+
+; TODO: not yet handled - hoisting would extend the alloca's lifetime, so the
+; invariant.start marker would have to be removed (see canRewriteUsesOfAlloca).
+define void @test5(i64 %n) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[A:%.*]] = alloca i64
+; CHECK-NEXT: store i64 [[IV]], i64* [[A]]
+; CHECK-NEXT: [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
+; CHECK-NEXT: [[TMP0:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 8, i8* [[A_I8]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %a = alloca i64
+ store i64 %iv, i64* %a
+ %a.i8 = bitcast i64* %a to i8*
+ call {}* @llvm.invariant.start.p0i8(i64 8, i8* %a.i8)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ult i64 %iv, %n
+ br i1 %exitcond, label %for.body, label %exit
+exit:
+ ret void
+}
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+