Diffstat (limited to 'llvm')
 -rw-r--r--  llvm/lib/Transforms/IPO/Inliner.cpp           76
 -rw-r--r--  llvm/test/Transforms/Inline/nested-inline.ll  111
2 files changed, 181 insertions, 6 deletions
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index c38cf82a692..7b512d60d97 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -189,9 +189,9 @@ bool Inliner::shouldInline(CallSite CS) {
   int Cost = IC.getValue();
   int CurrentThreshold = InlineThreshold;
-  Function *Fn = CS.getCaller();
-  if (Fn && !Fn->isDeclaration() &&
-      Fn->hasFnAttr(Attribute::OptimizeForSize) &&
+  Function *Caller = CS.getCaller();
+  if (Caller && !Caller->isDeclaration() &&
+      Caller->hasFnAttr(Attribute::OptimizeForSize) &&
       InlineLimit.getNumOccurrences() == 0 &&
       InlineThreshold != 50)
     CurrentThreshold = 50;
@@ -203,8 +203,72 @@ bool Inliner::shouldInline(CallSite CS) {
     return false;
   }
 
+  // Try to detect the case where the current inlining candidate caller
+  // (call it B) is a static function and is an inlining candidate elsewhere,
+  // and the current candidate callee (call it C) is large enough that
+  // inlining it into B would make B too big to inline later.  In these
+  // circumstances it may be best not to inline C into B, but to inline B
+  // into its callers.
+  if (Caller->hasLocalLinkage()) {
+    int TotalSecondaryCost = 0;
+    bool outerCallsFound = false;
+    bool allOuterCallsWillBeInlined = true;
+    bool someOuterCallWouldNotBeInlined = false;
+    for (Value::use_iterator I = Caller->use_begin(), E = Caller->use_end();
+         I != E; ++I) {
+      CallSite CS2 = CallSite::get(*I);
+
+      // If this isn't a call to Caller (it could be some other sort
+      // of reference) skip it.
+      if (CS2.getInstruction() == 0 || CS2.getCalledFunction() != Caller)
+        continue;
+
+      InlineCost IC2 = getInlineCost(CS2);
+      if (IC2.isNever())
+        allOuterCallsWillBeInlined = false;
+      if (IC2.isAlways() || IC2.isNever())
+        continue;
+
+      outerCallsFound = true;
+      int Cost2 = IC2.getValue();
+      int CurrentThreshold2 = InlineThreshold;
+      Function *Caller2 = CS2.getCaller();
+      if (Caller2 && !Caller2->isDeclaration() &&
+          Caller2->hasFnAttr(Attribute::OptimizeForSize) &&
+          InlineThreshold != 50)
+        CurrentThreshold2 = 50;
+
+      float FudgeFactor2 = getInlineFudgeFactor(CS2);
+
+      if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2))
+        allOuterCallsWillBeInlined = false;
+
+      // See if we have this case.  The magic 6 is what InlineCost assigns
+      // for the call instruction, which we would be deleting.
+      if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) &&
+          Cost2 + Cost - 6 >= (int)(CurrentThreshold2 * FudgeFactor2)) {
+        someOuterCallWouldNotBeInlined = true;
+        TotalSecondaryCost += Cost2;
+      }
+    }
+    // If all outer calls to Caller would get inlined, the cost for the last
+    // one is set very low by getInlineCost, in anticipation that Caller will
+    // be removed entirely.  We did not account for this above unless there
+    // is only one caller of Caller.
+    if (allOuterCallsWillBeInlined && Caller->use_begin() != Caller->use_end())
+      TotalSecondaryCost -= 15000;
+
+    if (outerCallsFound && someOuterCallWouldNotBeInlined &&
+        TotalSecondaryCost < Cost) {
+      DEBUG(errs() << "    NOT Inlining: " << *CS.getInstruction() <<
+           " Cost = " << Cost <<
+           ", outer Cost = " << TotalSecondaryCost << '\n');
+      return false;
+    }
+  }
+
   DEBUG(errs() << "    Inlining: cost=" << Cost
-        << ", Call: " << *CS.getInstruction() << "\n");
+        << ", Call: " << *CS.getInstruction() << '\n');
   return true;
 }
@@ -232,7 +296,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
         CallSite CS = CallSite::get(I);
-        // If this this isn't a call, or it is a call to an intrinsic, it can
+        // If this isn't a call, or it is a call to an intrinsic, it can
         // never be inlined.
         if (CS.getInstruction() == 0 || isa<IntrinsicInst>(I))
           continue;
@@ -279,7 +343,7 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
         // try to do so.
         if (!shouldInline(CS))
          continue;
-
+        Function *Caller = CS.getCaller();
         // Attempt to inline the function...
         if (!InlineCallIfPossible(CS, CG, TD, InlinedArrayAllocas))
diff --git a/llvm/test/Transforms/Inline/nested-inline.ll b/llvm/test/Transforms/Inline/nested-inline.ll
new file mode 100644
index 00000000000..12926671722
--- /dev/null
+++ b/llvm/test/Transforms/Inline/nested-inline.ll
@@ -0,0 +1,111 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; Test that bar and bar2 are both inlined throughout and removed.
+@A = weak global i32 0          ; <i32*> [#uses=1]
+@B = weak global i32 0          ; <i32*> [#uses=1]
+@C = weak global i32 0          ; <i32*> [#uses=1]
+
+define fastcc void @foo(i32 %X) {
+entry:
+; CHECK: @foo
+  %ALL = alloca i32, align 4            ; <i32*> [#uses=1]
+  %tmp1 = and i32 %X, 1                 ; <i32> [#uses=1]
+  %tmp1.upgrd.1 = icmp eq i32 %tmp1, 0  ; <i1> [#uses=1]
+  br i1 %tmp1.upgrd.1, label %cond_next, label %cond_true
+
+cond_true:              ; preds = %entry
+  store i32 1, i32* @A
+  br label %cond_next
+
+cond_next:              ; preds = %cond_true, %entry
+  %tmp4 = and i32 %X, 2                 ; <i32> [#uses=1]
+  %tmp4.upgrd.2 = icmp eq i32 %tmp4, 0  ; <i1> [#uses=1]
+  br i1 %tmp4.upgrd.2, label %cond_next7, label %cond_true5
+
+cond_true5:             ; preds = %cond_next
+  store i32 1, i32* @B
+  br label %cond_next7
+
+cond_next7:             ; preds = %cond_true5, %cond_next
+  %tmp10 = and i32 %X, 4                ; <i32> [#uses=1]
+  %tmp10.upgrd.3 = icmp eq i32 %tmp10, 0        ; <i1> [#uses=1]
+  br i1 %tmp10.upgrd.3, label %cond_next13, label %cond_true11
+
+cond_true11:            ; preds = %cond_next7
+  store i32 1, i32* @C
+  br label %cond_next13
+
+cond_next13:            ; preds = %cond_true11, %cond_next7
+  %tmp16 = and i32 %X, 8                ; <i32> [#uses=1]
+  %tmp16.upgrd.4 = icmp eq i32 %tmp16, 0        ; <i1> [#uses=1]
+  br i1 %tmp16.upgrd.4, label %UnifiedReturnBlock, label %cond_true17
+
+cond_true17:            ; preds = %cond_next13
+  call void @ext( i32* %ALL )
+  ret void
+
+UnifiedReturnBlock:     ; preds = %cond_next13
+  ret void
+}
+
+; CHECK-NOT: @bar
+define internal fastcc void @bar(i32 %X) {
+entry:
+  %ALL = alloca i32, align 4            ; <i32*> [#uses=1]
+  %tmp1 = and i32 %X, 1                 ; <i32> [#uses=1]
+  %tmp1.upgrd.1 = icmp eq i32 %tmp1, 0  ; <i1> [#uses=1]
+  br i1 %tmp1.upgrd.1, label %cond_next, label %cond_true
+
+cond_true:              ; preds = %entry
+  store i32 1, i32* @A
+  br label %cond_next
+
+cond_next:              ; preds = %cond_true, %entry
+  %tmp4 = and i32 %X, 2                 ; <i32> [#uses=1]
+  %tmp4.upgrd.2 = icmp eq i32 %tmp4, 0  ; <i1> [#uses=1]
+  br i1 %tmp4.upgrd.2, label %cond_next7, label %cond_true5
+
+cond_true5:             ; preds = %cond_next
+  store i32 1, i32* @B
+  br label %cond_next7
+
+cond_next7:             ; preds = %cond_true5, %cond_next
+  %tmp10 = and i32 %X, 4                ; <i32> [#uses=1]
+  %tmp10.upgrd.3 = icmp eq i32 %tmp10, 0        ; <i1> [#uses=1]
+  br i1 %tmp10.upgrd.3, label %cond_next13, label %cond_true11
+
+cond_true11:            ; preds = %cond_next7
+  store i32 1, i32* @C
+  br label %cond_next13
+
+cond_next13:            ; preds = %cond_true11, %cond_next7
+  %tmp16 = and i32 %X, 8                ; <i32> [#uses=1]
+  %tmp16.upgrd.4 = icmp eq i32 %tmp16, 0        ; <i1> [#uses=1]
+  br i1 %tmp16.upgrd.4, label %UnifiedReturnBlock, label %cond_true17
+
+cond_true17:            ; preds = %cond_next13
+  call void @foo( i32 %X )
+  ret void
+
+UnifiedReturnBlock:     ; preds = %cond_next13
+  ret void
+}
+
+define internal fastcc void @bar2(i32 %X) {
+entry:
+  call void @foo( i32 %X )
+  ret void
+}
+
+declare void @ext(i32*)
+
+define void @test(i32 %X) {
+entry:
+; CHECK: test
+; CHECK-NOT: @bar
+  tail call fastcc void @bar( i32 %X )
+  tail call fastcc void @bar( i32 %X )
+  tail call fastcc void @bar2( i32 %X )
+  tail call fastcc void @bar2( i32 %X )
+  ret void
+; CHECK: ret
+}
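
To follow the cost arithmetic in the new shouldInline() block above, the sketch below replays the heuristic with made-up numbers. It is a standalone illustration, not LLVM code: the constants 6 (the cost InlineCost charges for the call instruction that inlining removes) and 15000 (the adjustment for a caller whose last call would be inlined away) are taken from the patch, while the costs, thresholds, and fudge factors are invented, and the always/never-inline and optimize-for-size special cases are deliberately omitted.

// Hypothetical, simplified model of the secondary-cost heuristic in this
// patch. Only the constants 6 and 15000 come from the patch itself.
#include <cstdio>
#include <vector>

struct OuterCall {
  int Cost;          // inline cost of an existing call of the caller (B)
  float FudgeFactor; // fudge factor at that outer call site
  int Threshold;     // inline threshold at that outer call site
};

// Decide whether to skip inlining the candidate callee (C, cost CalleeCost)
// into a local caller (B), so that B itself can still be inlined into its
// own callers.
bool shouldSkipInliningCalleeIntoCaller(int CalleeCost,
                                        const std::vector<OuterCall> &Outer) {
  int TotalSecondaryCost = 0;
  bool outerCallsFound = false;
  bool allOuterCallsWillBeInlined = true;
  bool someOuterCallWouldNotBeInlined = false;

  for (const OuterCall &OC : Outer) {
    int Limit = (int)(OC.Threshold * OC.FudgeFactor);
    outerCallsFound = true;
    if (OC.Cost >= Limit)
      allOuterCallsWillBeInlined = false;
    // This call of B fits under the threshold now, but would no longer fit
    // once C's body (minus the 6 units charged for the removed call
    // instruction) is folded into B.
    if (OC.Cost < Limit && OC.Cost + CalleeCost - 6 >= Limit) {
      someOuterCallWouldNotBeInlined = true;
      TotalSecondaryCost += OC.Cost;
    }
  }
  // If every outer call of B would be inlined, B is expected to disappear
  // entirely; the patch compensates with a large bonus.
  if (allOuterCallsWillBeInlined && !Outer.empty())
    TotalSecondaryCost -= 15000;

  return outerCallsFound && someOuterCallWouldNotBeInlined &&
         TotalSecondaryCost < CalleeCost;
}

int main() {
  // Invented numbers: C costs 180 to inline; B is called twice, each call
  // currently costing 60 against a threshold of 200 (fudge factor 1.0).
  // Inlining C into B would push each call of B to 60 + 180 - 6 = 234 > 200,
  // so both calls of B would stop being inlinable.
  std::vector<OuterCall> callsOfB = {{60, 1.0f, 200}, {60, 1.0f, 200}};
  bool skip = shouldSkipInliningCalleeIntoCaller(180, callsOfB);
  std::printf("skip inlining C into B: %s\n", skip ? "yes" : "no");
  return 0;
}

With these numbers the function reports that inlining C into B should be skipped, which is exactly the situation the nested-inline.ll test exercises: @bar plays the role of B, and the pass is expected to inline @bar into @test rather than bloat it with @foo first.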

