summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vedant Kumar <vsk@apple.com> 2018-11-04 23:11:57 +0000
committer Vedant Kumar <vsk@apple.com> 2018-11-04 23:11:57 +0000
commit d2a895a9720e32e2f52e1ce10d2ba4769a034aa2 (patch)
tree d3576d1fb7e7d46a92b9273de163ba1319e178b1
parent 6c652b7f1167654f87f4a31b3922d1d0d24eb258 (diff)
download bcm5719-llvm-d2a895a9720e32e2f52e1ce10d2ba4769a034aa2.tar.gz
bcm5719-llvm-d2a895a9720e32e2f52e1ce10d2ba4769a034aa2.zip
[HotColdSplitting] Use TTI to inform outlining threshold
Using TargetTransformInfo allows the splitting pass to factor in the code size cost of instructions as it decides whether or not outlining is profitable.

This did not regress the overall amount of outlining seen on the handful of internal frameworks I tested.

Thanks to Jun Bum Lim for suggesting this!

Differential Revision: https://reviews.llvm.org/D53835

llvm-svn: 346108
-rw-r--r-- llvm/lib/Transforms/IPO/HotColdSplitting.cpp 44
-rw-r--r-- llvm/test/Transforms/HotColdSplit/X86/lit.local.cfg 3
-rw-r--r-- llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll 25
-rw-r--r-- llvm/test/Transforms/HotColdSplit/do-not-split.ll 3
-rw-r--r-- llvm/test/Transforms/HotColdSplit/minsize.ll 4
-rw-r--r-- llvm/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll 3
6 files changed, 63 insertions, 19 deletions
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index ce8a5060a3a..621ac7dc8ab 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -66,10 +66,10 @@ using namespace llvm;
static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
cl::init(true), cl::Hidden);
-static cl::opt<unsigned> MinOutliningInstCount(
- "min-outlining-inst-count", cl::init(3), cl::Hidden,
- cl::desc("Minimum number of instructions needed for a single-block region "
- "to be an outlining candidate"));
+static cl::opt<int>
+ MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden,
+ cl::desc("Code size threshold for outlining within a "
+ "single BB (as a multiple of TCC_Basic)"));
namespace {
@@ -135,14 +135,18 @@ static bool mayExtractBlock(const BasicBlock &BB) {
return !BB.hasAddressTaken();
}
-/// Check whether \p BB has at least \p Min non-debug, non-terminator
-/// instructions.
-static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
- unsigned Count = 0;
+/// Check whether \p BB is profitable to outline (i.e. its code size cost meets
+/// the threshold set in \p MinOutliningThreshold).
+static bool isProfitableToOutline(const BasicBlock &BB,
+ TargetTransformInfo &TTI) {
+ int Cost = 0;
for (const Instruction &I : BB) {
if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
continue;
- if (++Count >= Min)
+
+ Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+
+ if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic))
return true;
}
return false;
@@ -156,8 +160,10 @@ static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
///
/// Return an empty sequence if the cold region is too small to outline, or if
/// the cold region has no warm predecessors.
-static BlockSequence
-findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
+static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB,
+ TargetTransformInfo &TTI,
+ DominatorTree &DT,
+ PostDomTree &PDT) {
// The maximal cold region.
BlockSequence ColdRegion = {};
@@ -241,8 +247,7 @@ findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
++SuccIt;
}
- if (ColdRegion.size() == 1 &&
- !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount))
+ if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI))
return {};
return ColdRegion;
@@ -251,6 +256,7 @@ findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
/// Get the largest cold region in \p F.
static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
BlockFrequencyInfo *BFI,
+ TargetTransformInfo &TTI,
DominatorTree &DT, PostDomTree &PDT) {
// Keep track of the largest cold region.
BlockSequence LargestColdRegion = {};
@@ -270,7 +276,7 @@ static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
});
// Find a maximal cold region we can outline.
- BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT);
+ BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT);
if (ColdRegion.empty()) {
LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n");
continue;
@@ -305,7 +311,7 @@ public:
private:
bool shouldOutlineFrom(const Function &F) const;
Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
- BlockFrequencyInfo *BFI,
+ BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
OptimizationRemarkEmitter &ORE, unsigned Count);
SmallPtrSet<const Function *, 2> OutlinedFunctions;
ProfileSummaryInfo *PSI;
@@ -365,6 +371,7 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
DominatorTree &DT,
BlockFrequencyInfo *BFI,
+ TargetTransformInfo &TTI,
OptimizationRemarkEmitter &ORE,
unsigned Count) {
assert(!Region.empty());
@@ -393,7 +400,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
CallInst *CI = cast<CallInst>(U);
CallSite CS(CI);
NumColdRegionsOutlined++;
- if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) {
+ if (TTI.useColdCCForColdCall(*OutF)) {
OutF->setCallingConv(CallingConv::Cold);
CS.setCallingConv(CallingConv::Cold);
}
@@ -437,14 +444,15 @@ bool HotColdSplitting::run(Module &M) {
PostDomTree PDT(F);
PDT.recalculate(F);
BlockFrequencyInfo *BFI = GetBFI(F);
+ TargetTransformInfo &TTI = GetTTI(F);
- BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT);
+ BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT);
if (ColdRegion.empty())
continue;
OptimizationRemarkEmitter &ORE = (*GetORE)(F);
Function *Outlined =
- extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1);
+ extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1);
if (Outlined) {
OutlinedFunctions.insert(Outlined);
Changed = true;
diff --git a/llvm/test/Transforms/HotColdSplit/X86/lit.local.cfg b/llvm/test/Transforms/HotColdSplit/X86/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll b/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll
new file mode 100644
index 00000000000..5b0cceae2af
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll
@@ -0,0 +1,25 @@
+; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86).
+; RUN: opt -hotcoldsplit -min-outlining-thresh=6 -S < %s | FileCheck %s
+
+; Test that we outline even though there are only two cold instructions. TTI
+; should determine that they are expensive in terms of code size.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: @fun
+; CHECK: call void @fun.cold.1
+define void @fun(i32 %x) {
+entry:
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ ret void
+
+if.else:
+ %y = sdiv i32 %x, 111
+ call void @sink(i32 %y)
+ ret void
+}
+
+declare void @sink(i32 %x) cold
diff --git a/llvm/test/Transforms/HotColdSplit/do-not-split.ll b/llvm/test/Transforms/HotColdSplit/do-not-split.ll
index 213681383ea..d5a8c44cc04 100644
--- a/llvm/test/Transforms/HotColdSplit/do-not-split.ll
+++ b/llvm/test/Transforms/HotColdSplit/do-not-split.ll
@@ -1,6 +1,9 @@
; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
; Check that these functions are not split. Outlined functions are called from a
; basic block named codeRepl.
diff --git a/llvm/test/Transforms/HotColdSplit/minsize.ll b/llvm/test/Transforms/HotColdSplit/minsize.ll
index eb42ad14af2..69cd0979b94 100644
--- a/llvm/test/Transforms/HotColdSplit/minsize.ll
+++ b/llvm/test/Transforms/HotColdSplit/minsize.ll
@@ -1,8 +1,10 @@
; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
; CHECK-LABEL: @fun
; CHECK: call void @fun.cold.1
-
define void @fun() {
entry:
br i1 undef, label %if.then, label %if.else
diff --git a/llvm/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/llvm/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
index b77201fe0d3..becfaf8e63d 100644
--- a/llvm/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
+++ b/llvm/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -1,5 +1,8 @@
; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
; CHECK-LABEL: define {{.*}}@foo.cold
; CHECK-NOT: llvm.dbg.value
OpenPOWER on IntegriCloud