summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/ProfileData/SampleProfReader.h8
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp89
-rw-r--r--llvm/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdobin0 -> 304 bytes
-rw-r--r--llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll118
4 files changed, 195 insertions, 20 deletions
diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h
index d690c6d6edb..3d5134d7749 100644
--- a/llvm/include/llvm/ProfileData/SampleProfReader.h
+++ b/llvm/include/llvm/ProfileData/SampleProfReader.h
@@ -330,6 +330,10 @@ public:
return nullptr;
};
+ /// Return the table of names that have samples, either as an outline
+ /// instance or as an inline instance. This default returns nullptr.
+ virtual std::vector<StringRef> *getNameTable() { return nullptr; }
+
protected:
/// Map every function to its associated profile.
///
@@ -387,6 +391,10 @@ public:
/// Read sample profiles from the associated file.
std::error_code read() override;
+ /// Return the table of names that have samples, either as an outline
+ /// instance or as an inline instance; here, a pointer to NameTable.
+ std::vector<StringRef> *getNameTable() override { return &NameTable; }
+
protected:
/// Read a numeric value of type T from the profile.
///
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index d0cf63b35f4..9c9d851a470 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -139,9 +139,11 @@ using EdgeWeightMap = DenseMap<Edge, uint64_t>;
using BlockEdgeMap =
DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
+class SampleProfileLoader;
+
class SampleCoverageTracker {
public:
- SampleCoverageTracker() = default;
+ /// Bind the tracker to the loader whose callsiteIsHot() it consults.
+ SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL) {}
bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
uint32_t Discriminator, uint64_t Samples);
@@ -187,6 +189,8 @@ private:
/// keyed by FunctionSamples pointers, but these stats are cleared after
/// every function, so we just need to keep a single counter.
uint64_t TotalUsedSamples = 0;
+
+ SampleProfileLoader &SPLoader;
};
class GUIDToFuncNameMapper {
@@ -269,8 +273,9 @@ public:
std::function<AssumptionCache &(Function &)> GetAssumptionCache,
std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
: GetAC(std::move(GetAssumptionCache)),
- GetTTI(std::move(GetTargetTransformInfo)), Filename(Name),
- RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {}
+ GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this),
+ Filename(Name), RemappingFilename(RemapName),
+ IsThinLTOPreLink(IsThinLTOPreLink) {}
bool doInitialization(Module &M);
bool runOnModule(Module &M, ModuleAnalysisManager *AM,
@@ -279,6 +284,8 @@ public:
void dump() { Reader->dump(); }
protected:
+ friend class SampleCoverageTracker;
+
bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
unsigned getFunctionLoc(Function &F);
bool emitAnnotations(Function &F);
@@ -307,6 +314,8 @@ protected:
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
void clearFunctionData();
+ bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI);
/// Map basic blocks to their computed weights.
///
@@ -404,6 +413,13 @@ protected:
// GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
// all the function symbols defined or declared in current module.
DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
+
+ // All the names used in FunctionSamples, including outline function
+ // names, inline instance names and call target names.
+ StringSet<> NamesInProfile;
+
+ // Whether profile-sample-accurate is in effect for the current function.
+ bool ProfSampleAccEnabled = false;
};
class SampleProfileLoaderLegacyPass : public ModulePass {
@@ -459,14 +475,22 @@ private:
/// To decide whether an inlined callsite is hot, we compare the callsite
/// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
/// regarded as hot if the count is above the cutoff value.
-static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
- ProfileSummaryInfo *PSI) {
+///
+/// When profile-sample-accurate is enabled, functions without a profile are
+/// regarded as cold and much less inlining happens in the CGSCC inlining
+/// pass, so we lower the hotness criterion here: warm (non-cold) callsites
+/// also qualify, which allows more early inlining and helps performance.
+bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
if (!CallsiteFS)
return false; // The callsite was not inlined in the original binary.
assert(PSI && "PSI is expected to be non null");
uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- return PSI->isHotCount(CallsiteTotalSamples);
+ // Under profile-sample-accurate, every non-cold callsite counts as hot.
+ if (ProfSampleAccEnabled)
+ return !PSI->isColdCount(CallsiteTotalSamples);
+ return PSI->isHotCount(CallsiteTotalSamples);
}
/// Mark as used the sample record for the given function samples at
@@ -503,7 +527,7 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(CalleeSamples, PSI))
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
Count += countUsedRecords(CalleeSamples, PSI);
}
@@ -522,7 +546,7 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(CalleeSamples, PSI))
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
Count += countBodyRecords(CalleeSamples, PSI);
}
@@ -543,7 +567,7 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(CalleeSamples, PSI))
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
Total += countBodySamples(CalleeSamples, PSI);
}
@@ -1643,6 +1667,12 @@ bool SampleProfileLoader::doInitialization(Module &M) {
ProfileIsValid = (Reader->read() == sampleprof_error::success);
PSL = Reader->getProfileSymbolList();
+ if (ProfileSampleAccurate) {
+ NamesInProfile.clear();
+ if (auto NameTable = Reader->getNameTable())
+ NamesInProfile.insert(NameTable->begin(), NameTable->end());
+ }
+
if (!RemappingFilename.empty()) {
// Apply profile remappings to the loaded profile data if requested.
// For now, we only support remapping symbols encoded using the Itanium
@@ -1733,17 +1763,36 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM)
// conservatively by getEntryCount as the same as unknown (None). This is
// to avoid newly added code to be treated as cold. If we have samples
// this will be overwritten in emitAnnotations.
- //
- // PSL -- profile symbol list include all the symbols in sampled binary.
- // If ProfileSampleAccurate is true or F has profile-sample-accurate
- // attribute, and if there is no profile symbol list read in, initialize
- // all the function entry counts to 0; if there is profile symbol list, only
- // initialize the entry count to 0 when current function is in the list.
- uint64_t initialEntryCount =
- ((ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) &&
- (!PSL || PSL->contains(F.getName())))
- ? 0
- : -1;
+ uint64_t initialEntryCount = -1;
+
+ ProfSampleAccEnabled =
+ ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate");
+ if (ProfSampleAccEnabled) {
+ // PSL -- profile symbol list includes all the symbols in the sampled
+ // binary. It is used to prevent new functions from being treated as cold.
+ // If ProfileSampleAccurate is true or F has the profile-sample-accurate
+ // attribute, and no profile symbol list was read in, initialize all
+ // function entry counts to 0; if there is a profile symbol list, only
+ // initialize the entry count to 0 when the current function is in the list.
+ if (!PSL || PSL->contains(F.getName()))
+ initialEntryCount = 0;
+
+ // When ProfileSampleAccurate is true, a function without samples will be
+ // regarded as cold. To minimize the potential negative performance
+ // impact, we are a little conservative here: if a function shows up in
+ // the profile at all, whether as an outline function, an inline instance
+ // or a call target, it is not treated as cold. This handles cases where
+ // most callsites of a function are inlined in the sampled binary but not
+ // inlined in the current build (because of source code drift, imprecise
+ // debug information, or callsites that are each cold individually but not
+ // cold in aggregate...), so that the outline function showing up as cold
+ // in the sampled binary will actually not be cold after the current
+ // build.
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+ if (NamesInProfile.count(CanonName))
+ initialEntryCount = -1;
+ }
+
F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
if (AM) {
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo b/llvm/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo
new file mode 100644
index 00000000000..c05af6fb4a6
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profsampleacc.extbinary.afdo
Binary files differ
diff --git a/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll b/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll
new file mode 100644
index 00000000000..31b2d363e6d
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-sample-accurate.ll
@@ -0,0 +1,118 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s
+; With the hot cutoff being set to 600000, the inline instance of _Z3sumii
+; in main is neither hot nor cold. Check that it is still inlined when
+; profile-sample-accurate is enabled, and check that _Z3sumii's function
+; entry count is not initialized to 0, because it shows up in the profile
+; as an inline instance.
+
+; Original C++ test case
+;
+; #include <stdio.h>
+;
+; int sum(int x, int y) {
+; return x + y;
+; }
+;
+; int main() {
+; int s, i = 0;
+; while (i++ < 20000 * 20000)
+; if (i != 100) s = sum(i, s); else s = 30;
+; printf("sum is %d\n", s);
+; return 0;
+; }
+;
+@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+
+; CHECK: define i32 @_Z3sumii{{.*}}!prof ![[UNKNOWN_ID:[0-9]+]]
+; IR for `int sum(int x, int y)` from the C++ above: both arguments are
+; stored to stack slots, reloaded, and added with nsw; the sum is returned.
+; Function Attrs: nounwind uwtable
+define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !4 {
+entry:
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ store i32 %y, i32* %y.addr, align 4
+ %0 = load i32, i32* %x.addr, align 4, !dbg !11
+ %1 = load i32, i32* %y.addr, align 4, !dbg !11
+ %add = add nsw i32 %0, %1, !dbg !11
+ ret i32 %add, !dbg !11
+}
+
+; IR for `main` from the C++ above. The loop in while.cond/while.body runs
+; while the pre-increment value of i is below 400000000; the if.then arm
+; calls _Z3sumii (at !dbg !18) and the if.else arm stores 30 into s.
+; Function Attrs: uwtable
+define i32 @main() !dbg !7 {
+entry:
+ %retval = alloca i32, align 4
+ %s = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4, !dbg !12
+ br label %while.cond, !dbg !13
+
+while.cond: ; preds = %if.end, %entry
+ %0 = load i32, i32* %i, align 4, !dbg !14
+ %inc = add nsw i32 %0, 1, !dbg !14
+ store i32 %inc, i32* %i, align 4, !dbg !14
+ %cmp = icmp slt i32 %0, 400000000, !dbg !14
+ br i1 %cmp, label %while.body, label %while.end, !dbg !14
+
+while.body: ; preds = %while.cond
+ %1 = load i32, i32* %i, align 4, !dbg !16
+ %cmp1 = icmp ne i32 %1, 100, !dbg !16
+ br i1 %cmp1, label %if.then, label %if.else, !dbg !16
+
+; Check _Z3sumii is inlined at this callsite.
+; CHECK: if.then:
+; CHECK-NOT: call i32 @_Z3sumii
+; CHECK: if.else:
+if.then: ; preds = %while.body
+ %2 = load i32, i32* %i, align 4, !dbg !18
+ %3 = load i32, i32* %s, align 4, !dbg !18
+ %call = call i32 @_Z3sumii(i32 %2, i32 %3), !dbg !18
+ store i32 %call, i32* %s, align 4, !dbg !18
+ br label %if.end, !dbg !18
+
+if.else: ; preds = %while.body
+ store i32 30, i32* %s, align 4, !dbg !20
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ br label %while.cond, !dbg !22
+
+while.end: ; preds = %while.cond
+ %4 = load i32, i32* %s, align 4, !dbg !24
+ %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %4), !dbg !24
+ ret i32 0, !dbg !25
+}
+
+declare i32 @printf(i8*, ...) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+; CHECK: ![[UNKNOWN_ID]] = !{!"function_entry_count", i64 -1}
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "sum", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "calls.cc", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{!"clang version 3.5 "}
+!11 = !DILocation(line: 4, scope: !4)
+!12 = !DILocation(line: 8, scope: !7)
+!13 = !DILocation(line: 9, scope: !7)
+!14 = !DILocation(line: 9, scope: !15)
+!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7)
+!16 = !DILocation(line: 10, scope: !17)
+!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
+!18 = !DILocation(line: 10, scope: !19)
+!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
+!20 = !DILocation(line: 10, scope: !21)
+!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17)
+!22 = !DILocation(line: 10, scope: !23)
+!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17)
+!24 = !DILocation(line: 11, scope: !7)
+!25 = !DILocation(line: 12, scope: !7)
OpenPOWER on IntegriCloud