From d9830eb79fdc42368d370abeab9a3b56c08e3963 Mon Sep 17 00:00:00 2001 From: Piotr Padlewski Date: Mon, 26 Sep 2016 20:37:32 +0000 Subject: [thinlto] Basic thinlto fdo heuristic Summary: This patch improves thinlto importer by importing 3x larger functions that are called from hot block. I compared performance with the trunk on spec, and there were about 2% on povray and 3.33% on milc. These results seems to be consistant and match the results Teresa got with her simple heuristic. Some benchmarks got slower but I think they are just noisy (mcf, xalancbmki, omnetpp)- running the benchmarks again with more iterations to confirm. Geomean of all benchmarks including the noisy ones were about +0.02%. I see much better improvement on google branch with Easwaran patch for pgo callsite inlining (the inliner actually inline those big functions) Over all I see +0.5% improvement, and I get +8.65% on povray. So I guess we will see much bigger change when Easwaran patch will land (it depends on new pass manager), but it is still worth putting this to trunk before it. Implementation details changes: - Removed CallsiteCount. - ProfileCount got replaced by Hotness - hot-import-multiplier is set to 3.0 for now, didn't have time to tune it up, but I see that we get most of the interesting functions with 3, so there is no much performance difference with higher, and binary size doesn't grow as much as with 10.0. Reviewers: eraman, mehdi_amini, tejohnson Subscribers: mehdi_amini, llvm-commits Differential Revision: https://reviews.llvm.org/D24638 llvm-svn: 282437 --- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 54 ++++++++++++++++++++------- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 57 +++++++++++++++++++---------- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 20 ++++------ llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 4 +- llvm/lib/Transforms/IPO/FunctionImport.cpp | 15 ++++++-- 5 files changed, 102 insertions(+), 48 deletions(-) (limited to 'llvm/lib') diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 1a8872adeb8..c736c6aa169 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/IndirectCallPromotionAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" @@ -63,8 +64,20 @@ static void findRefEdges(const User *CurUser, DenseSet &RefEdges, } } +static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount, + ProfileSummaryInfo *PSI) { + if (!PSI) + return CalleeInfo::HotnessType::Unknown; + if (PSI->isHotCount(ProfileCount)) + return CalleeInfo::HotnessType::Hot; + if (PSI->isColdCount(ProfileCount)) + return CalleeInfo::HotnessType::Cold; + return CalleeInfo::HotnessType::None; +} + static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, - const Function &F, BlockFrequencyInfo *BFI) { + const Function &F, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI) { // Summary not currently supported for anonymous functions, they must // be renamed. if (!F.hasName()) @@ -97,7 +110,10 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None; auto *CalleeId = M.getValueSymbolTable().lookup(CalledFunction->getName()); - CallGraphEdges[CalleeId] += (ScaledCount ? ScaledCount.getValue() : 0); + + auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI) + : CalleeInfo::HotnessType::Unknown; + CallGraphEdges[CalleeId].updateHotness(Hotness); } else { const auto *CI = dyn_cast(&I); // Skip inline assembly calls. @@ -113,7 +129,8 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, ICallAnalysis.getPromotionCandidatesForInstruction( &I, NumVals, TotalCount, NumCandidates); for (auto &Candidate : CandidateProfileData) - IndirectCallEdges[Candidate.Value] += Candidate.Count; + IndirectCallEdges[Candidate.Value].updateHotness( + getHotness(Candidate.Count, PSI)); } } @@ -140,7 +157,8 @@ static void computeVariableSummary(ModuleSummaryIndex &Index, ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, - std::function GetBFICallback) { + std::function GetBFICallback, + ProfileSummaryInfo *PSI) { ModuleSummaryIndex Index; // Check if the module can be promoted, otherwise just disable importing from // it by not emitting any summary. @@ -165,7 +183,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( BFI = BFIPtr.get(); } - computeFunctionSummary(Index, M, F, BFI); + computeFunctionSummary(Index, M, F, BFI, PSI); } // Compute summaries for all variables defined in module, and save in the @@ -182,10 +200,15 @@ char ModuleSummaryIndexAnalysis::PassID; ModuleSummaryIndex ModuleSummaryIndexAnalysis::run(Module &M, ModuleAnalysisManager &AM) { + ProfileSummaryInfo &PSI = AM.getResult(M); auto &FAM = AM.getResult(M).getManager(); - return buildModuleSummaryIndex(M, [&FAM](const Function &F) { - return &FAM.getResult(*const_cast(&F)); - }); + return buildModuleSummaryIndex( + M, + [&FAM](const Function &F) { + return &FAM.getResult( + *const_cast(&F)); + }, + &PSI); } char ModuleSummaryIndexWrapperPass::ID = 0; @@ -205,11 +228,15 @@ ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass() } bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) { - Index = buildModuleSummaryIndex(M, [this](const Function &F) { - return &(this->getAnalysis( - *const_cast(&F)) - .getBFI()); - }); + auto &PSI = *getAnalysis().getPSI(M); + Index = buildModuleSummaryIndex( + M, + [this](const Function &F) { + return &(this->getAnalysis( + *const_cast(&F)) + .getBFI()); + }, + &PSI); return false; } @@ -221,6 +248,7 @@ bool ModuleSummaryIndexWrapperPass::doFinalization(Module &M) { void ModuleSummaryIndexWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); + AU.addRequired(); } bool llvm::moduleCanBeRenamedForThinLTO(const Module &M) { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 49bf8136f4e..101e8eba6b1 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -651,6 +651,9 @@ private: std::pair getGUIDFromValueId(unsigned ValueId); + std::pair + readCallGraphEdge(const SmallVector &Record, unsigned int &I, + bool IsOldProfileFormat, bool HasProfile); }; } // end anonymous namespace @@ -6218,8 +6221,10 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { return error("Invalid Summary Block: version expected"); } const uint64_t Version = Record[0]; - if (Version != 1) - return error("Invalid summary version " + Twine(Version) + ", 1 expected"); + const bool IsOldProfileFormat = Version == 1; + if (!IsOldProfileFormat && Version != 2) + return error("Invalid summary version " + Twine(Version) + + ", 1 or 2 expected"); Record.clear(); // Keep around the last seen summary to be used when we see an optional @@ -6264,10 +6269,10 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { default: // Default behavior: ignore. break; // FS_PERMODULE: [valueid, flags, instcount, numrefs, numrefs x valueid, - // n x (valueid, callsitecount)] + // n x (valueid)] // FS_PERMODULE_PROFILE: [valueid, flags, instcount, numrefs, // numrefs x valueid, - // n x (valueid, callsitecount, profilecount)] + // n x (valueid, hotness)] case bitc::FS_PERMODULE: case bitc::FS_PERMODULE_PROFILE: { unsigned ValueID = Record[0]; @@ -6296,12 +6301,11 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { bool HasProfile = (BitCode == bitc::FS_PERMODULE_PROFILE); for (unsigned I = CallGraphEdgeStartIndex, E = Record.size(); I != E; ++I) { - unsigned CalleeValueId = Record[I]; - unsigned CallsiteCount = Record[++I]; - uint64_t ProfileCount = HasProfile ? Record[++I] : 0; - GlobalValue::GUID CalleeGUID = getGUIDFromValueId(CalleeValueId).first; - FS->addCallGraphEdge(CalleeGUID, - CalleeInfo(CallsiteCount, ProfileCount)); + CalleeInfo::HotnessType Hotness; + GlobalValue::GUID CalleeGUID; + std::tie(CalleeGUID, Hotness) = + readCallGraphEdge(Record, I, IsOldProfileFormat, HasProfile); + FS->addCallGraphEdge(CalleeGUID, CalleeInfo(Hotness)); } auto GUID = getGUIDFromValueId(ValueID); FS->setOriginalName(GUID.second); @@ -6356,10 +6360,9 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { break; } // FS_COMBINED: [valueid, modid, flags, instcount, numrefs, - // numrefs x valueid, n x (valueid, callsitecount)] + // numrefs x valueid, n x (valueid)] // FS_COMBINED_PROFILE: [valueid, modid, flags, instcount, numrefs, - // numrefs x valueid, - // n x (valueid, callsitecount, profilecount)] + // numrefs x valueid, n x (valueid, hotness)] case bitc::FS_COMBINED: case bitc::FS_COMBINED_PROFILE: { unsigned ValueID = Record[0]; @@ -6385,12 +6388,11 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { bool HasProfile = (BitCode == bitc::FS_COMBINED_PROFILE); for (unsigned I = CallGraphEdgeStartIndex, E = Record.size(); I != E; ++I) { - unsigned CalleeValueId = Record[I]; - unsigned CallsiteCount = Record[++I]; - uint64_t ProfileCount = HasProfile ? Record[++I] : 0; - GlobalValue::GUID CalleeGUID = getGUIDFromValueId(CalleeValueId).first; - FS->addCallGraphEdge(CalleeGUID, - CalleeInfo(CallsiteCount, ProfileCount)); + CalleeInfo::HotnessType Hotness; + GlobalValue::GUID CalleeGUID; + std::tie(CalleeGUID, Hotness) = + readCallGraphEdge(Record, I, IsOldProfileFormat, HasProfile); + FS->addCallGraphEdge(CalleeGUID, CalleeInfo(Hotness)); } GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first; TheIndex->addGlobalValueSummary(GUID, std::move(FS)); @@ -6456,6 +6458,23 @@ std::error_code ModuleSummaryIndexBitcodeReader::parseEntireSummary() { llvm_unreachable("Exit infinite loop"); } +std::pair +ModuleSummaryIndexBitcodeReader::readCallGraphEdge( + const SmallVector &Record, unsigned int &I, + const bool IsOldProfileFormat, const bool HasProfile) { + + auto Hotness = CalleeInfo::HotnessType::Unknown; + unsigned CalleeValueId = Record[I]; + GlobalValue::GUID CalleeGUID = getGUIDFromValueId(CalleeValueId).first; + if (IsOldProfileFormat) { + I += 1; // Skip old callsitecount field + if (HasProfile) + I += 1; // Skip old profilecount field + } else if (HasProfile) + Hotness = static_cast(Record[++I]); + return {CalleeGUID, Hotness}; +} + // Parse the module string table block into the Index. // This populates the ModulePathStringTable map in the index. std::error_code ModuleSummaryIndexBitcodeReader::parseModuleStringTable() { diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index a353edf7aec..af722723845 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3293,10 +3293,8 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord( bool HasProfileData = F.getEntryCount().hasValue(); for (auto &ECI : Calls) { NameVals.push_back(getValueId(ECI.first)); - assert(ECI.second.CallsiteCount > 0 && "Expected at least one callsite"); - NameVals.push_back(ECI.second.CallsiteCount); if (HasProfileData) - NameVals.push_back(ECI.second.ProfileCount); + NameVals.push_back(static_cast(ECI.second.Hotness)); } unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev); @@ -3336,7 +3334,7 @@ void ModuleBitcodeWriter::writeModuleLevelReferences( // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 1; +static const uint64_t INDEX_VERSION = 2; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3357,7 +3355,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - // numrefs x valueid, n x (valueid, callsitecount) + // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); @@ -3369,7 +3367,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - // numrefs x valueid, n x (valueid, callsitecount, profilecount) + // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); @@ -3442,7 +3440,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - // numrefs x valueid, n x (valueid, callsitecount) + // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); @@ -3455,7 +3453,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs - // numrefs x valueid, n x (valueid, callsitecount, profilecount) + // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); @@ -3542,7 +3540,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { bool HasProfileData = false; for (auto &EI : FS->calls()) { - HasProfileData |= EI.second.ProfileCount != 0; + HasProfileData |= EI.second.Hotness != CalleeInfo::HotnessType::Unknown; if (HasProfileData) break; } @@ -3553,10 +3551,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { if (!hasValueId(EI.first.getGUID())) continue; NameVals.push_back(getValueId(EI.first.getGUID())); - assert(EI.second.CallsiteCount > 0 && "Expected at least one callsite"); - NameVals.push_back(EI.second.CallsiteCount); if (HasProfileData) - NameVals.push_back(EI.second.ProfileCount); + NameVals.push_back(static_cast(EI.second.Hotness)); } unsigned FSAbbrev = (HasProfileData ? FSCallsProfileAbbrev : FSCallsAbbrev); diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index ae646b7347a..f4232dc2f89 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" @@ -377,7 +378,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, SmallVector OutputBuffer; { raw_svector_ostream OS(OutputBuffer); - auto Index = buildModuleSummaryIndex(TheModule); + ProfileSummaryInfo PSI(TheModule); + auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr); WriteBitcodeToFile(&TheModule, OS, true, &Index); } return make_unique(std::move(OutputBuffer)); diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 81d9ca5638d..6c43b780870 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -48,6 +48,10 @@ static cl::opt cl::desc("As we import functions, multiply the " "`import-instr-limit` threshold by this factor " "before processing newly imported functions")); +static cl::opt ImportHotMultiplier( + "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"), + cl::ZeroOrMore, cl::desc("Multiply the `import-instr-limit` threshold for " + "hot callsites")); static cl::opt PrintImports("print-imports", cl::init(false), cl::Hidden, cl::desc("Print imported functions")); @@ -268,7 +272,7 @@ using EdgeInfo = std::pair; /// exported from their source module. static void computeImportForFunction( const FunctionSummary &Summary, const ModuleSummaryIndex &Index, - unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries, + const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries, SmallVectorImpl &Worklist, FunctionImporter::ImportMapTy &ImportList, StringMap *ExportLists = nullptr) { @@ -281,7 +285,12 @@ static void computeImportForFunction( continue; } - auto *CalleeSummary = selectCallee(GUID, Threshold, Index); + // FIXME: Also lower the threshold for cold callsites. + const auto NewThreshold = + Edge.second.Hotness == CalleeInfo::HotnessType::Hot + ? Threshold * ImportHotMultiplier + : Threshold; + auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index); if (!CalleeSummary) { DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n"); continue; @@ -297,7 +306,7 @@ static void computeImportForFunction( } else ResolvedCalleeSummary = cast(CalleeSummary); - assert(ResolvedCalleeSummary->instCount() <= Threshold && + assert(ResolvedCalleeSummary->instCount() <= NewThreshold && "selectCallee() didn't honor the threshold"); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); -- cgit v1.2.3