From c94d4d70d8116edab8de1663edc4f7bf6db24c78 Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi Date: Mon, 20 Nov 2017 21:03:38 +0000 Subject: Add heuristics for irreducible loop metadata under PGO Summary: Add the following heuristics for irreducible loop metadata: - When an irreducible loop header is missing the loop header weight metadata, give it the minimum weight seen among other headers. - Annotate indirectbr targets with the loop header weight metadata (as they are likely to become irreducible loop headers after indirectbr tail duplication.) These greatly improve the accuracy of the block frequency info of the Python interpreter loop (e.g. from ~3-16x off down to ~40-55% off) and the Python performance (e.g. unpack_sequence from ~50% slower to ~8% faster than GCC) due to better register allocation under PGO. Reviewers: davidxl Reviewed By: davidxl Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39980 llvm-svn: 318693 --- .../include/llvm/Analysis/BlockFrequencyInfoImpl.h | 44 +++++++++++---- .../Instrumentation/PGOInstrumentation.cpp | 13 ++++- .../Analysis/BlockFrequencyInfo/irreducible_pgo.ll | 65 ++++++++++++++++++++++ llvm/test/Transforms/PGOProfile/irreducible.ll | 2 + 4 files changed, 112 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 7b916e3653b..91056797faa 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -16,6 +16,7 @@ #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" @@ -1155,35 +1156,56 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { DEBUG(dbgs() << "isIrreducible = true\n"); Distribution Dist; unsigned NumHeadersWithWeight = 0; + Optional<uint64_t> MinHeaderWeight; +
DenseSet<uint32_t> HeadersWithoutWeight; + HeadersWithoutWeight.reserve(Loop.NumHeaders); for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { auto &HeaderNode = Loop.Nodes[H]; const BlockT *Block = getBlock(HeaderNode); IsIrrLoopHeader.set(Loop.Nodes[H].Index); Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight(); - if (!HeaderWeight) + if (!HeaderWeight) { + DEBUG(dbgs() << "Missing irr loop header metadata on " + << getBlockName(HeaderNode) << "\n"); + HeadersWithoutWeight.insert(H); continue; + } DEBUG(dbgs() << getBlockName(HeaderNode) << " has irr loop header weight " << HeaderWeight.getValue() << "\n"); NumHeadersWithWeight++; uint64_t HeaderWeightValue = HeaderWeight.getValue(); - if (HeaderWeightValue) + if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight) + MinHeaderWeight = HeaderWeightValue; + if (HeaderWeightValue) { Dist.addLocal(HeaderNode, HeaderWeightValue); - } - if (NumHeadersWithWeight != Loop.NumHeaders) { - // Not all headers have a weight metadata. Distribute weight evenly. - Dist = Distribution(); - for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &HeaderNode = Loop.Nodes[H]; - Dist.addLocal(HeaderNode, 1); } } + // As a heuristic, if some headers don't have a weight, give them the + // minimum weight seen (not to disrupt the existing trends too much by + // using a weight that's in the general range of the other headers' weights, + // and the minimum seems to perform better than the average.) + // FIXME: better update in the passes that drop the header weight. + // If no headers have a weight, give them even weight (use weight 1).
+ if (!MinHeaderWeight) + MinHeaderWeight = 1; + for (uint32_t H : HeadersWithoutWeight) { + auto &HeaderNode = Loop.Nodes[H]; + const BlockT *Block = getBlock(HeaderNode); + assert(!Block->getIrrLoopHeaderWeight() && + "Shouldn't have a weight metadata"); + uint64_t MinWeight = MinHeaderWeight.getValue(); + DEBUG(dbgs() << "Giving weight " << MinWeight + << " to " << getBlockName(HeaderNode) << "\n"); + if (MinWeight) + Dist.addLocal(HeaderNode, MinWeight); + } distributeIrrLoopHeaderMass(Dist); for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); - if (NumHeadersWithWeight != Loop.NumHeaders) - // Not all headers have a weight metadata. Adjust header mass. + if (NumHeadersWithWeight == 0) + // No headers have weight metadata. Adjust header mass. adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c92d48396c8..47278e19283 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1188,11 +1188,22 @@ void PGOUseFunc::setBranchWeights() { } } +static bool isIndirectBrTarget(BasicBlock *BB) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + if (isa<IndirectBrInst>((*PI)->getTerminator())) + return true; + } + return false; +} + void PGOUseFunc::annotateIrrLoopHeaderWeights() { DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); // Find irr loop headers for (auto &BB : F) { - if (BFI->isIrrLoopHeader(&BB)) { + // As a heuristic also annotate indirectbr targets as they have a high chance + // to become an irreducible loop header after the indirectbr tail + // duplication.
+ if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) { TerminatorInst *TI = BB.getTerminator(); const UseBBInfo &BBCountInfo = getBBInfo(&BB); setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); diff --git a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll index 3eb0597a957..8a18cbaf896 100644 --- a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll +++ b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll @@ -159,3 +159,68 @@ indirectgoto: ; preds = %if.then18, %if.then ; CHECK-NEXT: - sw.default: {{.*}} count = 0 ; CHECK-NEXT: - exit: {{.*}} count = 1 ; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400 + +; Missing some irr loop annotations. +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh2(i8* nocapture readonly %p) !prof !27 { +entry: + %0 = load i32, i32* @tracing, align 4 + %1 = trunc i32 %0 to i8 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 +switch i8 %1, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ], !prof !36 + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + br label %exit + +TARGET_1: ; preds = %indirectgoto + br label %sw.bb6 + +sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge + br i1 %tobool, label %dispatch_op, label %if.then, !prof !37 ; Missing !irr_loop !38 + +if.then: ; preds = %sw.bb6 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40 + +if.then18: ; preds = %sw.bb15 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + br label %sw.default + +sw.default: ; preds = 
%unknown_op, %dispatch_op + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + ret i32 0 + +indirectgoto: ; preds = %if.then18, %if.then + %idxprom21 = zext i32 %0 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %target = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %target, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42 +} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh2': +; CHECK: block-frequency-info: _Z11irreduciblePh2 +; CHECK: - sw.bb6: {{.*}} count = 100 +; CHECK: - sw.bb15: {{.*}} count = 100, irr_loop_header_weight = 100 +; CHECK: - indirectgoto: {{.*}} count = 400, irr_loop_header_weight = 400 diff --git a/llvm/test/Transforms/PGOProfile/irreducible.ll b/llvm/test/Transforms/PGOProfile/irreducible.ll index 9b2c8f638ed..9394b724f7e 100644 --- a/llvm/test/Transforms/PGOProfile/irreducible.ll +++ b/llvm/test/Transforms/PGOProfile/irreducible.ll @@ -91,6 +91,7 @@ sw.bb: ; preds = %indirectgoto, %disp TARGET_1: ; preds = %indirectgoto br label %sw.bb6 +; USE: br label %sw.bb6, !irr_loop {{.*}} sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge br i1 %tobool, label %dispatch_op, label %if.then @@ -102,6 +103,7 @@ if.then: ; preds = %sw.bb6 TARGET_2: ; preds = %indirectgoto br label %sw.bb15 +; USE: br label %sw.bb15, !irr_loop {{.*}} sw.bb15: ; preds = %TARGET_2, %dispatch_op br i1 %tobool, label %if.then18, label %exit -- cgit v1.2.3