diff options
author | Hiroshi Yamauchi <yamauchi@google.com> | 2017-11-20 21:03:38 +0000 |
---|---|---|
committer | Hiroshi Yamauchi <yamauchi@google.com> | 2017-11-20 21:03:38 +0000 |
commit | c94d4d70d8116edab8de1663edc4f7bf6db24c78 (patch) | |
tree | dd259e6efdb0acde8c95573272485d472e93d8c5 | |
parent | 62fae15600672dc90cfaa4eac3865b79f758ae01 (diff) | |
download | bcm5719-llvm-c94d4d70d8116edab8de1663edc4f7bf6db24c78.tar.gz bcm5719-llvm-c94d4d70d8116edab8de1663edc4f7bf6db24c78.zip |
Add heuristics for irreducible loop metadata under PGO
Summary:
Add the following heuristics for irreducible loop metadata:
- When an irreducible loop header is missing the loop header weight metadata,
give it the minimum weight seen among other headers.
- Annotate indirectbr targets with the loop header weight metadata, as they are
likely to become irreducible loop headers after indirectbr tail duplication.
These greatly improve the accuracy of the block frequency info of the Python
interpreter loop (e.g., from ~3-16x off down to ~40-55% off) and Python
performance (e.g., unpack_sequence goes from ~50% slower to ~8% faster than GCC),
thanks to better register allocation under PGO.
Reviewers: davidxl
Reviewed By: davidxl
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D39980
llvm-svn: 318693
4 files changed, 112 insertions, 12 deletions
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 7b916e3653b..91056797faa 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -16,6 +16,7 @@ #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" @@ -1155,35 +1156,56 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) { DEBUG(dbgs() << "isIrreducible = true\n"); Distribution Dist; unsigned NumHeadersWithWeight = 0; + Optional<uint64_t> MinHeaderWeight; + DenseSet<uint32_t> HeadersWithoutWeight; + HeadersWithoutWeight.reserve(Loop.NumHeaders); for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { auto &HeaderNode = Loop.Nodes[H]; const BlockT *Block = getBlock(HeaderNode); IsIrrLoopHeader.set(Loop.Nodes[H].Index); Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight(); - if (!HeaderWeight) + if (!HeaderWeight) { + DEBUG(dbgs() << "Missing irr loop header metadata on " + << getBlockName(HeaderNode) << "\n"); + HeadersWithoutWeight.insert(H); continue; + } DEBUG(dbgs() << getBlockName(HeaderNode) << " has irr loop header weight " << HeaderWeight.getValue() << "\n"); NumHeadersWithWeight++; uint64_t HeaderWeightValue = HeaderWeight.getValue(); - if (HeaderWeightValue) + if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight) + MinHeaderWeight = HeaderWeightValue; + if (HeaderWeightValue) { Dist.addLocal(HeaderNode, HeaderWeightValue); - } - if (NumHeadersWithWeight != Loop.NumHeaders) { - // Not all headers have a weight metadata. Distribute weight evenly. 
- Dist = Distribution(); - for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &HeaderNode = Loop.Nodes[H]; - Dist.addLocal(HeaderNode, 1); } } + // As a heuristic, if some headers don't have a weight, give them the + // minimium weight seen (not to disrupt the existing trends too much by + // using a weight that's in the general range of the other headers' weights, + // and the minimum seems to perform better than the average.) + // FIXME: better update in the passes that drop the header weight. + // If no headers have a weight, give them even weight (use weight 1). + if (!MinHeaderWeight) + MinHeaderWeight = 1; + for (uint32_t H : HeadersWithoutWeight) { + auto &HeaderNode = Loop.Nodes[H]; + const BlockT *Block = getBlock(HeaderNode); + assert(!Block->getIrrLoopHeaderWeight() && + "Shouldn't have a weight metadata"); + uint64_t MinWeight = MinHeaderWeight.getValue(); + DEBUG(dbgs() << "Giving weight " << MinWeight + << " to " << getBlockName(HeaderNode) << "\n"); + if (MinWeight) + Dist.addLocal(HeaderNode, MinWeight); + } distributeIrrLoopHeaderMass(Dist); for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); - if (NumHeadersWithWeight != Loop.NumHeaders) - // Not all headers have a weight metadata. Adjust header mass. + if (NumHeadersWithWeight == 0) + // No headers have a metadata. Adjust header mass. 
adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c92d48396c8..47278e19283 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1188,11 +1188,22 @@ void PGOUseFunc::setBranchWeights() { } } +static bool isIndirectBrTarget(BasicBlock *BB) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + if (isa<IndirectBrInst>((*PI)->getTerminator())) + return true; + } + return false; +} + void PGOUseFunc::annotateIrrLoopHeaderWeights() { DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); // Find irr loop headers for (auto &BB : F) { - if (BFI->isIrrLoopHeader(&BB)) { + // As a heuristic also annotate indrectbr targets as they have a high chance + // to become an irreducible loop header after the indirectbr tail + // duplication. + if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) { TerminatorInst *TI = BB.getTerminator(); const UseBBInfo &BBCountInfo = getBBInfo(&BB); setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); diff --git a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll index 3eb0597a957..8a18cbaf896 100644 --- a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll +++ b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll @@ -159,3 +159,68 @@ indirectgoto: ; preds = %if.then18, %if.then ; CHECK-NEXT: - sw.default: {{.*}} count = 0 ; CHECK-NEXT: - exit: {{.*}} count = 1 ; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400 + +; Missing some irr loop annotations. 
+; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh2(i8* nocapture readonly %p) !prof !27 { +entry: + %0 = load i32, i32* @tracing, align 4 + %1 = trunc i32 %0 to i8 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 +switch i8 %1, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ], !prof !36 + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + br label %exit + +TARGET_1: ; preds = %indirectgoto + br label %sw.bb6 + +sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge + br i1 %tobool, label %dispatch_op, label %if.then, !prof !37 ; Missing !irr_loop !38 + +if.then: ; preds = %sw.bb6 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40 + +if.then18: ; preds = %sw.bb15 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + ret i32 0 + +indirectgoto: ; preds = %if.then18, %if.then + %idxprom21 = zext i32 %0 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %target = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %target, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42 +} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh2': +; CHECK: block-frequency-info: _Z11irreduciblePh2 +; CHECK: - sw.bb6: {{.*}} count = 100 +; CHECK: - sw.bb15: {{.*}} count = 100, irr_loop_header_weight = 100 +; CHECK: - indirectgoto: {{.*}} count = 400, irr_loop_header_weight = 400 diff --git 
a/llvm/test/Transforms/PGOProfile/irreducible.ll b/llvm/test/Transforms/PGOProfile/irreducible.ll index 9b2c8f638ed..9394b724f7e 100644 --- a/llvm/test/Transforms/PGOProfile/irreducible.ll +++ b/llvm/test/Transforms/PGOProfile/irreducible.ll @@ -91,6 +91,7 @@ sw.bb: ; preds = %indirectgoto, %disp TARGET_1: ; preds = %indirectgoto br label %sw.bb6 +; USE: br label %sw.bb6, !irr_loop {{.*}} sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge br i1 %tobool, label %dispatch_op, label %if.then @@ -102,6 +103,7 @@ if.then: ; preds = %sw.bb6 TARGET_2: ; preds = %indirectgoto br label %sw.bb15 +; USE: br label %sw.bb15, !irr_loop {{.*}} sw.bb15: ; preds = %TARGET_2, %dispatch_op br i1 %tobool, label %if.then18, label %exit |