summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHiroshi Yamauchi <yamauchi@google.com>2017-11-20 21:03:38 +0000
committerHiroshi Yamauchi <yamauchi@google.com>2017-11-20 21:03:38 +0000
commitc94d4d70d8116edab8de1663edc4f7bf6db24c78 (patch)
treedd259e6efdb0acde8c95573272485d472e93d8c5
parent62fae15600672dc90cfaa4eac3865b79f758ae01 (diff)
downloadbcm5719-llvm-c94d4d70d8116edab8de1663edc4f7bf6db24c78.tar.gz
bcm5719-llvm-c94d4d70d8116edab8de1663edc4f7bf6db24c78.zip
Add heuristics for irreducible loop metadata under PGO
Summary: Add the following heuristics for irreducible loop metadata: - When an irreducible loop header is missing the loop header weight metadata, give it the minimum weight seen among other headers. - Annotate indirectbr targets with the loop header weight metadata (as they are likely to become irreducible loop headers after indirectbr tail duplication). These greatly improve the accuracy of the block frequency info of the Python interpreter loop (e.g., from ~3-16x off down to ~40-55% off) and the Python performance (e.g., unpack_sequence from ~50% slower to ~8% faster than GCC) due to better register allocation under PGO. Reviewers: davidxl Reviewed By: davidxl Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39980 llvm-svn: 318693
-rw-r--r--llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h44
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp13
-rw-r--r--llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll65
-rw-r--r--llvm/test/Transforms/PGOProfile/irreducible.ll2
4 files changed, 112 insertions, 12 deletions
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 7b916e3653b..91056797faa 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -16,6 +16,7 @@
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -1155,35 +1156,56 @@ bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
DEBUG(dbgs() << "isIrreducible = true\n");
Distribution Dist;
unsigned NumHeadersWithWeight = 0;
+ Optional<uint64_t> MinHeaderWeight;
+ DenseSet<uint32_t> HeadersWithoutWeight;
+ HeadersWithoutWeight.reserve(Loop.NumHeaders);
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &HeaderNode = Loop.Nodes[H];
const BlockT *Block = getBlock(HeaderNode);
IsIrrLoopHeader.set(Loop.Nodes[H].Index);
Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
- if (!HeaderWeight)
+ if (!HeaderWeight) {
+ DEBUG(dbgs() << "Missing irr loop header metadata on "
+ << getBlockName(HeaderNode) << "\n");
+ HeadersWithoutWeight.insert(H);
continue;
+ }
DEBUG(dbgs() << getBlockName(HeaderNode)
<< " has irr loop header weight " << HeaderWeight.getValue()
<< "\n");
NumHeadersWithWeight++;
uint64_t HeaderWeightValue = HeaderWeight.getValue();
- if (HeaderWeightValue)
+ if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
+ MinHeaderWeight = HeaderWeightValue;
+ if (HeaderWeightValue) {
Dist.addLocal(HeaderNode, HeaderWeightValue);
- }
- if (NumHeadersWithWeight != Loop.NumHeaders) {
- // Not all headers have a weight metadata. Distribute weight evenly.
- Dist = Distribution();
- for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
- auto &HeaderNode = Loop.Nodes[H];
- Dist.addLocal(HeaderNode, 1);
}
}
+ // As a heuristic, if some headers don't have a weight, give them the
+ // minimum weight seen (not to disrupt the existing trends too much by
+ // using a weight that's in the general range of the other headers' weights,
+ // and the minimum seems to perform better than the average.)
+ // FIXME: better update in the passes that drop the header weight.
+ // If no headers have a weight, give them even weight (use weight 1).
+ if (!MinHeaderWeight)
+ MinHeaderWeight = 1;
+ for (uint32_t H : HeadersWithoutWeight) {
+ auto &HeaderNode = Loop.Nodes[H];
+ const BlockT *Block = getBlock(HeaderNode);
+ assert(!Block->getIrrLoopHeaderWeight() &&
+ "Shouldn't have a weight metadata");
+ uint64_t MinWeight = MinHeaderWeight.getValue();
+ DEBUG(dbgs() << "Giving weight " << MinWeight
+ << " to " << getBlockName(HeaderNode) << "\n");
+ if (MinWeight)
+ Dist.addLocal(HeaderNode, MinWeight);
+ }
distributeIrrLoopHeaderMass(Dist);
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
- if (NumHeadersWithWeight != Loop.NumHeaders)
- // Not all headers have a weight metadata. Adjust header mass.
+ if (NumHeadersWithWeight == 0)
+ // No headers have weight metadata. Adjust header mass.
adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c92d48396c8..47278e19283 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1188,11 +1188,22 @@ void PGOUseFunc::setBranchWeights() {
}
}
+static bool isIndirectBrTarget(BasicBlock *BB) {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ return true;
+ }
+ return false;
+}
+
void PGOUseFunc::annotateIrrLoopHeaderWeights() {
DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
// Find irr loop headers
for (auto &BB : F) {
- if (BFI->isIrrLoopHeader(&BB)) {
+ // As a heuristic, also annotate indirectbr targets as they have a high chance
+ // to become an irreducible loop header after the indirectbr tail
+ // duplication.
+ if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
TerminatorInst *TI = BB.getTerminator();
const UseBBInfo &BBCountInfo = getBBInfo(&BB);
setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
index 3eb0597a957..8a18cbaf896 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
@@ -159,3 +159,68 @@ indirectgoto: ; preds = %if.then18, %if.then
; CHECK-NEXT: - sw.default: {{.*}} count = 0
; CHECK-NEXT: - exit: {{.*}} count = 1
; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400
+
+; Missing some irr loop annotations.
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh2(i8* nocapture readonly %p) !prof !27 {
+entry:
+ %0 = load i32, i32* @tracing, align 4
+ %1 = trunc i32 %0 to i8
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+switch i8 %1, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ], !prof !36
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ br label %sw.bb6
+
+sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
+ br i1 %tobool, label %dispatch_op, label %if.then, !prof !37 ; Missing !irr_loop !38
+
+if.then: ; preds = %sw.bb6
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
+
+if.then18: ; preds = %sw.bb15
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ ret i32 0
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %idxprom21 = zext i32 %0 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %target = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %target, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
+}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh2':
+; CHECK: block-frequency-info: _Z11irreduciblePh2
+; CHECK: - sw.bb6: {{.*}} count = 100
+; CHECK: - sw.bb15: {{.*}} count = 100, irr_loop_header_weight = 100
+; CHECK: - indirectgoto: {{.*}} count = 400, irr_loop_header_weight = 400
diff --git a/llvm/test/Transforms/PGOProfile/irreducible.ll b/llvm/test/Transforms/PGOProfile/irreducible.ll
index 9b2c8f638ed..9394b724f7e 100644
--- a/llvm/test/Transforms/PGOProfile/irreducible.ll
+++ b/llvm/test/Transforms/PGOProfile/irreducible.ll
@@ -91,6 +91,7 @@ sw.bb: ; preds = %indirectgoto, %disp
TARGET_1: ; preds = %indirectgoto
br label %sw.bb6
+; USE: br label %sw.bb6, !irr_loop {{.*}}
sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
br i1 %tobool, label %dispatch_op, label %if.then
@@ -102,6 +103,7 @@ if.then: ; preds = %sw.bb6
TARGET_2: ; preds = %indirectgoto
br label %sw.bb15
+; USE: br label %sw.bb15, !irr_loop {{.*}}
sw.bb15: ; preds = %TARGET_2, %dispatch_op
br i1 %tobool, label %if.then18, label %exit
OpenPOWER on IntegriCloud