summaryrefslogtreecommitdiffstats
path: root/llvm/lib/CodeGen/MachineBlockPlacement.cpp
diff options
context:
space:
mode:
authorCong Hou <congh@google.com>2015-11-02 21:24:00 +0000
committerCong Hou <congh@google.com>2015-11-02 21:24:00 +0000
commitb90b9e053109f0da882db37a6071536b2c385393 (patch)
tree4ffa08591c66d360803de87587d15d65c6b58d41 /llvm/lib/CodeGen/MachineBlockPlacement.cpp
parent935d79b0b11f3891a413a2c590602d7f60cf2922 (diff)
downloadbcm5719-llvm-b90b9e053109f0da882db37a6071536b2c385393.tar.gz
bcm5719-llvm-b90b9e053109f0da882db37a6071536b2c385393.zip
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.
In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality. Consider a simple example shown below: A | B⇆C | D When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D. This patch filters those cold blocks off from the loop chain by comparing the ratio: LoopBBFreq / LoopFreq to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa. Differential revision: http://reviews.llvm.org/D11662 llvm-svn: 251833
Diffstat (limited to 'llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp48
1 files changed, 46 insertions, 2 deletions
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 0383d2c56dd..b12fcf2940e 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -81,6 +81,12 @@ static cl::opt<unsigned> OutlineOptionalThreshold(
"instruction count below this threshold"),
cl::init(4), cl::Hidden);
+static cl::opt<unsigned> LoopToColdBlockRatio(
+ "loop-to-cold-block-ratio",
+ cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+ "(frequency of block) is greater than this ratio"),
+ cl::init(5), cl::Hidden);
+
static cl::opt<bool>
PreciseRotationCost("precise-rotation-cost",
cl::desc("Model the cost of loop rotation more "
@@ -263,6 +269,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
const BlockFilterSet &LoopBlockSet);
MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L,
const BlockFilterSet &LoopBlockSet);
+ BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L);
void buildLoopChains(MachineFunction &F, MachineLoop &L);
void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
const BlockFilterSet &LoopBlockSet);
@@ -960,6 +967,42 @@ void MachineBlockPlacement::rotateLoopWithProfile(
}
}
+/// \brief Collect blocks in the given loop that are to be placed.
+///
+/// When profile data is available, exclude cold blocks from the returned set;
+/// otherwise, collect all blocks in the loop.
+MachineBlockPlacement::BlockFilterSet
+MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
+ BlockFilterSet LoopBlockSet;
+
+ // Filter cold blocks off from LoopBlockSet when profile data is available.
+ // Collect the sum of frequencies of incoming edges to the loop header from
+ // outside. If we treat the loop as a super block, this is the frequency of
+ // the loop. Then for each block in the loop, we calculate the ratio between
+ // its frequency and the frequency of the loop block. When it is too small,
+ // don't add it to the loop chain. If there are outer loops, then this block
+ // will be merged into the first outer loop chain for which this block is not
+ // cold anymore. This needs precise profile data and we only do this when
+ // profile data is available.
+ if (F.getFunction()->getEntryCount()) {
+ BlockFrequency LoopFreq(0);
+ for (auto LoopPred : L.getHeader()->predecessors())
+ if (!L.contains(LoopPred))
+ LoopFreq += MBFI->getBlockFreq(LoopPred) *
+ MBPI->getEdgeProbability(LoopPred, L.getHeader());
+
+ for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+ auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
+ if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
+ continue;
+ LoopBlockSet.insert(LoopBB);
+ }
+ } else
+ LoopBlockSet.insert(L.block_begin(), L.block_end());
+
+ return LoopBlockSet;
+}
+
/// \brief Forms basic block chains from the natural loop structures.
///
/// These chains are designed to preserve the existing *structure* of the code
@@ -974,7 +1017,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
buildLoopChains(F, *InnerLoop);
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
- BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
+ BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L);
// Check if we have profile data for this function. If yes, we will rotate
// this loop by modeling costs more precisely which requires the profile data
@@ -1005,7 +1048,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
assert(LoopChain.LoopPredecessors == 0);
UpdatedPreds.insert(&LoopChain);
- for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+
+ for (MachineBasicBlock *LoopBB : LoopBlockSet) {
BlockChain &Chain = *BlockToChain[LoopBB];
if (!UpdatedPreds.insert(&Chain).second)
continue;
OpenPOWER on IntegriCloud