summaryrefslogtreecommitdiffstats
path: root/llvm/lib/CodeGen/MachineCombiner.cpp
diff options
context:
space:
mode:
authorSebastian Pop <sebpop@gmail.com>2016-12-11 19:39:32 +0000
committerSebastian Pop <sebpop@gmail.com>2016-12-11 19:39:32 +0000
commite08d9c7c8748670b4fa0e215ac11c89675bda6ab (patch)
tree33d5cff1132ebc45c51a91bf210cf7583c731b59 /llvm/lib/CodeGen/MachineCombiner.cpp
parent8766a76f3d561c42c95281f2ca0f8da489a5d824 (diff)
downloadbcm5719-llvm-e08d9c7c8748670b4fa0e215ac11c89675bda6ab.tar.gz
bcm5719-llvm-e08d9c7c8748670b4fa0e215ac11c89675bda6ab.zip
instr-combiner: sum up all latencies of the transformed instructions
We have found that -- when the selected subarchitecture has a scheduling model and we are not optimizing for size -- the machine-instruction combiner uses a too-simple algorithm to compute the cost of one of the two alternatives [before and after running a combining pass on a section of code], and therefor it throws away the combination results too often. This fix has the potential to help any ISA with the potential to combine instructions and for which at least one subarchitecture has a scheduling model. As of now, this is only known to definitely affect AArch64 subarchitectures with a scheduling model. Regression tested on AMD64/GNU-Linux, new test case tested to fail on an unpatched compiler and pass on a patched compiler. Patch by Abe Skolnik and Sebastian Pop. llvm-svn: 289399
Diffstat (limited to 'llvm/lib/CodeGen/MachineCombiner.cpp')
-rw-r--r--llvm/lib/CodeGen/MachineCombiner.cpp11
1 files changed, 9 insertions, 2 deletions
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index b4e217c4815..54c4741e6fa 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -71,6 +71,7 @@ private:
improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern);
bool preservesResourceLen(MachineBasicBlock *MBB,
@@ -242,6 +243,7 @@ bool MachineCombiner::improvesCriticalPathLen(
MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern) {
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
@@ -269,8 +271,13 @@ bool MachineCombiner::improvesCriticalPathLen(
// A more flexible cost calculation for the critical path includes the slack
// of the original code sequence. This may allow the transform to proceed
// even if the instruction depths (data dependency cycles) become worse.
+
unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
- unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+ unsigned RootLatency = 0;
+
+ for (auto I : DelInstrs)
+ RootLatency += TSchedModel.computeInstrLatency(I);
+
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
@@ -421,7 +428,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
// resource pressure.
if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
- InstrIdxForVirtReg, P) &&
+ DelInstrs, InstrIdxForVirtReg, P) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
for (auto *InstrPtr : InsInstrs)
MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
OpenPOWER on IntegriCloud