| author | Hal Finkel <hfinkel@anl.gov> | 2016-06-23 13:46:39 +0000 |
|---|---|---|
| committer | Hal Finkel <hfinkel@anl.gov> | 2016-06-23 13:46:39 +0000 |
| commit | a1271036c59a8da7fa3d13cdb1101441166bc4f9 (patch) | |
| tree | bb27eb4a4350788587db8f9fa9b9c27488630c16 /llvm/lib | |
| parent | 6e9e88b30aeca4239dd333c4b04b638973150f79 (diff) | |
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination could already remove a small store rendered unnecessary by
a later, larger one, but it could not remove a larger store rendered unnecessary
by a series of later, smaller ones. This change adds that capability.
It works by keeping a map, used as an effective interval map, for each store
that is later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map comes to form an interval covering the entire earlier store,
then that store is dead and can be removed. The map acts as an interval map by
mapping the ending offset of each interval to its beginning offset.
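As a rough, standalone sketch of that bookkeeping (the names OverlapIntervalsTy,
addLaterInterval, and coversEarlierStore are illustrative and not part of the
patch; this mirrors the idea rather than the exact code below):

// Illustrative sketch only; names are hypothetical and not from the patch.
#include <algorithm>
#include <cstdint>
#include <map>

// Key = ending offset (half-open), value = starting offset of the interval.
typedef std::map<int64_t, int64_t> OverlapIntervalsTy;

// Record one later partial overwrite, merging it with any interval already in
// the map that it overlaps or abuts.
void addLaterInterval(OverlapIntervalsTy &IM, int64_t Start, int64_t End) {
  // Intervals ending before Start cannot touch [Start, End); begin with the
  // first interval whose end is at or after Start.
  auto It = IM.lower_bound(Start);
  while (It != IM.end() && It->second <= End) {
    // Grow [Start, End) to cover the existing interval and drop it.
    Start = std::min(Start, It->second);
    End = std::max(End, It->first);
    It = IM.erase(It);
  }
  IM[End] = Start;
}

// The earlier store [EarlierOff, EarlierOff + EarlierSize) is dead once a
// single merged interval spans all of it.
bool coversEarlierStore(const OverlapIntervalsTy &IM, int64_t EarlierOff,
                        int64_t EarlierSize) {
  auto It = IM.begin();
  return It != IM.end() && It->second <= EarlierOff &&
         It->first >= EarlierOff + EarlierSize;
}

Keying on the ending offset means a lower_bound on the new interval's start
skips every interval that ends strictly before it, so each later store is
folded in using only ordered-map operations and no extra aliasing queries.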
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
  return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
The problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimization of the other uses of that i64 value.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
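For illustration only (none of this is in the patch, and the helper names are
made up), such a byte-granularity special case for small earlier stores might
look roughly like the sketch below; an in-tree version would presumably use one
of LLVM's bit-vector classes rather than std::vector<bool>:

// Hypothetical sketch of the bit-vector idea mentioned above; not in the patch.
// Offsets here are relative to the start of the earlier store.
#include <algorithm>
#include <cstdint>
#include <vector>

// Mark the bytes of the earlier store [0, Covered.size()) overwritten by a
// later store [LaterStart, LaterEnd).
void markOverwrittenBytes(std::vector<bool> &Covered, int64_t LaterStart,
                          int64_t LaterEnd) {
  int64_t Begin = std::max<int64_t>(LaterStart, 0);
  int64_t End = std::min<int64_t>(LaterEnd, Covered.size());
  for (int64_t I = Begin; I < End; ++I)
    Covered[I] = true;
}

// The earlier store is dead once every one of its bytes has been overwritten.
bool earlierStoreIsDead(const std::vector<bool> &Covered) {
  return std::all_of(Covered.begin(), Covered.end(),
                     [](bool B) { return B; });
}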
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 75 |
1 file changed, 73 insertions, 2 deletions
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index d1687d680d4..22d08e03674 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -16,6 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -34,10 +35,12 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <map>
 using namespace llvm;
 
 #define DEBUG_TYPE "dse"
@@ -45,6 +48,12 @@ using namespace llvm;
 STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
 STATISTIC(NumFastStores, "Number of stores deleted");
 STATISTIC(NumFastOther , "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+  cl::init(true), cl::Hidden,
+  cl::desc("Enable partial-overwrite tracking in DSE"));
 
 
 //===----------------------------------------------------------------------===//
@@ -272,6 +281,9 @@ enum OverwriteResult {
 };
 }
 
+typedef DenseMap<Instruction *,
+                 std::map<int64_t, int64_t>> InstOverlapIntervalsTy;
+
 /// Return 'OverwriteComplete' if a store to the 'Later' location completely
 /// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
 /// the 'Earlier' location is completely overwritten by 'Later',
@@ -281,7 +293,9 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
                                    const MemoryLocation &Earlier,
                                    const DataLayout &DL,
                                    const TargetLibraryInfo &TLI,
-                                   int64_t &EarlierOff, int64_t &LaterOff) {
+                                   int64_t &EarlierOff, int64_t &LaterOff,
+                                   Instruction *DepWrite,
+                                   InstOverlapIntervalsTy &IOL) {
   // If we don't know the sizes of either access, then we can't do a comparison.
   if (Later.Size == MemoryLocation::UnknownSize ||
       Earlier.Size == MemoryLocation::UnknownSize)
@@ -347,6 +361,59 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
       uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
     return OverwriteComplete;
 
+  // We may now overlap, although the overlap is not complete. There might also
+  // be other incomplete overlaps, and together, they might cover the complete
+  // earlier write.
+  // Note: The correctness of this logic depends on the fact that this function
+  // is not even called providing DepWrite when there are any intervening reads.
+  if (EnablePartialOverwriteTracking &&
+      LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+      int64_t(LaterOff + Later.Size) >= EarlierOff) {
+
+    // Insert our part of the overlap into the map.
+    auto &IM = IOL[DepWrite];
+    DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff << ", " <<
+                    int64_t(EarlierOff + Earlier.Size) << ") Later [" <<
+                    LaterOff << ", " << int64_t(LaterOff + Later.Size) << ")\n");
+
+    // Make sure that we only insert non-overlapping intervals and combine
+    // adjacent intervals. The intervals are stored in the map with the ending
+    // offset as the key (in the half-open sense) and the starting offset as
+    // the value.
+    int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + Later.Size;
+
+    // Find any intervals ending at, or after, LaterIntStart which start
+    // before LaterIntEnd.
+    auto ILI = IM.lower_bound(LaterIntStart);
+    if (ILI != IM.end() && ILI->second < LaterIntEnd) {
+      // This existing interval ends in the middle of
+      // [LaterIntStart, LaterIntEnd), erase it adjusting our start.
+      LaterIntStart = std::min(LaterIntStart, ILI->second);
+      LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+      ILI = IM.erase(ILI);
+
+      while (ILI != IM.end() && ILI->first <= LaterIntEnd)
+        ILI = IM.erase(ILI);
+
+      if (ILI != IM.end() && ILI->second < LaterIntEnd)
+        LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+    }
+
+    IM[LaterIntEnd] = LaterIntStart;
+
+    ILI = IM.begin();
+    if (ILI->second <= EarlierOff &&
+        ILI->first >= int64_t(EarlierOff + Earlier.Size)) {
+      DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier [" <<
+                      EarlierOff << ", " <<
+                      int64_t(EarlierOff + Earlier.Size) <<
+                      ") Composite Later [" <<
+                      ILI->second << ", " << ILI->first << ")\n");
+      ++NumCompletePartials;
+      return OverwriteComplete;
+    }
+  }
+
   // Another interesting case is if the later store overwrites the end of the
   // earlier store.
   //
@@ -737,6 +804,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
   const DataLayout &DL = BB.getModule()->getDataLayout();
   bool MadeChange = false;
 
+  // A map of interval maps representing partially-overwritten value parts.
+  InstOverlapIntervalsTy IOL;
+
   // Do a top-down walk on the BB.
   for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
     Instruction *Inst = &*BBI++;
@@ -838,7 +908,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
           !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
         int64_t InstWriteOffset, DepWriteOffset;
         OverwriteResult OR =
-            isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
+            isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
+                        DepWrite, IOL);
         if (OR == OverwriteComplete) {
           DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *DepWrite
                        << "\n    KILLER: " << *Inst << '\n');

