| author | Clement Courbet <courbet@google.com> | 2017-09-01 10:56:34 +0000 |
|---|---|---|
| committer | Clement Courbet <courbet@google.com> | 2017-09-01 10:56:34 +0000 |
| commit | 65130e2d8daaaf8dd6646f7840f27fac6709d5cf (patch) | |
| tree | 55e7b0e5dbc2ace098b8aa02e4274a93803eedb0 | |
| parent | d771f6cb16043afb97e2c4b3a6a02ff5aa963923 (diff) | |
| download | bcm5719-llvm-65130e2d8daaaf8dd6646f7840f27fac6709d5cf.tar.gz, bcm5719-llvm-65130e2d8daaaf8dd6646f7840f27fac6709d5cf.zip | |
Reland rL312315: [MergeICmps] MergeICmps is a new optimization pass that turns chains of integer comparisons into memcmp.
Add missing header.
This reverts commit 86dd6335cf7607af22f383a9a8e072ba929848cf.
llvm-svn: 312322
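
For context, the pattern the pass targets is the comparison chain that Clang emits for member-wise `operator==` (see the `pair-int32-int32.ll` test in this commit). A minimal C++ sketch of that source pattern; the struct and names here are illustrative, not part of the commit:

```cpp
// Illustrative source pattern for MergeICmps. Clang lowers the short-circuit
// "&&" below into two basic blocks, each loading and comparing one i32 field.
// With the new pass enabled, the two 4-byte compares become a single 8-byte
// memcmp, which the backend can later expand into efficient hardware compares.
struct Pair {
  int first;
  int second;
};

bool operator==(const Pair &a, const Pair &b) {
  return a.first == b.first && a.second == b.second;
}
```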
| -rw-r--r-- | llvm/include/llvm/InitializePasses.h | 1 |
| -rw-r--r-- | llvm/include/llvm/LinkAllPasses.h | 1 |
| -rw-r--r-- | llvm/include/llvm/Transforms/Scalar.h | 6 |
| -rw-r--r-- | llvm/lib/CodeGen/TargetPassConfig.cpp | 8 |
| -rw-r--r-- | llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 |
| -rw-r--r-- | llvm/lib/Transforms/Scalar/MergeICmps.cpp | 645 |
| -rw-r--r-- | llvm/lib/Transforms/Scalar/Scalar.cpp | 1 |
| -rw-r--r-- | llvm/test/Transforms/MergeICmps/pair-int32-int32.ll | 87 |
| -rw-r--r-- | llvm/test/Transforms/MergeICmps/tuple-four-int8.ll | 73 |
| -rw-r--r-- | llvm/test/Transforms/MergeICmps/volatile.ll | 30 |
10 files changed, 853 insertions, 0 deletions
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 39ac4649b70..7c262f8d647 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -256,6 +256,7 @@ void initializeMemorySSAPrinterLegacyPassPass(PassRegistry&);  void initializeMemorySSAWrapperPassPass(PassRegistry&);  void initializeMemorySanitizerPass(PassRegistry&);  void initializeMergeFunctionsPass(PassRegistry&); +void initializeMergeICmpsPass(PassRegistry&);  void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);  void initializeMetaRenamerPass(PassRegistry&);  void initializeModuleDebugInfoPrinterPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index d07c15c1013..29314617177 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -179,6 +179,7 @@ namespace {        (void) llvm::createPostOrderFunctionAttrsLegacyPass();        (void) llvm::createReversePostOrderFunctionAttrsPass();        (void) llvm::createMergeFunctionsPass(); +      (void) llvm::createMergeICmpsPass();        std::string buf;        llvm::raw_string_ostream os(buf);        (void) llvm::createPrintModulePass(os); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 7bc522a9653..ee1c3f7d402 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -422,6 +422,12 @@ Pass *createLowerGuardIntrinsicPass();  //===----------------------------------------------------------------------===//  // +// MergeICmps - Merge integer comparison chains +// +Pass *createMergeICmpsPass(); + +//===----------------------------------------------------------------------===// +//  // ValuePropagation - Propagate CFG-derived value information  //  Pass *createCorrelatedValuePropagationPass(); diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 481baea2dff..329768769f1 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -94,6 +94,10 @@ static cl::opt<bool> EnableImplicitNullChecks(      "enable-implicit-null-checks",      cl::desc("Fold null checks into faulting memory operations"),      cl::init(false)); +static cl::opt<bool> EnableMergeICmps( +    "enable-mergeicmps", +    cl::desc("Merge ICmp chains into a single memcmp"), +    cl::init(false));  static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,      cl::desc("Print LLVM IR produced by the loop-reduce pass"));  static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden, @@ -591,6 +595,10 @@ void TargetPassConfig::addIRPasses() {        addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));    } +  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) { +    addPass(createMergeICmpsPass()); +  } +    // Run GC lowering passes for builtin collectors    // TODO: add a pass insertion point here    addPass(createGCLoweringPass()); diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index 457c9427ab9..35683d9c369 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -42,6 +42,7 @@ add_llvm_library(LLVMScalarOpts    LowerExpectIntrinsic.cpp    LowerGuardIntrinsic.cpp    MemCpyOptimizer.cpp +  MergeICmps.cpp    MergedLoadStoreMotion.cpp    NaryReassociate.cpp    NewGVN.cpp diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp 
b/llvm/lib/Transforms/Scalar/MergeICmps.cpp new file mode 100644 index 00000000000..3bd24eb8601 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -0,0 +1,645 @@ +//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass turns chains of integer comparisons into memcmp (the memcmp is +// later typically inlined as a chain of efficient hardware comparisons). This +// typically benefits c++ member or nonmember operator==(). +// +// The basic idea is to replace a larger chain of integer comparisons loaded +// from contiguous memory locations into a smaller chain of such integer +// comparisons. Benefits are double: +//  - There are less jumps, and therefore less opportunities for mispredictions +//    and I-cache misses. +//  - Code size is smaller, both because jumps are removed and because the +//    encoding of a 2*n byte compare is smaller than that of two n-byte +//    compares. + +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/APSInt.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include <algorithm> +#include <numeric> +#include <utility> +#include <vector> + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "mergeicmps" + +#define MERGEICMPS_DOT_ON + +// A BCE atom. +struct BCEAtom { +  const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; } + +  bool operator<(const BCEAtom &O) const { +    return Base() == O.Base() ? Offset.slt(O.Offset) : Base() < O.Base(); +  } + +  GetElementPtrInst *GEP = nullptr; +  LoadInst *LoadI = nullptr; +  APInt Offset; +}; + +// If this value is a load from a constant offset w.r.t. a base address, and +// there are no othe rusers of the load or address, returns the base address and +// the offset. +BCEAtom visitICmpLoadOperand(Value *const Val) { +  BCEAtom Result; +  if (auto *const LoadI = dyn_cast<LoadInst>(Val)) { +    DEBUG(dbgs() << "load\n"); +    if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) { +      DEBUG(dbgs() << "used outside of block\n"); +      return {}; +    } +    if (LoadI->isVolatile()) { +      DEBUG(dbgs() << "volatile\n"); +      return {}; +    } +    Value *const Addr = LoadI->getOperand(0); +    if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) { +      DEBUG(dbgs() << "GEP\n"); +      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) { +        DEBUG(dbgs() << "used outside of block\n"); +        return {}; +      } +      const auto &DL = GEP->getModule()->getDataLayout(); +      if (!isDereferenceablePointer(GEP, DL)) { +        DEBUG(dbgs() << "not dereferenceable\n"); +        // We need to make sure that we can do comparison in any order, so we +        // require memory to be unconditionnally dereferencable. 
+        return {}; +      } +      Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0); +      if (GEP->accumulateConstantOffset(DL, Result.Offset)) { +        Result.GEP = GEP; +        Result.LoadI = LoadI; +      } +    } +  } +  return Result; +} + +// A basic block with a comparison between two BCE atoms. +// Note: the terminology is misleading: the comparison is symmetric, so there +// is no real {l/r}hs. To break the symmetry, we use the smallest atom as Lhs. +class BCECmpBlock { + public: +  BCECmpBlock() {} + +  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits) +      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) { +    if (Rhs_ < Lhs_) +      std::swap(Rhs_, Lhs_); +  } + +  bool IsValid() const { +    return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr; +  } + +  // Assert the the block is consistent: If valid, it should also have +  // non-null members besides Lhs_ and Rhs_. +  void AssertConsistent() const { +    if (IsValid()) { +      assert(BB); +      assert(CmpI); +      assert(BranchI); +    } +  } + +  const BCEAtom &Lhs() const { return Lhs_; } +  const BCEAtom &Rhs() const { return Rhs_; } +  int SizeBits() const { return SizeBits_; } + +  // Returns true if the block does other works besides comparison. +  bool doesOtherWork() const; + +  // The basic block where this comparison happens. +  BasicBlock *BB = nullptr; +  // The ICMP for this comparison. +  ICmpInst *CmpI = nullptr; +  // The terminating branch. +  BranchInst *BranchI = nullptr; + + private: +  BCEAtom Lhs_; +  BCEAtom Rhs_; +  int SizeBits_ = 0; +}; + +bool BCECmpBlock::doesOtherWork() const { +  AssertConsistent(); +  // TODO(courbet): Can we allow some other things ? This is very conservative. +  // We might be able to get away with anything does does not have any side +  // effects outside of the basic block. +  // Note: The GEPs and/or loads are not necessarily in the same block. +  for (const Instruction &Inst : *BB) { +    if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) { +      if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) +        return true; +    } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) { +      if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) +        return true; +    } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) { +      if (C != CmpI) +        return true; +    } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) { +      if (Br != BranchI) +        return true; +    } else { +      return true; +    } +  } +  return false; +} + +// Visit the given comparison. If this is a comparison between two valid +// BCE atoms, returns the comparison. +BCECmpBlock visitICmp(const ICmpInst *const CmpI, +                      const ICmpInst::Predicate ExpectedPredicate) { +  if (CmpI->getPredicate() == ExpectedPredicate) { +    DEBUG(dbgs() << "cmp " +                 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne") +                 << "\n"); +    auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0)); +    if (!Lhs.Base()) +      return {}; +    auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1)); +    if (!Rhs.Base()) +      return {}; +    return BCECmpBlock(std::move(Lhs), std::move(Rhs), +                       CmpI->getOperand(0)->getType()->getScalarSizeInBits()); +  } +  return {}; +} + +// Visit the given comparison block. If this is a comparison between two valid +// BCE atoms, returns the comparison. 
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block, +                          const BasicBlock *const PhiBlock) { +  if (Block->empty()) +    return {}; +  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator()); +  if (!BranchI) +    return {}; +  DEBUG(dbgs() << "branch\n"); +  if (BranchI->isUnconditional()) { +    // In this case, we expect an incoming value which is the result of the +    // comparison. This is the last link in the chain of comparisons (note +    // that this does not mean that this is the last incoming value, blocks +    // can be reordered). +    auto *const CmpI = dyn_cast<ICmpInst>(Val); +    if (!CmpI) +      return {}; +    DEBUG(dbgs() << "icmp\n"); +    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ); +    Result.CmpI = CmpI; +    Result.BranchI = BranchI; +    return Result; +  } else { +    // In this case, we expect a constant incoming value (the comparison is +    // chained). +    const auto *const Const = dyn_cast<ConstantInt>(Val); +    DEBUG(dbgs() << "const\n"); +    if (!Const->isZero()) +      return {}; +    DEBUG(dbgs() << "false\n"); +    auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition()); +    if (!CmpI) +      return {}; +    DEBUG(dbgs() << "icmp\n"); +    assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch"); +    BasicBlock *const FalseBlock = BranchI->getSuccessor(1); +    auto Result = visitICmp( +        CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE); +    Result.CmpI = CmpI; +    Result.BranchI = BranchI; +    return Result; +  } +  return {}; +} + +// A chain of comparisons. +class BCECmpChain { + public: +  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi); + +  int size() const { return Comparisons_.size(); } + +#ifdef MERGEICMPS_DOT_ON +  void dump() const; +#endif  // MERGEICMPS_DOT_ON + +  bool simplify(const TargetLibraryInfo *const TLI); + + private: +  static bool IsContiguous(const BCECmpBlock &First, +                           const BCECmpBlock &Second) { +    return First.Lhs().Base() == Second.Lhs().Base() && +           First.Rhs().Base() == Second.Rhs().Base() && +           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset && +           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset; +  } + +  // Merges the given comparison blocks into one memcmp block and update +  // branches. Comparisons are assumed to be continguous. If NextBBInChain is +  // null, the merged block will link to the phi block. +  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, +                               BasicBlock *const NextBBInChain, PHINode &Phi, +                               const TargetLibraryInfo *const TLI); + +  PHINode &Phi_; +  std::vector<BCECmpBlock> Comparisons_; +  // The original entry block (before sorting); +  BasicBlock *EntryBlock_; +}; + +BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi) +    : Phi_(Phi) { +  // Now look inside blocks to check for BCE comparisons. 
+  std::vector<BCECmpBlock> Comparisons; +  for (BasicBlock *Block : Blocks) { +    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block), +                                           Block, Phi.getParent()); +    Comparison.BB = Block; +    if (!Comparison.IsValid()) { +      DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n"); +      return; +    } +    if (Comparison.doesOtherWork()) { +      DEBUG(dbgs() << "block does extra work besides compare\n"); +      if (Comparisons.empty()) {  // First block. +        // TODO(courbet): The first block can do other things, and we should +        // split them apart in a separate block before the comparison chain. +        // Right now we just discard it and make the chain shorter. +        DEBUG(dbgs() +              << "ignoring first block that does extra work besides compare\n"); +        continue; +      } +      // TODO(courbet): Right now we abort the whole chain. We could be +      // merging only the blocks that don't do other work and resume the +      // chain from there. For example: +      //  if (a[0] == b[0]) {  // bb1 +      //    if (a[1] == b[1]) {  // bb2 +      //      some_value = 3; //bb3 +      //      if (a[2] == b[2]) { //bb3 +      //        do a ton of stuff  //bb4 +      //      } +      //    } +      //  } +      // +      // This is: +      // +      // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+ +      //  \            \           \               \ +      //   ne           ne          ne              \ +      //    \            \           \               v +      //     +------------+-----------+----------> bb_phi +      // +      // We can only merge the first two comparisons, because bb3* does +      // "other work" (setting some_value to 3). +      // We could still merge bb1 and bb2 though. +      return; +    } +    DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits() +                 << " bits between " << Comparison.Lhs().Base() << " + " +                 << Comparison.Lhs().Offset << " and " +                 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset +                 << "\n"); +    DEBUG(dbgs() << "\n"); +    Comparisons.push_back(Comparison); +  } +  EntryBlock_ = Comparisons[0].BB; +  Comparisons_ = std::move(Comparisons); +#ifdef MERGEICMPS_DOT_ON +  errs() << "BEFORE REORDERING:\n\n"; +  dump(); +#endif  // MERGEICMPS_DOT_ON +  // Reorder blocks by LHS. We can do that without changing the +  // semantics because we are only accessing dereferencable memory. 
+  std::sort(Comparisons_.begin(), Comparisons_.end(), +            [](const BCECmpBlock &a, const BCECmpBlock &b) { +              return a.Lhs() < b.Lhs(); +            }); +#ifdef MERGEICMPS_DOT_ON +  errs() << "AFTER REORDERING:\n\n"; +  dump(); +#endif  // MERGEICMPS_DOT_ON +} + +#ifdef MERGEICMPS_DOT_ON +void BCECmpChain::dump() const { +  errs() << "digraph dag {\n"; +  errs() << " graph [bgcolor=transparent];\n"; +  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n"; +  errs() << " edge [color=black];\n"; +  for (size_t I = 0; I < Comparisons_.size(); ++I) { +    const auto &Comparison = Comparisons_[I]; +    errs() << " \"" << I << "\" [label=\"%" +           << Comparison.Lhs().Base()->getName() << " + " +           << Comparison.Lhs().Offset << " == %" +           << Comparison.Rhs().Base()->getName() << " + " +           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8) +           << " bytes)\"];\n"; +    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB); +    if (I > 0) +      errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n"; +    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n"; +  } +  errs() << " \"Phi\" [label=\"Phi\"];\n"; +  errs() << "}\n\n"; +} +#endif  // MERGEICMPS_DOT_ON + +bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) { +  // First pass to check if there is at least one merge. If not, we don't do +  // anything and we keep analysis passes intact. +  { +    bool AtLeastOneMerged = false; +    for (size_t I = 1; I < Comparisons_.size(); ++I) { +      if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { +        AtLeastOneMerged = true; +        break; +      } +    } +    if (!AtLeastOneMerged) +      return false; +  } + +  // Remove phi references to comparison blocks, they will be rebuilt as we +  // merge the blocks. +  for (const auto &Comparison : Comparisons_) { +    Phi_.removeIncomingValue(Comparison.BB, false); +  } + +  // Point the predecessors of the chain to the first comparison block (which is +  // the new entry point). +  if (EntryBlock_ != Comparisons_[0].BB) +    EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + +  // Effectively merge blocks. +  int NumMerged = 1; +  for (size_t I = 1; I < Comparisons_.size(); ++I) { +    if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { +      ++NumMerged; +    } else { +      // Merge all previous comparisons and start a new merge block. 
+      mergeComparisons( +          makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged), +          Comparisons_[I].BB, Phi_, TLI); +      NumMerged = 1; +    } +  } +  mergeComparisons(makeArrayRef(Comparisons_) +                       .slice(Comparisons_.size() - NumMerged, NumMerged), +                   nullptr, Phi_, TLI); + +  return true; +} + +void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, +                                   BasicBlock *const NextBBInChain, +                                   PHINode &Phi, +                                   const TargetLibraryInfo *const TLI) { +  assert(!Comparisons.empty()); +  const auto &FirstComparison = *Comparisons.begin(); +  BasicBlock *const BB = FirstComparison.BB; +  LLVMContext &Context = BB->getContext(); + +  if (Comparisons.size() >= 2) { +    DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); +    const auto TotalSize = +        std::accumulate(Comparisons.begin(), Comparisons.end(), 0, +                        [](int Size, const BCECmpBlock &C) { +                          return Size + C.SizeBits(); +                        }) / +        8; + +    // Incoming edges do not need to be updated, and both GEPs are already +    // computing the right address, we just need to: +    //   - replace the two loads and the icmp with the memcmp +    //   - update the branch +    //   - update the incoming values in the phi. +    FirstComparison.BranchI->eraseFromParent(); +    FirstComparison.CmpI->eraseFromParent(); +    FirstComparison.Lhs().LoadI->eraseFromParent(); +    FirstComparison.Rhs().LoadI->eraseFromParent(); + +    IRBuilder<> Builder(BB); +    const auto &DL = Phi.getModule()->getDataLayout(); +    Value *const MemCmpCall = +        emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, +                   ConstantInt::get(DL.getIntPtrType(Context), TotalSize), +                   Builder, DL, TLI); +    Value *const MemCmpIsZero = Builder.CreateICmpEQ( +        MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); + +    // Add a branch to the next basic block in the chain. +    if (NextBBInChain) { +      Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent()); +      Phi.addIncoming(ConstantInt::getFalse(Context), BB); +    } else { +      Builder.CreateBr(Phi.getParent()); +      Phi.addIncoming(MemCmpIsZero, BB); +    } + +    // Delete merged blocks. +    for (size_t I = 1; I < Comparisons.size(); ++I) { +      BasicBlock *CBB = Comparisons[I].BB; +      CBB->replaceAllUsesWith(BB); +      CBB->eraseFromParent(); +    } +  } else { +    assert(Comparisons.size() == 1); +    // There are no blocks to merge, but we still need to update the branches. +    DEBUG(dbgs() << "Only one comparison, updating branches\n"); +    if (NextBBInChain) { +      if (FirstComparison.BranchI->isConditional()) { +        DEBUG(dbgs() << "conditional -> conditional\n"); +        // Just update the "true" target, the "false" target should already be +        // the phi block. +        assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent()); +        FirstComparison.BranchI->setSuccessor(0, NextBBInChain); +        Phi.addIncoming(ConstantInt::getFalse(Context), BB); +      } else { +        DEBUG(dbgs() << "unconditional -> conditional\n"); +        // Replace the unconditional branch by a conditional one. 
+        FirstComparison.BranchI->eraseFromParent(); +        IRBuilder<> Builder(BB); +        Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain, +                             Phi.getParent()); +        Phi.addIncoming(FirstComparison.CmpI, BB); +      } +    } else { +      if (FirstComparison.BranchI->isConditional()) { +        DEBUG(dbgs() << "conditional -> unconditional\n"); +        // Replace the conditional branch by an unconditional one. +        FirstComparison.BranchI->eraseFromParent(); +        IRBuilder<> Builder(BB); +        Builder.CreateBr(Phi.getParent()); +        Phi.addIncoming(FirstComparison.CmpI, BB); +      } else { +        DEBUG(dbgs() << "unconditional -> unconditional\n"); +        Phi.addIncoming(FirstComparison.CmpI, BB); +      } +    } +  } +} + +std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, +                                           BasicBlock *const LastBlock, +                                           int NumBlocks) { +  // Walk up from the last block to find other blocks. +  std::vector<BasicBlock *> Blocks(NumBlocks); +  BasicBlock *CurBlock = LastBlock; +  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) { +    if (CurBlock->hasAddressTaken()) { +      // Somebody is jumping to the block through an address, all bets are +      // off. +      DEBUG(dbgs() << "skip: block " << BlockIndex +                   << " has its address taken\n"); +      return {}; +    } +    Blocks[BlockIndex] = CurBlock; +    auto *SinglePredecessor = CurBlock->getSinglePredecessor(); +    if (!SinglePredecessor) { +      // The block has two or more predecessors. +      DEBUG(dbgs() << "skip: block " << BlockIndex +                   << " has two or more predecessors\n"); +      return {}; +    } +    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) { +      // The block does not link back to the phi. +      DEBUG(dbgs() << "skip: block " << BlockIndex +                   << " does not link back to the phi\n"); +      return {}; +    } +    CurBlock = SinglePredecessor; +  } +  Blocks[0] = CurBlock; +  return Blocks; +} + +bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) { +  DEBUG(dbgs() << "processPhi()\n"); +  if (Phi.getNumIncomingValues() <= 1) { +    DEBUG(dbgs() << "skip: only one incoming value in phi\n"); +    return false; +  } +  // We are looking for something that has the following structure: +  //   bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+ +  //     \            \           \               \ +  //      ne           ne          ne              \ +  //       \            \           \               v +  //        +------------+-----------+----------> bb_phi +  // +  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi. +  //    It's the only block that contributes a non-constant value to the Phi. +  //  - All other blocks (b1, b2, b3) must have exactly two successors, one of +  //    them being the the phi block. +  //  - All intermediate blocks (bb2, bb3) must have only one predecessor. +  //  - Blocks cannot do other work besides the comparison, see doesOtherWork() + +  // The blocks are not necessarily ordered in the phi, so we start from the +  // last block and reconstruct the order. +  BasicBlock *LastBlock = nullptr; +  for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) { +    if (isa<ConstantInt>(Phi.getIncomingValue(I))) +      continue; +    if (LastBlock) { +      // There are several non-constant values. 
+      DEBUG(dbgs() << "skip: several non-constant values\n"); +      return false; +    } +    LastBlock = Phi.getIncomingBlock(I); +  } +  if (!LastBlock) { +    // There is no non-constant block. +    DEBUG(dbgs() << "skip: no non-constant block\n"); +    return false; +  } +  if (LastBlock->getSingleSuccessor() != Phi.getParent()) { +    DEBUG(dbgs() << "skip: last block non-phi successor\n"); +    return false; +  } + +  const auto Blocks = +      getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues()); +  if (Blocks.empty()) +    return false; +  BCECmpChain CmpChain(Blocks, Phi); + +  if (CmpChain.size() < 2) { +    DEBUG(dbgs() << "skip: only one compare block\n"); +    return false; +  } + +  return CmpChain.simplify(TLI); +} + +class MergeICmps : public FunctionPass { + public: +  static char ID; + +  MergeICmps() : FunctionPass(ID) { +    initializeMergeICmpsPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnFunction(Function &F) override { +    if (skipFunction(F)) return false; +    const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); +    auto PA = runImpl(F, &TLI); +    return !PA.areAllPreserved(); +  } + + private: +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<TargetLibraryInfoWrapperPass>(); +  } + +  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI); +}; + +PreservedAnalyses MergeICmps::runImpl(Function &F, +                                      const TargetLibraryInfo *TLI) { +  DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n"); + +  bool MadeChange = false; + +  for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) { +    // A Phi operation is always first in a basic block. +    if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin())) +      MadeChange |= processPhi(*Phi, TLI); +  } + +  if (MadeChange) +    return PreservedAnalyses::none(); +  return PreservedAnalyses::all(); +} + +}  // namespace + +char MergeICmps::ID = 0; +INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps", +                      "Merge contiguous icmps into a memcmp", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(MergeICmps, "mergeicmps", +                    "Merge contiguous icmps into a memcmp", false, false) + +Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); } + diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index ce6f93eb0c1..d41fe6a3ba8 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -73,6 +73,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {    initializeLowerExpectIntrinsicPass(Registry);    initializeLowerGuardIntrinsicLegacyPassPass(Registry);    initializeMemCpyOptLegacyPassPass(Registry); +  initializeMergeICmpsPass(Registry);    initializeMergedLoadStoreMotionLegacyPassPass(Registry);    initializeNaryReassociateLegacyPassPass(Registry);    initializePartiallyInlineLibCallsLegacyPassPass(Registry); diff --git a/llvm/test/Transforms/MergeICmps/pair-int32-int32.ll b/llvm/test/Transforms/MergeICmps/pair-int32-int32.ll new file mode 100644 index 00000000000..351cb2adedf --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/pair-int32-int32.ll @@ -0,0 +1,87 @@ +; RUN: opt -mergeicmps -S -o - %s | FileCheck %s + +%"struct.std::pair" = type { i32, i32 } + +define zeroext i1 @opeq1( +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 
{ +entry: +  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 +  %0 = load i32, i32* %first.i, align 4 +  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 +  %1 = load i32, i32* %first1.i, align 4 +  %cmp.i = icmp eq i32 %0, %1 +  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: +  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 +  %2 = load i32, i32* %second.i, align 4 +  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 +  %3 = load i32, i32* %second2.i, align 4 +  %cmp3.i = icmp eq i32 %2, %3 +  br label %opeq1.exit + +opeq1.exit: +  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] +  ret i1 %4 +; CHECK-LABEL: @opeq1( +; The entry block with zero-offset GEPs is kept, loads are removed. +; CHECK: entry +; CHECK:     getelementptr {{.*}} i32 0 +; CHECK-NOT: load +; CHECK:     getelementptr {{.*}} i32 0 +; CHECK-NOT: load +; The two 4 byte loads and compares are replaced with a single 8-byte memcmp. +; CHECK:     @memcmp({{.*}}8) +; CHECK:     icmp eq {{.*}} 0 +; The branch is now a direct branch; the other block has been removed. +; CHECK:     br label %opeq1.exit +; CHECK-NOT: br +; The phi is updated. +; CHECK:      phi i1 [ %{{[^,]*}}, %entry ] +; CHECK-NEXT: ret +} + +; Same as above, but the two blocks are in inverse order. +define zeroext i1 @opeq1_inverse( +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { +entry: +  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 +  %0 = load i32, i32* %first.i, align 4 +  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 +  %1 = load i32, i32* %first1.i, align 4 +  %cmp.i = icmp eq i32 %0, %1 +  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: +  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 +  %2 = load i32, i32* %second.i, align 4 +  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 +  %3 = load i32, i32* %second2.i, align 4 +  %cmp3.i = icmp eq i32 %2, %3 +  br label %opeq1.exit + +opeq1.exit: +  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] +  ret i1 %4 +; CHECK-LABEL: @opeq1_inverse( +; The second block with zero-offset GEPs is kept, loads are removed. +; CHECK: land.rhs.i +; CHECK:     getelementptr {{.*}} i32 0 +; CHECK-NOT: load +; CHECK:     getelementptr {{.*}} i32 0 +; CHECK-NOT: load +; The two 4 byte loads and compares are replaced with a single 8-byte memcmp. +; CHECK:     @memcmp({{.*}}8) +; CHECK:     icmp eq {{.*}} 0 +; The branch is now a direct branch; the other block has been removed. +; CHECK:     br label %opeq1.exit +; CHECK-NOT: br +; The phi is updated. +; CHECK:      phi i1 [ %{{[^,]*}}, %land.rhs.i ] +; CHECK-NEXT: ret +} + + + diff --git a/llvm/test/Transforms/MergeICmps/tuple-four-int8.ll b/llvm/test/Transforms/MergeICmps/tuple-four-int8.ll new file mode 100644 index 00000000000..f5e2ab57e04 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/tuple-four-int8.ll @@ -0,0 +1,73 @@ +; RUN: opt -mergeicmps -S -o - %s | FileCheck %s + +; This is a more involved test: clang generates this weird pattern for +; tuple<uint8_t, uint8_t, uint8_t, uint8_t>. 
Right now we skip the entry block +; (which defines the base pointer for other blocks) and the last one (which +; does not have the expected structure). Only middle blocks (bytes [1,2]) are +; merged. + +%"class.std::tuple" = type { %"struct.std::_Tuple_impl" } +%"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" } +%"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" } +%"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" } +%"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" } +%"struct.std::_Head_base" = type { i8 } +%"struct.std::_Head_base.4" = type { i8 } +%"struct.std::_Head_base.5" = type { i8 } +%"struct.std::_Head_base.6" = type { i8 } + +define zeroext i1 @opeq( +    %"class.std::tuple"* nocapture readonly dereferenceable(4) %a, +    %"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 { +entry: +  %0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 +  %add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3 +  %1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1 +  %2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 +  %add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3 +  %3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1 +  %cmp.i.i = icmp eq i8 %1, %3 +  br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit + +land.rhs.i.i: +  %add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2 +  %4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1 +  %add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2 +  %5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1 +  %cmp.i.i.i = icmp eq i8 %4, %5 +  br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit + +land.rhs.i.i.i: +  %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1 +  %6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1 +  %add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1 +  %7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1 +  %cmp.i.i.i.i = icmp eq i8 %6, %7 +  br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit + +land.rhs.i.i.i.i: +  %8 = load i8, i8* %0, align 1 +  %9 = load i8, i8* %2, align 1 +  %cmp.i.i.i.i.i = icmp eq i8 %8, %9 +  br label %opeq.exit + +opeq.exit: +  %10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ] +  ret i1 %10 +; CHECK-LABEL: @opeq( +; The entry block is kept as is, but the next block is now the merged comparison +; block for bytes [1,2] or the block for the head. +; CHECK:     entry +; CHECK:     br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit +; The two 1 byte loads and compares at offset 1 are replaced with a single +; 2-byte memcmp. +; CHECK:     land.rhs.i.i.i +; CHECK:     @memcmp({{.*}}2) +; CHECK:     icmp eq {{.*}} 0 +; In the end we have three blocks. 
+; CHECK: phi i1 +; CHECK-SAME %entry +; CHECK-SAME %land.rhs.i.i.i.i +; CHECK-SAME %land.rhs.i.i.i +} + diff --git a/llvm/test/Transforms/MergeICmps/volatile.ll b/llvm/test/Transforms/MergeICmps/volatile.ll new file mode 100644 index 00000000000..1df22575c2c --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/volatile.ll @@ -0,0 +1,30 @@ +; RUN: opt -mergeicmps -S -o - %s | FileCheck %s + +%"struct.std::pair" = type { i32, i32 } + +define zeroext i1 @opeq( +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, +    %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { +entry: +  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0 +  %0 = load i32, i32* %first.i, align 4 +  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 +  %1 = load i32, i32* %first1.i, align 4 +  %cmp.i = icmp eq i32 %0, %1 +  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit + +land.rhs.i: +  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 +  %2 = load volatile i32, i32* %second.i, align 4 +  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 +  %3 = load i32, i32* %second2.i, align 4 +  %cmp3.i = icmp eq i32 %2, %3 +  br label %opeq1.exit + +opeq1.exit: +  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ] +  ret i1 %4 +; CHECK-LABEL: @opeq( +; CHECK-NOT: memcmp +} +  | 
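
As the comments in `BCECmpChain` above note, the pass currently aborts the whole chain when any block besides the first does work other than the comparison (`doesOtherWork()`). A hedged C++ sketch of such a chain, mirroring the example in the pass's own comments; the names are illustrative only:

```cpp
// Illustrative only: a chain the pass currently refuses to merge, because the
// middle block also stores to some_value ("other work"). The TODO in
// BCECmpChain notes that the first two compares could still be merged later.
extern int some_value;

bool cmp3(const int a[3], const int b[3]) {
  if (a[0] == b[0]) {       // bb1: pure compare
    if (a[1] == b[1]) {     // bb2: pure compare
      some_value = 3;       // bb3: extra work in the middle of the chain
      if (a[2] == b[2])
        return true;        // bb4
    }
  }
  return false;
}
```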

