Diffstat (limited to 'llvm/lib')
 llvm/lib/Analysis/TargetTransformInfo.cpp          |   6
 llvm/lib/CodeGen/CMakeLists.txt                    |   1
 llvm/lib/CodeGen/CodeGen.cpp                       |   1
 llvm/lib/CodeGen/HardwareLoops.cpp                 | 441
 llvm/lib/Target/PowerPC/PPCCTRLoops.cpp            | 572
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp        |   8
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td           |   2
 llvm/lib/Target/PowerPC/PPCInstrInfo.td            |   2
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp       |   3
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 344
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h   |   5
 11 files changed, 805 insertions(+), 580 deletions(-)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 53dd2bf2304..763b6841878 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -130,6 +130,12 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
   return TTIImpl->isLoweredToCall(F);
 }
 
+bool TargetTransformInfo::isHardwareLoopProfitable(
+    Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
+    TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
+  return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index fedf04270d2..dcbd4e51576 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen
   GCRootLowering.cpp
   GCStrategy.cpp
   GlobalMerge.cpp
+  HardwareLoops.cpp
   IfConversion.cpp
   ImplicitNullChecks.cpp
   IndirectBrExpandPass.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 748db034d27..31b6bb7688a 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -38,6 +38,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeFuncletLayoutPass(Registry);
   initializeGCMachineCodeAnalysisPass(Registry);
   initializeGCModuleInfoPass(Registry);
+  initializeHardwareLoopsPass(Registry);
   initializeIfConverterPass(Registry);
   initializeImplicitNullChecksPass(Registry);
   initializeIndirectBrExpandPassPass(Registry);
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
new file mode 100644
index 00000000000..1d48219029c
--- /dev/null
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -0,0 +1,441 @@
+//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Insert hardware loop intrinsics into loops which are deemed profitable by
+/// the target, by querying TargetTransformInfo. A hardware loop consists of
+/// two intrinsics: one, outside the loop, to set the loop iteration count and
+/// another, in the exit block, to decrement the counter. The decremented value
+/// can either be carried through the loop via a phi or handled in some opaque
+/// way by the target.
+///
+//===----------------------------------------------------------------------===//
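[Editorial aside] To make the two-intrinsic shape described above concrete, here is a minimal sketch of the IR the pass emits when the target handles the counter opaquely. Function and block names are invented; the intrinsic names are the ones this patch introduces, with the usual overloaded-type suffix assumed:

```llvm
define void @example(i32 %n) {
entry:
  ; Communicate the iteration count once, before entering the loop.
  call void @llvm.set.loop.iterations.i32(i32 %n)
  br label %loop

loop:
  ; ... loop body ...
  ; Decrement the opaque counter by 1; the result is true until it hits zero.
  %cont = call i1 @llvm.loop.decrement.i32(i32 1)
  br i1 %cont, label %loop, label %exit

exit:
  ret void
}

declare void @llvm.set.loop.iterations.i32(i32)
declare i1 @llvm.loop.decrement.i32(i32)
```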
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "hardware-loops"
+
+#define HW_LOOPS_NAME "Hardware Loop Insertion"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
+                   cl::desc("Force hardware loops intrinsics to be inserted"));
+
+static cl::opt<bool>
+ForceHardwareLoopPHI(
+  "force-hardware-loop-phi", cl::Hidden, cl::init(false),
+  cl::desc("Force hardware loop counter to be updated through a phi"));
+
+static cl::opt<bool>
+ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
+                cl::desc("Force allowance of nested hardware loops"));
+
+static cl::opt<unsigned>
+LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
+              cl::desc("Set the loop decrement value"));
+
+static cl::opt<unsigned>
+CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
+                cl::desc("Set the loop counter bitwidth"));
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace {
+
+  using TTI = TargetTransformInfo;
+
+  class HardwareLoops : public FunctionPass {
+  public:
+    static char ID;
+
+    HardwareLoops() : FunctionPass(ID) {
+      initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
+    }
+
+    // Try to convert the given Loop into a hardware loop.
+    bool TryConvertLoop(Loop *L);
+
+    // Given that the target believes the loop to be profitable, try to
+    // convert it.
+    bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);
+
+  private:
+    ScalarEvolution *SE = nullptr;
+    LoopInfo *LI = nullptr;
+    const DataLayout *DL = nullptr;
+    const TargetTransformInfo *TTI = nullptr;
+    DominatorTree *DT = nullptr;
+    bool PreserveLCSSA = false;
+    AssumptionCache *AC = nullptr;
+    TargetLibraryInfo *LibInfo = nullptr;
+    Module *M = nullptr;
+    bool MadeChange = false;
+  };
+
+  class HardwareLoop {
+    // Expand the trip count scev into a value that we can use.
+    Value *InitLoopCount(BasicBlock *BB);
+
+    // Insert the set_loop_iteration intrinsic.
+    void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
+
+    // Insert the loop_decrement intrinsic.
+    void InsertLoopDec();
+
+    // Insert the loop_decrement_reg intrinsic.
+    Instruction *InsertLoopRegDec(Value *EltsRem);
+
+    // If the target requires the counter value to be updated in the loop,
+    // insert a phi to hold the value. The intended purpose is for use by
+    // loop_decrement_reg.
+    PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
+
+    // Create a new cmp, that checks the returned value of loop_decrement*,
+    // and update the exit branch to use it.
+    void UpdateBranch(Value *EltsRem);
+
+  public:
+    HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
+                 const DataLayout &DL) :
+      SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
+      ExitCount(Info.ExitCount),
+      CountType(Info.CountType),
+      ExitBranch(Info.ExitBranch),
+      LoopDecrement(Info.LoopDecrement),
+      UsePHICounter(Info.CounterInReg) { }
+
+    void Create();
+
+  private:
+    ScalarEvolution &SE;
+    const DataLayout &DL;
+    Loop *L = nullptr;
+    Module *M = nullptr;
+    const SCEV *ExitCount = nullptr;
+    Type *CountType = nullptr;
+    BranchInst *ExitBranch = nullptr;
+    Value *LoopDecrement = nullptr;
+    bool UsePHICounter = false;
+  };
+}
+
+char HardwareLoops::ID = 0;
+
+bool HardwareLoops::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DL = &F.getParent()->getDataLayout();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  M = F.getParent();
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      TryConvertLoop(L);
+  }
+
+  return MadeChange;
+}
+
+// Return true if the search should stop, which will be when an inner loop is
+// converted and the parent loop doesn't support containing a hardware loop.
+bool HardwareLoops::TryConvertLoop(Loop *L) {
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    if (TryConvertLoop(*I))
+      return true; // Stop search.
+
+  // Bail out if the loop has irreducible control flow.
+  LoopBlocksRPO RPOT(L);
+  RPOT.perform(LI);
+  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
+    return false;
+
+  TTI::HardwareLoopInfo HWLoopInfo(L);
+  if (TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) ||
+      ForceHardwareLoops) {
+
+    // Allow overriding of the counter width and loop decrement value.
+    if (CounterBitWidth.getNumOccurrences())
+      HWLoopInfo.CountType =
+        IntegerType::get(M->getContext(), CounterBitWidth);
+
+    if (LoopDecrement.getNumOccurrences())
+      HWLoopInfo.LoopDecrement =
+        ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
+
+    MadeChange |= TryConvertLoop(HWLoopInfo);
+    return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
+  }
+
+  return false;
+}
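[Editorial aside] For experimentation, the flags defined above can force the transform independently of the target's profitability answer; judging from the pass name and option definitions, an invocation would look something like `opt -hardware-loops -force-hardware-loops -hardware-loop-counter-bitwidth=64 -hardware-loop-decrement=4`. A sketch of the latch those overrides would produce:

```llvm
entry:
  ; Counter forced to i64 by -hardware-loop-counter-bitwidth=64.
  call void @llvm.set.loop.iterations.i64(i64 %count)
  br label %loop

loop:
  ; ... body ...
  ; Each decrement consumes 4 counts (-hardware-loop-decrement=4), the mode
  ; intended for loops that process several elements per iteration.
  %cont = call i1 @llvm.loop.decrement.i64(i64 4)
  br i1 %cont, label %loop, label %exit
```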
+
+bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {
+
+  Loop *L = HWLoopInfo.L;
+  LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) >
+        HWLoopInfo.CountType->getBitWidth())
+      continue;
+
+    // If this exiting block is contained in a nested loop, it is not eligible
+    // for insertion of the branch-and-decrement since the inner loop would
+    // end up messing up the value in the CTR.
+    if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L &&
+        !ForceNestedLoop)
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // existing block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
+
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
+
+    if (NotAlways)
+      continue;
+
+    // Make sure this block ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
+        continue;
+
+      HWLoopInfo.ExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    HWLoopInfo.ExitBlock = *I;
+    HWLoopInfo.ExitCount = EC;
+    break;
+  }
+
+  if (!HWLoopInfo.ExitBlock)
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one.
+  if (!Preheader)
+    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
+  if (!Preheader)
+    return false;
+
+  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
+  HWLoop.Create();
+  ++NumHWLoops;
+  return true;
+}
+
+void HardwareLoop::Create() {
+  LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
+
+  BasicBlock *BeginBB = L->getLoopPreheader();
+  Value *LoopCountInit = InitLoopCount(BeginBB);
+  if (!LoopCountInit)
+    return;
+
+  InsertIterationSetup(LoopCountInit, BeginBB);
+
+  if (UsePHICounter || ForceHardwareLoopPHI) {
+    Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
+    Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
+    LoopDec->setOperand(0, EltsRem);
+    UpdateBranch(LoopDec);
+  } else
+    InsertLoopDec();
+
+  // Run through the basic blocks of the loop and see if any of them have dead
+  // PHIs that can be removed.
+  for (auto I : L->blocks())
+    DeleteDeadPHIs(I);
+}
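[Editorial aside] When the target sets CounterInReg (or under -force-hardware-loop-phi), Create() takes the loop_decrement_reg path above. A hedged sketch of the resulting IR; the triple type suffix is assumed from the three overloaded types passed to Intrinsic::getDeclaration in InsertLoopRegDec below:

```llvm
header:
  ; InsertPHICounter: the remaining count is carried explicitly through a phi.
  %elts.rem = phi i32 [ %count, %preheader ], [ %rem.next, %latch ]
  ; ... body ...

latch:
  ; InsertLoopRegDec + UpdateBranch: decrement, then branch on a normal icmp.
  %rem.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %elts.rem, i32 1)
  %cmp = icmp ne i32 %rem.next, 0
  br i1 %cmp, label %header, label %exit
```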
+
+Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
+  SCEVExpander SCEVE(SE, DL, "loopcnt");
+  if (!ExitCount->getType()->isPointerTy() &&
+      ExitCount->getType() != CountType)
+    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
+
+  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
+
+  if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
+    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
+               << *ExitCount << "\n");
+    return nullptr;
+  }
+
+  Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
+                                     BB->getTerminator());
+  LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
+  return Count;
+}
+
+void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
+                                        BasicBlock *BB) {
+  IRBuilder<> Builder(BB->getTerminator());
+  Type *Ty = LoopCountInit->getType();
+  Function *LoopIter =
+    Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
+  Value *Call = Builder.CreateCall(LoopIter, LoopCountInit);
+  LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *Call << "\n");
+}
+
+void HardwareLoop::InsertLoopDec() {
+  IRBuilder<> CondBuilder(ExitBranch);
+
+  Function *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
+                              LoopDecrement->getType());
+  Value *Ops[] = { LoopDecrement };
+  Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(ExitBranch->getSuccessor(0)))
+    ExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
+}
+
+Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
+  IRBuilder<> CondBuilder(ExitBranch);
+
+  Function *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
+                              { EltsRem->getType(), EltsRem->getType(),
+                                LoopDecrement->getType()
+                              });
+  Value *Ops[] = { EltsRem, LoopDecrement };
+  Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
+  return cast<Instruction>(Call);
+}
+
+PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = ExitBranch->getParent();
+  IRBuilder<> Builder(Header->getFirstNonPHI());
+  PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
+  Index->addIncoming(NumElts, Preheader);
+  Index->addIncoming(EltsRem, Latch);
+  LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
+  return Index;
+}
+
+void HardwareLoop::UpdateBranch(Value *EltsRem) {
+  IRBuilder<> CondBuilder(ExitBranch);
+  Value *NewCond =
+    CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(ExitBranch->getSuccessor(0)))
+    ExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+}
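[Editorial aside] A note on the arithmetic in InitLoopCount: SE.getExitCount returns the backedge-taken count, which is one less than the number of times the loop body runs, hence the getAddExpr of one before expansion. For a loop like `for (i = 0; i < n; ++i)` with `%n` known non-zero (values assumed for illustration), the exit count is `%n - 1`, and the expanded preheader code reduces to:

```llvm
preheader:
  ; (n - 1) + 1 folds back to %n during SCEV expansion.
  call void @llvm.set.loop.iterations.i32(i32 %n)
  br label %header
```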
+
+INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+
+FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index 559d660897d..2b8d9b87724 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -71,63 +71,7 @@ using namespace llvm;
 static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
 #endif
 
-// The latency of mtctr is only justified if there are more than 4
-// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
-                      cl::desc("Loops with a constant trip count smaller than "
-                               "this value will not use the count register."));
-
-STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
-
 namespace {
-  struct PPCCTRLoops : public FunctionPass {
-
-#ifndef NDEBUG
-    static int Counter;
-#endif
-
-  public:
-    static char ID;
-
-    PPCCTRLoops() : FunctionPass(ID) {
-      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnFunction(Function &F) override;
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LoopInfoWrapperPass>();
-      AU.addPreserved<LoopInfoWrapperPass>();
-      AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<ScalarEvolutionWrapperPass>();
-      AU.addRequired<AssumptionCacheTracker>();
-      AU.addRequired<TargetTransformInfoWrapperPass>();
-    }
-
-  private:
-    bool mightUseCTR(BasicBlock *BB);
-    bool convertToCTRLoop(Loop *L);
-
-  private:
-    const PPCTargetMachine *TM;
-    const PPCSubtarget *STI;
-    const PPCTargetLowering *TLI;
-    const DataLayout *DL;
-    const TargetLibraryInfo *LibInfo;
-    const TargetTransformInfo *TTI;
-    LoopInfo *LI;
-    ScalarEvolution *SE;
-    DominatorTree *DT;
-    bool PreserveLCSSA;
-    TargetSchedModel SchedModel;
-  };
-
-  char PPCCTRLoops::ID = 0;
-#ifndef NDEBUG
-  int PPCCTRLoops::Counter = 0;
-#endif
 
 #ifndef NDEBUG
   struct PPCCTRLoopsVerify : public MachineFunctionPass {
@@ -153,16 +97,6 @@ namespace {
 #endif // NDEBUG
 } // end anonymous namespace
 
-INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
-                    false, false)
-
-FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
-
 #ifndef NDEBUG
 INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                       "PowerPC CTR Loops Verify", false, false)
@@ -175,512 +109,6 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
 }
 #endif // NDEBUG
 
-bool PPCCTRLoops::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  TM = &TPC->getTM<PPCTargetMachine>();
-  STI = TM->getSubtargetImpl(F);
-  TLI = STI->getTargetLowering();
-
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-  DL = &F.getParent()->getDataLayout();
-  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
-  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-  SchedModel.init(STI);
-
-  bool MadeChange = false;
-
-  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
-       I != E; ++I) {
-    Loop *L = *I;
-    if (!L->getParentLoop())
-      MadeChange |= convertToCTRLoop(L);
-  }
-
-  return MadeChange;
-}
-
-static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
-  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
-    return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
-
-  return false;
-}
-
-// Determining the address of a TLS variable results in a function call in
-// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
-  const auto *GV = dyn_cast<GlobalValue>(MemAddr);
-  if (!GV) {
-    // Recurse to check for constants that refer to TLS global variables.
-    if (const auto *CV = dyn_cast<Constant>(MemAddr))
-      for (const auto &CO : CV->operands())
-        if (memAddrUsesCTR(TM, CO))
-          return true;
-
-    return false;
-  }
-
-  if (!GV->isThreadLocal())
-    return false;
-  TLSModel::Model Model = TM.getTLSModel(GV);
-  return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
-}
-
-// Loop through the inline asm constraints and look for something that clobbers
-// ctr.
-static bool asmClobbersCTR(InlineAsm *IA) {
-  InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
-  for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
-    InlineAsm::ConstraintInfo &C = CIV[i];
-    if (C.Type != InlineAsm::isInput)
-      for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
-        if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
-          return true;
-  }
-  return false;
-}
-
-bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
-  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
-       J != JE; ++J) {
-    if (CallInst *CI = dyn_cast<CallInst>(J)) {
-      // Inline ASM is okay, unless it clobbers the ctr register.
-      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
-        if (asmClobbersCTR(IA))
-          return true;
-        continue;
-      }
-
-      if (Function *F = CI->getCalledFunction()) {
-        // Most intrinsics don't become function calls, but some might.
-        // sin, cos, exp and log are always calls.
-        unsigned Opcode = 0;
-        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
-          switch (F->getIntrinsicID()) {
-          default: continue;
-          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
-          // we're definitely using CTR.
-          case Intrinsic::ppc_is_decremented_ctr_nonzero:
-          case Intrinsic::ppc_mtctr:
-            return true;
-
-// VisualStudio defines setjmp as _setjmp
-#if defined(_MSC_VER) && defined(setjmp) && \
-                       !defined(setjmp_undefined_for_msvc)
-#  pragma push_macro("setjmp")
-#  undef setjmp
-#  define setjmp_undefined_for_msvc
-#endif
-
-          case Intrinsic::setjmp:
-
-#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
-  // let's return it to _setjmp state
-#  pragma pop_macro("setjmp")
-#  undef setjmp_undefined_for_msvc
-#endif
-
-          case Intrinsic::longjmp:
-
-          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
-          // because, although it does clobber the counter register, the
-          // control can't then return to inside the loop unless there is also
-          // an eh_sjlj_setjmp.
-          case Intrinsic::eh_sjlj_setjmp:
-
-          case Intrinsic::memcpy:
-          case Intrinsic::memmove:
-          case Intrinsic::memset:
-          case Intrinsic::powi:
-          case Intrinsic::log:
-          case Intrinsic::log2:
-          case Intrinsic::log10:
-          case Intrinsic::exp:
-          case Intrinsic::exp2:
-          case Intrinsic::pow:
-          case Intrinsic::sin:
-          case Intrinsic::cos:
-            return true;
-          case Intrinsic::copysign:
-            if (CI->getArgOperand(0)->getType()->getScalarType()->
-                isPPC_FP128Ty())
-              return true;
-            else
-              continue; // ISD::FCOPYSIGN is never a library call.
-          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
-          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
-          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
-          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
-          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
-          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
-          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
-          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
-          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
-          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
-          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
-          }
-        }
-
-        // PowerPC does not use [US]DIVREM or other library calls for
-        // operations on regular types which are not otherwise library calls
-        // (i.e. soft float or atomics). If adapting for targets that do,
-        // additional care is required here.
-
-        LibFunc Func;
-        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
-            LibInfo->getLibFunc(F->getName(), Func) &&
-            LibInfo->hasOptimizedCodeGen(Func)) {
-          // Non-read-only functions are never treated as intrinsics.
-          if (!CI->onlyReadsMemory())
-            return true;
-
-          // Conversion happens only for FP calls.
-          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
-            return true;
-
-          switch (Func) {
-          default: return true;
-          case LibFunc_copysign:
-          case LibFunc_copysignf:
-            continue; // ISD::FCOPYSIGN is never a library call.
-          case LibFunc_copysignl:
-            return true;
-          case LibFunc_fabs:
-          case LibFunc_fabsf:
-          case LibFunc_fabsl:
-            continue; // ISD::FABS is never a library call.
-          case LibFunc_sqrt:
-          case LibFunc_sqrtf:
-          case LibFunc_sqrtl:
-            Opcode = ISD::FSQRT; break;
-          case LibFunc_floor:
-          case LibFunc_floorf:
-          case LibFunc_floorl:
-            Opcode = ISD::FFLOOR; break;
-          case LibFunc_nearbyint:
-          case LibFunc_nearbyintf:
-          case LibFunc_nearbyintl:
-            Opcode = ISD::FNEARBYINT; break;
-          case LibFunc_ceil:
-          case LibFunc_ceilf:
-          case LibFunc_ceill:
-            Opcode = ISD::FCEIL; break;
-          case LibFunc_rint:
-          case LibFunc_rintf:
-          case LibFunc_rintl:
-            Opcode = ISD::FRINT; break;
-          case LibFunc_round:
-          case LibFunc_roundf:
-          case LibFunc_roundl:
-            Opcode = ISD::FROUND; break;
-          case LibFunc_trunc:
-          case LibFunc_truncf:
-          case LibFunc_truncl:
-            Opcode = ISD::FTRUNC; break;
-          case LibFunc_fmin:
-          case LibFunc_fminf:
-          case LibFunc_fminl:
-            Opcode = ISD::FMINNUM; break;
-          case LibFunc_fmax:
-          case LibFunc_fmaxf:
-          case LibFunc_fmaxl:
-            Opcode = ISD::FMAXNUM; break;
-          }
-        }
-
-        if (Opcode) {
-          EVT EVTy =
-            TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
-
-          if (EVTy == MVT::Other)
-            return true;
-
-          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
-            continue;
-          else if (EVTy.isVector() &&
-                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
-            continue;
-
-          return true;
-        }
-      }
-
-      return true;
-    } else if (isa<BinaryOperator>(J) &&
-               J->getType()->getScalarType()->isPPC_FP128Ty()) {
-      // Most operations on ppc_f128 values become calls.
-      return true;
-    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
-               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
-      CastInst *CI = cast<CastInst>(J);
-      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
-          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
-          isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
-          isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
-        return true;
-    } else if (isLargeIntegerTy(!TM->isPPC64(),
-                                J->getType()->getScalarType()) &&
-               (J->getOpcode() == Instruction::UDiv ||
-                J->getOpcode() == Instruction::SDiv ||
-                J->getOpcode() == Instruction::URem ||
-                J->getOpcode() == Instruction::SRem)) {
-      return true;
-    } else if (!TM->isPPC64() &&
-               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
-               (J->getOpcode() == Instruction::Shl ||
-                J->getOpcode() == Instruction::AShr ||
-                J->getOpcode() == Instruction::LShr)) {
-      // Only on PPC32, for 128-bit integers (specifically not 64-bit
-      // integers), these might be runtime calls.
-      return true;
-    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
-      // On PowerPC, indirect jumps use the counter register.
-      return true;
-    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
-      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
-        return true;
-    }
-
-    // FREM is always a call.
-    if (J->getOpcode() == Instruction::FRem)
-      return true;
-
-    if (STI->useSoftFloat()) {
-      switch(J->getOpcode()) {
-      case Instruction::FAdd:
-      case Instruction::FSub:
-      case Instruction::FMul:
-      case Instruction::FDiv:
-      case Instruction::FPTrunc:
-      case Instruction::FPExt:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::UIToFP:
-      case Instruction::SIToFP:
-      case Instruction::FCmp:
-        return true;
-      }
-    }
-
-    for (Value *Operand : J->operands())
-      if (memAddrUsesCTR(*TM, Operand))
-        return true;
-  }
-
-  return false;
-}
-bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
-  bool MadeChange = false;
-
-  // Do not convert small short loops to CTR loop.
-  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
-  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
-    SmallPtrSet<const Value *, 32> EphValues;
-    auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
-        *L->getHeader()->getParent());
-    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
-    CodeMetrics Metrics;
-    for (BasicBlock *BB : L->blocks())
-      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
-    // 6 is an approximate latency for the mtctr instruction.
-    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
-      return false;
-  }
-
-  // Process nested loops first.
-  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
-    MadeChange |= convertToCTRLoop(*I);
-    LLVM_DEBUG(dbgs() << "Nested loop converted\n");
-  }
-
-  // If a nested loop has been converted, then we can't convert this loop.
-  if (MadeChange)
-    return MadeChange;
-
-  // Bail out if the loop has irreducible control flow.
-  LoopBlocksRPO RPOT(L);
-  RPOT.perform(LI);
-  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
-    return false;
-
-#ifndef NDEBUG
-  // Stop trying after reaching the limit (if any).
-  int Limit = CTRLoopLimit;
-  if (Limit >= 0) {
-    if (Counter >= CTRLoopLimit)
-      return false;
-    Counter++;
-  }
-#endif
-
-  // We don't want to spill/restore the counter register, and so we don't
-  // want to use the counter register if the loop contains calls.
-  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
-       I != IE; ++I)
-    if (mightUseCTR(*I))
-      return MadeChange;
-
-  SmallVector<BasicBlock*, 4> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-
-  // If there is an exit edge known to be frequently taken,
-  // we should not transform this loop.
-  for (auto &BB : ExitingBlocks) {
-    Instruction *TI = BB->getTerminator();
-    if (!TI) continue;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-      uint64_t TrueWeight = 0, FalseWeight = 0;
-      if (!BI->isConditional() ||
-          !BI->extractProfMetadata(TrueWeight, FalseWeight))
-        continue;
-
-      // If the exit path is more frequent than the loop path,
-      // we return here without further analysis for this loop.
-      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
-      if (( TrueIsExit && FalseWeight < TrueWeight) ||
-          (!TrueIsExit && FalseWeight > TrueWeight))
-        return MadeChange;
-    }
-  }
-
-  BasicBlock *CountedExitBlock = nullptr;
-  const SCEV *ExitCount = nullptr;
-  BranchInst *CountedExitBranch = nullptr;
-  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
-       IE = ExitingBlocks.end(); I != IE; ++I) {
-    const SCEV *EC = SE->getExitCount(L, *I);
-    LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
-                      << (*I)->getName() << ": " << *EC << "\n");
-    if (isa<SCEVCouldNotCompute>(EC))
-      continue;
-    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
-      if (ConstEC->getValue()->isZero())
-        continue;
-    } else if (!SE->isLoopInvariant(EC, L))
-      continue;
-
-    if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
-      continue;
-
-    // If this exiting block is contained in a nested loop, it is not eligible
-    // for insertion of the branch-and-decrement since the inner loop would
-    // end up messing up the value in the CTR.
-    if (LI->getLoopFor(*I) != L)
-      continue;
-
-    // We now have a loop-invariant count of loop iterations (which is not the
-    // constant zero) for which we know that this loop will not exit via this
-    // existing block.
-
-    // We need to make sure that this block will run on every loop iteration.
-    // For this to be true, we must dominate all blocks with backedges. Such
-    // blocks are in-loop predecessors to the header block.
-    bool NotAlways = false;
-    for (pred_iterator PI = pred_begin(L->getHeader()),
-         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
-      if (!L->contains(*PI))
-        continue;
-
-      if (!DT->dominates(*I, *PI)) {
-        NotAlways = true;
-        break;
-      }
-    }
-
-    if (NotAlways)
-      continue;
-
-    // Make sure this blocks ends with a conditional branch.
-    Instruction *TI = (*I)->getTerminator();
-    if (!TI)
-      continue;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-      if (!BI->isConditional())
-        continue;
-
-      CountedExitBranch = BI;
-    } else
-      continue;
-
-    // Note that this block may not be the loop latch block, even if the loop
-    // has a latch block.
-    CountedExitBlock = *I;
-    ExitCount = EC;
-    break;
-  }
-
-  if (!CountedExitBlock)
-    return MadeChange;
-
-  BasicBlock *Preheader = L->getLoopPreheader();
-
-  // If we don't have a preheader, then insert one. If we already have a
-  // preheader, then we can use it (except if the preheader contains a use of
-  // the CTR register because some such uses might be reordered by the
-  // selection DAG after the mtctr instruction).
-  if (!Preheader || mightUseCTR(Preheader))
-    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
-  if (!Preheader)
-    return MadeChange;
-
-  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
-                    << "\n");
-
-  // Insert the count into the preheader and replace the condition used by the
-  // selected branch.
-  MadeChange = true;
-
-  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
-  LLVMContext &C = SE->getContext();
-  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
-  if (!ExitCount->getType()->isPointerTy() &&
-      ExitCount->getType() != CountType)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
-  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
-  Value *ECValue =
-      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
-  IRBuilder<> CountBuilder(Preheader->getTerminator());
-  Module *M = Preheader->getParent()->getParent();
-  Function *MTCTRFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
-  CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
-  IRBuilder<> CondBuilder(CountedExitBranch);
-  Function *DecFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
-  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
-  Value *OldCond = CountedExitBranch->getCondition();
-  CountedExitBranch->setCondition(NewCond);
-
-  // The false branch must exit the loop.
-  if (!L->contains(CountedExitBranch->getSuccessor(0)))
-    CountedExitBranch->swapSuccessors();
-
-  // The old condition may be dead now, and may have even created a dead PHI
-  // (the original induction variable).
-  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-  // Run through the basic blocks of the loop and see if any of them have dead
-  // PHIs that can be removed.
-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
-
-  ++NumCTRLoops;
-  return MadeChange;
-}
-
 #ifndef NDEBUG
 static bool clobbersCTR(const MachineInstr &MI) {
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ee8fda9c437..116916ff3b4 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9944,7 +9944,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
   }
   case ISD::INTRINSIC_W_CHAIN: {
     if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
-            Intrinsic::ppc_is_decremented_ctr_nonzero)
+            Intrinsic::loop_decrement)
       break;
 
     assert(N->getValueType(0) == MVT::i1 &&
@@ -13636,7 +13636,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
 
     if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
         cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+          Intrinsic::loop_decrement) {
 
       // We now need to make the intrinsic dead (it cannot be instruction
       // selected).
@@ -13662,14 +13662,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (LHS.getOpcode() == ISD::AND &&
         LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
         cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_decrement &&
         isa<ConstantSDNode>(LHS.getOperand(1)) &&
         !isNullConstant(LHS.getOperand(1)))
       LHS = LHS.getOperand(0);
 
     if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
         cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_decrement &&
         isa<ConstantSDNode>(RHS)) {
       assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
              "Counter decrement comparison is not EQ or NE");
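[Editorial aside] On PowerPC the generic intrinsics land on the same CTR machinery the old pass targeted: the .td changes below pattern-match int_set_loop_iterations onto the mtctr pseudo-instructions, and the combines above fold a conditional branch on loop_decrement into a decrementing CTR branch. Roughly, as a sketch (the final assembly depends on subtarget and register allocation):

```llvm
; preheader
call void @llvm.set.loop.iterations.i64(i64 %n)   ; => mtctr <reg>
; latch
%cont = call i1 @llvm.loop.decrement.i64(i64 1)
br i1 %cont, label %loop, label %exit             ; => bdnz <loop label>
```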
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index e17bc25ac21..6b0e3a29f96 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -388,7 +388,7 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
             PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
 def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
                                "mtctr $rS", IIC_SprMTSPR>,
                  PPC970_DGroup_First, PPC970_Unit_FXU;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 2ec6d6ba3c5..6b6787f3521 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -2605,7 +2605,7 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
             PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
 def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                               "mtctr $rS", IIC_SprMTSPR>,
                 PPC970_DGroup_First, PPC970_Unit_FXU;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 915b89a3e29..da1121bac9c 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -101,7 +101,6 @@ extern "C" void LLVMInitializePowerPCTarget() {
   RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
 
   PassRegistry &PR = *PassRegistry::getPassRegistry();
-  initializePPCCTRLoopsPass(PR);
 #ifndef NDEBUG
   initializePPCCTRLoopsVerifyPass(PR);
 #endif
@@ -422,7 +421,7 @@ bool PPCPassConfig::addPreISel() {
     addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
 
   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCCTRLoops());
+    addPass(createHardwareLoopsPass());
 
   return false;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index fdb2a38419d..bc2983836f7 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -7,10 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 using namespace llvm;
@@ -31,6 +33,13 @@
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false), cl::desc("Enable using coldcc calling conv for cold " "internal functions")); +// The latency of mtctr is only justified if there are more than 4 +// comparisons that will be removed as a result. +static cl::opt<unsigned> +SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, + cl::desc("Loops with a constant trip count smaller than " + "this value will not use the count register.")); + //===----------------------------------------------------------------------===// // // PPC cost model. @@ -204,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U, return BaseT::getUserCost(U, Operands); } +bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, + TargetLibraryInfo *LibInfo) { + const PPCTargetMachine &TM = ST->getTargetMachine(); + + // Loop through the inline asm constraints and look for something that + // clobbers ctr. + auto asmClobbersCTR = [](InlineAsm *IA) { + InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + return false; + }; + + // Determining the address of a TLS variable results in a function call in + // certain TLS models. + std::function<bool(const Value*)> memAddrUsesCTR = + [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool { + const auto *GV = dyn_cast<GlobalValue>(MemAddr); + if (!GV) { + // Recurse to check for constants that refer to TLS global variables. + if (const auto *CV = dyn_cast<Constant>(MemAddr)) + for (const auto &CO : CV->operands()) + if (memAddrUsesCTR(CO)) + return true; + + return false; + } + + if (!GV->isThreadLocal()) + return false; + TLSModel::Model Model = TM.getTLSModel(GV); + return Model == TLSModel::GeneralDynamic || + Model == TLSModel::LocalDynamic; + }; + + auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); + + return false; + }; + + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); + J != JE; ++J) { + if (CallInst *CI = dyn_cast<CallInst>(J)) { + // Inline ASM is okay, unless it clobbers the ctr register. + if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { + if (asmClobbersCTR(IA)) + return true; + continue; + } + + if (Function *F = CI->getCalledFunction()) { + // Most intrinsics don't become function calls, but some might. + // sin, cos, exp and log are always calls. + unsigned Opcode = 0; + if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { + switch (F->getIntrinsicID()) { + default: continue; + // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr + // we're definitely using CTR. 
+
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      // Inline ASM is okay, unless it clobbers the ctr register.
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        if (asmClobbersCTR(IA))
+          return true;
+        continue;
+      }
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to set_loop_iterations or loop_decrement,
+          // we're definitely using CTR.
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_decrement:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+  // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+            TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
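[Editorial aside] Putting the cases above together, a hedged sample of loop bodies mightUseCTR rejects (the specific operations are illustrative):

```llvm
loop:
  ; Any one of these lines disqualifies the block on PPC:
  %s = call float @sinf(float %x)   ; a genuine call, needs CTR save/restore
  %r = frem double %a, %b           ; FREM always lowers to a libcall
  %q = udiv i128 %u, %v             ; oversized division is a runtime call
  ; ...
```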
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if (( TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 993b2977cef..93a9968b5fa 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -33,6 +33,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
 
   const PPCSubtarget *getST() const { return ST; }
   const PPCTargetLowering *getTLI() const { return TLI; }
+  bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
 
 public:
   explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@@ -52,6 +53,10 @@ public:
   unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
 
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
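[Editorial aside] To close, a hedged end-to-end sketch of the pass applied to a simple counted loop (names invented; %n assumed known non-zero, otherwise SCEV guards the expanded count with a umax):

```llvm
define void @zero_init(i32* %p, i32 %n) {
entry:
  call void @llvm.set.loop.iterations.i32(i32 %n)
  br label %loop

loop:
  ; The induction variable survives for addressing, but the latch no longer
  ; compares it against %n; the exit test is the opaque hardware counter.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %addr = getelementptr i32, i32* %p, i32 %i
  store i32 0, i32* %addr
  %i.next = add i32 %i, 1
  %cont = call i1 @llvm.loop.decrement.i32(i32 1)
  br i1 %cont, label %loop, label %exit

exit:
  ret void
}
```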