author    Sam Parker <sam.parker@arm.com>  2019-06-07 07:35:30 +0000
committer Sam Parker <sam.parker@arm.com>  2019-06-07 07:35:30 +0000
commit    c5ef502ee8160c37823d21e622b2575a5504e815 (patch)
tree      7e3f2f79313f58bfd1afd5c6ade9790feed72b12 /llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
parent    04b418f2460153969ccf7e72a2ee90aad6c8fd35 (diff)
[CodeGen] Generic Hardware Loop Support
Patch which introduces a target-independent framework for generating
hardware loops at the IR level. Most of the code has been taken from
PowerPC CTRLoops, and PowerPC has been ported over to use this generic
pass. The target-dependent parts have been moved into
TargetTransformInfo, via isHardwareLoopProfitable, with HardwareLoopInfo
introduced to transfer information from the backend.

Three generic intrinsics have been introduced:

- void @llvm.set_loop_iterations
  Takes, as its single operand, the number of iterations to be executed.

- i1 @llvm.loop_decrement(anyint)
  Takes the maximum number of elements processed in an iteration of the
  loop body and subtracts this from the total count. Returns false when
  the loop should exit.

- anyint @llvm.loop_decrement_reg(anyint, anyint)
  Takes the number of elements remaining to be processed as well as the
  maximum number of elements processed in an iteration of the loop body.
  Returns the updated number of elements remaining.

llvm-svn: 362774
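To make the new contract concrete, here is a minimal textual-IR sketch of a
counted loop after conversion by the generic hardware-loop pass. The function,
the loop body, and the mangled intrinsic names are illustrative assumptions
(textual IR typically spells intrinsics with dots and an overload suffix,
e.g. @llvm.set.loop.iterations.i32, rather than the underscore form used
above); this is a sketch, not code taken from the commit.

; Hypothetical loop zeroing %n elements, shown after hardware-loop
; conversion. Intrinsic name mangling is an assumption.
define void @zero_init(i32* %p, i32 %n) {
entry:
  %guard = icmp sgt i32 %n, 0
  br i1 %guard, label %preheader, label %exit

preheader:
  ; Seed the hardware loop counter with the trip count.
  call void @llvm.set.loop.iterations.i32(i32 %n)
  br label %body

body:
  %iv = phi i32 [ 0, %preheader ], [ %iv.next, %body ]
  %addr = getelementptr inbounds i32, i32* %p, i32 %iv
  store i32 0, i32* %addr
  %iv.next = add nuw i32 %iv, 1
  ; Subtract one element from the counter; false means the loop should exit.
  %cont = call i1 @llvm.loop.decrement.i32(i32 1)
  br i1 %cont, label %body, label %exit

exit:
  ret void
}

declare void @llvm.set.loop.iterations.i32(i32)
declare i1 @llvm.loop.decrement.i32(i32)

On PowerPC the backend can then lower the counter setup to mtctr and the
decrement-and-branch to a bdnz-style branch, which is what makes removing the
per-iteration compare chain profitable.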
Diffstat (limited to 'llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp')
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 344
1 file changed, 344 insertions, 0 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index fdb2a38419d..bc2983836f7 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -7,10 +7,12 @@
//===----------------------------------------------------------------------===//
#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -31,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
@@ -204,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
return BaseT::getUserCost(U, Operands);
}
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+ TargetLibraryInfo *LibInfo) {
+ const PPCTargetMachine &TM = ST->getTargetMachine();
+
+ // Loop through the inline asm constraints and look for something that
+ // clobbers ctr.
+ auto asmClobbersCTR = [](InlineAsm *IA) {
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
+ }
+ return false;
+ };
+
+ // Determining the address of a TLS variable results in a function call in
+ // certain TLS models.
+ std::function<bool(const Value*)> memAddrUsesCTR =
+ [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+ const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+ if (!GV) {
+ // Recurse to check for constants that refer to TLS global variables.
+ if (const auto *CV = dyn_cast<Constant>(MemAddr))
+ for (const auto &CO : CV->operands())
+ if (memAddrUsesCTR(CO))
+ return true;
+
+ return false;
+ }
+
+ if (!GV->isThreadLocal())
+ return false;
+ TLSModel::Model Model = TM.getTLSModel(GV);
+ return Model == TLSModel::GeneralDynamic ||
+ Model == TLSModel::LocalDynamic;
+ };
+
+ auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+ return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+ return false;
+ };
+
+ for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+ J != JE; ++J) {
+ if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ if (asmClobbersCTR(IA))
+ return true;
+ continue;
+ }
+
+ if (Function *F = CI->getCalledFunction()) {
+ // Most intrinsics don't become function calls, but some might.
+ // sin, cos, exp and log are always calls.
+ unsigned Opcode = 0;
+ if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+ switch (F->getIntrinsicID()) {
+ default: continue;
+ // If we have a call to set_loop_iterations or loop_decrement,
+ // we're definitely using CTR.
+ case Intrinsic::set_loop_iterations:
+ case Intrinsic::loop_decrement:
+ return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+# pragma pop_macro("setjmp")
+# undef setjmp_undefined_for_msvc
+#endif
+
+ case Intrinsic::longjmp:
+
+ // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+ // because, although it does clobber the counter register, the
+ // control can't then return to inside the loop unless there is also
+ // an eh_sjlj_setjmp.
+ case Intrinsic::eh_sjlj_setjmp:
+
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::powi:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ return true;
+ case Intrinsic::copysign:
+ if (CI->getArgOperand(0)->getType()->getScalarType()->
+ isPPC_FP128Ty())
+ return true;
+ else
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
+ }
+ }
+
+ // PowerPC does not use [US]DIVREM or other library calls for
+ // operations on regular types which are not otherwise library calls
+ // (i.e. soft float or atomics). If adapting for targets that do,
+ // additional care is required here.
+
+ LibFunc Func;
+ if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ // Non-read-only functions are never treated as intrinsics.
+ if (!CI->onlyReadsMemory())
+ return true;
+
+ // Conversion happens only for FP calls.
+ if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+ return true;
+
+ switch (Func) {
+ default: return true;
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ continue; // ISD::FCOPYSIGN is never a library call.
+ case LibFunc_copysignl:
+ return true;
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
+ continue; // ISD::FABS is never a library call.
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
+ Opcode = ISD::FSQRT; break;
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
+ Opcode = ISD::FFLOOR; break;
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
+ Opcode = ISD::FNEARBYINT; break;
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
+ Opcode = ISD::FCEIL; break;
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
+ Opcode = ISD::FRINT; break;
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
+ Opcode = ISD::FROUND; break;
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
+ Opcode = ISD::FTRUNC; break;
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
+ Opcode = ISD::FMINNUM; break;
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
+ Opcode = ISD::FMAXNUM; break;
+ }
+ }
+
+ if (Opcode) {
+ EVT EVTy =
+ TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+ if (EVTy == MVT::Other)
+ return true;
+
+ if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+ continue;
+ else if (EVTy.isVector() &&
+ TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+ continue;
+
+ return true;
+ }
+ }
+
+ return true;
+ } else if (isa<BinaryOperator>(J) &&
+ J->getType()->getScalarType()->isPPC_FP128Ty()) {
+ // Most operations on ppc_fp128 values become calls.
+ return true;
+ } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+ isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+ CastInst *CI = cast<CastInst>(J);
+ if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+ CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+ isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+ return true;
+ } else if (isLargeIntegerTy(!TM.isPPC64(),
+ J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::UDiv ||
+ J->getOpcode() == Instruction::SDiv ||
+ J->getOpcode() == Instruction::URem ||
+ J->getOpcode() == Instruction::SRem)) {
+ return true;
+ } else if (!TM.isPPC64() &&
+ isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+ (J->getOpcode() == Instruction::Shl ||
+ J->getOpcode() == Instruction::AShr ||
+ J->getOpcode() == Instruction::LShr)) {
+ // Only on PPC32, for 128-bit integers (specifically not 64-bit
+ // integers), these might be runtime calls.
+ return true;
+ } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+ // On PowerPC, indirect jumps use the counter register.
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+ if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+ return true;
+ }
+
+ // FREM is always a call.
+ if (J->getOpcode() == Instruction::FRem)
+ return true;
+
+ if (ST->useSoftFloat()) {
+ switch(J->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FCmp:
+ return true;
+ }
+ }
+
+ for (Value *Operand : J->operands())
+ if (memAddrUsesCTR(Operand))
+ return true;
+ }
+
+ return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ TTI::HardwareLoopInfo &HWLoopInfo) {
+ const PPCTargetMachine &TM = ST->getTargetMachine();
+ TargetSchedModel SchedModel;
+ SchedModel.init(ST);
+
+ // Do not convert short loops with a small constant trip count into CTR loops.
+ unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+ if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, *this, EphValues);
+ // 6 is an approximate latency for the mtctr instruction.
+ if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+ return false;
+ }
+
+ // We don't want to spill/restore the counter register, and so we don't
+ // want to use the counter register if the loop contains calls.
+ for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+ I != IE; ++I)
+ if (mightUseCTR(*I, LibInfo))
+ return false;
+
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // If there is an exit edge known to be frequently taken,
+ // we should not transform this loop.
+ for (auto &BB : ExitingBlocks) {
+ Instruction *TI = BB->getTerminator();
+ if (!TI) continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ uint64_t TrueWeight = 0, FalseWeight = 0;
+ if (!BI->isConditional() ||
+ !BI->extractProfMetadata(TrueWeight, FalseWeight))
+ continue;
+
+ // If the exit path is more frequent than the loop path,
+ // we return here without further analysis for this loop.
+ bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+ if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ (!TrueIsExit && FalseWeight > TrueWeight))
+ return false;
+ }
+ }
+
+ LLVMContext &C = L->getHeader()->getContext();
+ HWLoopInfo.CountType = TM.isPPC64() ?
+ Type::getInt64Ty(C) : Type::getInt32Ty(C);
+ HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+ return true;
+}
+
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
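For completeness, the counter-in-register form described in the commit message
might look like the sketch below, where the remaining-element count stays in an
ordinary register and feeds the exit compare. As above, the function and the
mangled name @llvm.loop.decrement.reg.i32 are assumptions for illustration.

; Hypothetical counter-in-register form: the count of remaining elements
; is carried as an ordinary IR value and updated by the intrinsic.
define void @zero_init_reg(i32* %p, i32 %n) {
entry:
  %guard = icmp sgt i32 %n, 0
  br i1 %guard, label %body, label %exit

body:
  %remaining = phi i32 [ %n, %entry ], [ %remaining.next, %body ]
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %body ]
  %addr = getelementptr inbounds i32, i32* %p, i32 %iv
  store i32 0, i32* %addr
  %iv.next = add nuw i32 %iv, 1
  ; One element is consumed per iteration; the intrinsic returns the
  ; updated number of elements remaining.
  %remaining.next = call i32 @llvm.loop.decrement.reg.i32(i32 %remaining, i32 1)
  %cont = icmp ne i32 %remaining.next, 0
  br i1 %cont, label %body, label %exit

exit:
  ret void
}

declare i32 @llvm.loop.decrement.reg.i32(i32, i32)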