Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp   194
-rw-r--r--   llvm/lib/Target/ARM/ARMTargetTransformInfo.h        6
2 files changed, 200 insertions, 0 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c626c4149c7..128d14c1537 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -36,6 +36,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "armtti"
 
+static cl::opt<bool> DisableLowOverheadLoops(
+    "disable-arm-loloops", cl::Hidden, cl::init(true),
+    cl::desc("Disable the generation of low-overhead loops"));
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -628,6 +632,196 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
+bool ARMTTIImpl::isLoweredToCall(const Function *F) {
+  if (!F->isIntrinsic())
+    return BaseT::isLoweredToCall(F);
+
+  // Assume all Arm-specific intrinsics map to an instruction.
+  if (F->getName().startswith("llvm.arm"))
+    return false;
+
+  switch (F->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::powi:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+    return true;
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::copysign:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::round:
+  case Intrinsic::canonicalize:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
+  case Intrinsic::lrint:
+  case Intrinsic::llrint:
+    if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+    if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+    // Some operations can be handled by vector instructions and assume
+    // unsupported vectors will be expanded into supported scalar ones.
+    // TODO Handle scalar operations properly.
+    return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_load:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter:
+    return !ST->hasMVEIntegerOps();
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat:
+    return false;
+  }
+
+  return BaseT::isLoweredToCall(F);
+}
+
+bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  // For now, for simplicity, only support loops with one exit block.
+  if (!L->getExitBlock())
+    return false;
+
+  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+    return false;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return false;
+
+  const SCEV *TripCountSCEV =
+    SE.getAddExpr(BackedgeTakenCount,
+                  SE.getOne(BackedgeTakenCount->getType()));
+
+  // We need to store the trip count in LR, a 32-bit register.
+  if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+    return false;
+
+  // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
+  // point in generating a hardware loop if that's going to happen.
+  auto MaybeCall = [this](Instruction &I) {
+    const ARMTargetLowering *TLI = getTLI();
+    unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
+    EVT VT = TLI->getValueType(DL, I.getType(), true);
+    if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
+      return true;
+
+    // Check if an intrinsic will be lowered to a call and assume that any
+    // other CallInst will generate a bl.
+    if (auto *Call = dyn_cast<CallInst>(&I)) {
+      if (isa<IntrinsicInst>(Call)) {
+        if (const Function *F = Call->getCalledFunction())
+          return isLoweredToCall(F);
+      }
+      return true;
+    }
+
+    // FPv5 provides conversions between integer, double-precision,
+    // single-precision, and half-precision formats.
+    switch (I.getOpcode()) {
+    default:
+      break;
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+      return !ST->hasFPARMv8Base();
+    }
+
+    // FIXME: Unfortunately the approach of checking the Operation Action does
+    // not catch all cases of Legalization that use library calls. Our
+    // Legalization step categorizes some transformations into library calls as
+    // Custom, Expand or even Legal when doing type legalization. So for now
+    // we have to special case for instance the SDIV of 64bit integers and the
+    // use of floating point emulation.
+    if (VT.isInteger() && VT.getSizeInBits() >= 64) {
+      switch (ISD) {
+      default:
+        break;
+      case ISD::SDIV:
+      case ISD::UDIV:
+      case ISD::SREM:
+      case ISD::UREM:
+      case ISD::SDIVREM:
+      case ISD::UDIVREM:
+        return true;
+      }
+    }
+
+    // Assume all other non-float operations are supported.
+    if (!VT.isFloatingPoint())
+      return false;
+
+    // We'll need a library call to handle most floats when using soft.
+    if (TLI->useSoftFloat()) {
+      switch (I.getOpcode()) {
+      default:
+        return true;
+      case Instruction::Alloca:
+      case Instruction::Load:
+      case Instruction::Store:
+      case Instruction::Select:
+      case Instruction::PHI:
+        return false;
+      }
+    }
+
+    // We'll need a libcall to perform double precision operations on a single
+    // precision only FPU.
+    if (I.getType()->isDoubleTy() && !ST->hasFP64())
+      return true;
+
+    // Likewise for half precision arithmetic.
+    if (I.getType()->isHalfTy() && !ST->hasFullFP16())
+      return true;
+
+    return false;
+  };
+
+  // Scan the instructions to see if there's any that we know will turn into a
+  // call.
+  for (auto *BB : L->getBlocks())
+    for (auto &I : *BB)
+      if (MaybeCall(I))
+        return false;
+
+  // TODO: Check whether the trip count calculation is expensive. If L is the
+  // inner loop but we know it has a low trip count, calculating that trip
+  // count (in the parent loop) may be detrimental.
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CounterInReg = true;
+  HWLoopInfo.CountType = Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
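Note (not part of the patch): a minimal sketch of how a client pass might consult the new hook through the TargetTransformInfo wrapper. It assumes the usual analyses are already available and that TTI::HardwareLoopInfo can be constructed from the loop; only the field names CounterInReg, CountType and LoopDecrement come from the patch above, everything else here is illustrative.

// Sketch only: ask the target whether a hardware loop is profitable and read
// back the loop-counter parameters that ARMTTIImpl fills in.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Hypothetical helper, not an existing LLVM API.
static bool shouldUseHardwareLoop(Loop *L, const TargetTransformInfo &TTI,
                                  ScalarEvolution &SE, AssumptionCache &AC,
                                  TargetLibraryInfo *LibInfo) {
  // HardwareLoopInfo(L) is assumed constructible from the loop here.
  TargetTransformInfo::HardwareLoopInfo HWLoopInfo(L);
  if (!TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo))
    return false;
  // For v8.1-M the hook reports an i32 counter kept in a register (LR) and
  // decremented by 1 each iteration.
  return HWLoopInfo.CounterInReg && HWLoopInfo.CountType->isIntegerTy(32);
}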
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 882a63c33a5..4f68a0b244d 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -180,6 +180,12 @@ public:
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
 
+  bool isLoweredToCall(const Function *F);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
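Usage note (not part of the patch): DisableLowOverheadLoops is a hidden option that defaults to true, so the new path stays off until the flag is flipped on the command line, as is standard for a cl::opt. A hypothetical llc invocation, with an illustrative triple (v8.1-M Mainline, the architecture providing the LOB extension checked by ST->hasLOB()) and input name:

  llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false example.ll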

