| author | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2019-11-13 13:02:16 +0000 |
|---|---|---|
| committer | Sjoerd Meijer <sjoerd.meijer@arm.com> | 2019-11-13 13:24:33 +0000 |
| commit | d90804d26befeda36641fade3edba107682cc5cf (patch) | |
| tree | e7ba1ef2b13433a4871b6a1313095d13c0de1ff4 /llvm/lib/Target/ARM | |
| parent | a5ce8bd715ad9e1d7dfc150f4eba9d24921ca5ba (diff) | |
[ARM][MVE] canTailPredicateLoop
This implements the TTI hook 'preferPredicateOverEpilogue' for MVE. This is a
first version and it operates on single-block loops only. With this change, the
vectoriser can now determine whether tail-folding the scalar remainder loop is
possible/desired, which is the first step towards generating MVE tail-predicated
vector loops.
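As a quick illustration of what tail-folding means here (my own sketch with scalar stand-in loops, not the vectoriser's actual output): instead of a main vector loop followed by a scalar remainder (epilogue) loop, every iteration becomes a vector iteration, and the final one simply disables the out-of-range lanes via a predicate.

```cpp
#include <algorithm>

constexpr int VecWidth = 4;

// Vectorised loop plus a scalar remainder ("epilogue") loop for the last
// n % VecWidth elements.
void add_with_epilogue(int *a, const int *b, int n) {
  int i = 0;
  for (; i + VecWidth <= n; i += VecWidth)
    for (int l = 0; l < VecWidth; ++l)   // stands in for one full vector op
      a[i + l] += b[i + l];
  for (; i < n; ++i)                     // scalar remainder loop
    a[i] += b[i];
}

// Tail-folded form: no epilogue; the final iteration runs with only 'active'
// lanes enabled, which is what the mask/predicate does in a real MVE loop.
void add_tail_folded(int *a, const int *b, int n) {
  for (int i = 0; i < n; i += VecWidth) {
    int active = std::min(VecWidth, n - i);
    for (int l = 0; l < active; ++l)     // stands in for a predicated vector op
      a[i + l] += b[i + l];
  }
}
```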
This is disabled by default for now. I.e., it is guarded by the option
-disable-mve-tail-predication, which defaults to true, so tail predication is
off unless that flag is explicitly set to false.
I will follow up on this soon with a patch, D70125, that makes the vectoriser
respect the loop hint 'vectorize.predicate.enable': when this hint is set to
Disabled, we don't want to tail-fold and we shouldn't query this TTI hook.
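For reference, this hint corresponds to the 'llvm.loop.vectorize.predicate.enable' loop metadata, which Clang attaches for the vectorize_predicate loop pragma. A small illustration (mine, not part of this patch; it assumes a Clang recent enough to support that pragma):

```cpp
// The pragma below asks for a predicated (tail-folded) vector loop; Clang
// lowers it to the 'llvm.loop.vectorize.predicate.enable' metadata that the
// hint name above refers to. With 'disable' instead of 'enable', the
// vectoriser should not tail-fold this loop.
void scale(int *out, const int *in, int n, int k) {
#pragma clang loop vectorize_predicate(enable)
  for (int i = 0; i < n; ++i)
    out[i] = in[i] * k;
}
```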
Differential Revision: https://reviews.llvm.org/D69845
Diffstat (limited to 'llvm/lib/Target/ARM')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 107 |
| -rw-r--r-- | llvm/lib/Target/ARM/MVETailPredication.cpp | 2 |
2 files changed, 100 insertions, 9 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index eb698375985..df3057d62c7 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -44,6 +44,8 @@ static cl::opt<bool> DisableLowOverheadLoops(
     "disable-arm-loloops", cl::Hidden, cl::init(false),
     cl::desc("Disable the generation of low-overhead loops"));
 
+extern cl::opt<bool> DisableTailPredication;
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -1000,18 +1002,114 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
   return true;
 }
 
+static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
+  // We don't allow icmp's, and because we only look at single block loops,
+  // we simply count the icmps, i.e. there should only be 1 for the backedge.
+  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
+    return false;
+
+  // We could allow extending/narrowing FP loads/stores, but codegen is
+  // too inefficient so reject this for now.
+  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
+    return false;
+
+  // Extends have to be extending-loads
+  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
+    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
+      return false;
+
+  // Truncs have to be narrowing-stores
+  if (isa<TruncInst>(&I) )
+    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
+      return false;
+
+  return true;
+}
+
+// To set up a tail-predicated loop, we need to know the total number of
+// elements processed by that loop. Thus, we need to determine the element
+// size and:
+// 1) it should be uniform for all operations in the vector loop, so we
+//    e.g. don't want any widening/narrowing operations.
+// 2) it should be smaller than i64s because we don't have vector operations
+//    that work on i64s.
+// 3) we don't want elements to be reversed or shuffled, to make sure the
+//    tail-predication masks/predicates the right lanes.
+//
+static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                 const DataLayout &DL,
+                                 const LoopAccessInfo *LAI) {
+  PredicatedScalarEvolution PSE = LAI->getPSE();
+  int ICmpCount = 0;
+  int Stride = 0;
+
+  LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
+  SmallVector<Instruction *, 16> LoadStores;
+  for (BasicBlock *BB : L->blocks()) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
+      if (isa<PHINode>(&I))
+        continue;
+      if (!canTailPredicateInstruction(I, ICmpCount)) {
+        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
+        return false;
+      }
+
+      Type *T = I.getType();
+      if (T->isPointerTy())
+        T = T->getPointerElementType();
+
+      if (T->getScalarSizeInBits() > 32) {
+        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
+        return false;
+      }
+
+      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
+        Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
+        int64_t NextStride = getPtrStride(PSE, Ptr, L);
+        // TODO: for now only allow consecutive strides of 1. We could support
+        // other strides as long as it is uniform, but let's keep it simple for
+        // now.
+        if (Stride == 0 && NextStride == 1) {
+          Stride = NextStride;
+          continue;
+        }
+        if (Stride != NextStride) {
+          LLVM_DEBUG(dbgs() << "Different strides found, can't "
+                               "tail-predicate\n.");
+          return false;
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
+  return true;
+}
+
 bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                              ScalarEvolution &SE,
                                              AssumptionCache &AC,
                                              TargetLibraryInfo *TLI,
                                              DominatorTree *DT,
                                              const LoopAccessInfo *LAI) {
+  if (DisableTailPredication)
+    return false;
+
   // Creating a predicated vector loop is the first step for generating a
   // tail-predicated hardware loop, for which we need the MVE masked
   // load/stores instructions:
   if (!ST->hasMVEIntegerOps())
     return false;
 
+  // For now, restrict this to single block loops.
+  if (L->getNumBlocks() > 1) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+                         "loop.\n");
+    return false;
+  }
+
+  assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+
   HardwareLoopInfo HWLoopInfo(L);
   if (!HWLoopInfo.canAnalyze(*LI)) {
     LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
@@ -1033,14 +1131,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
     return false;
   }
 
-  // TODO: to set up a tail-predicated loop, which works by setting up
-  // the total number of elements processed by the loop, we need to
-  // determine the element size here, and if it is uniform for all operations
-  // in the vector loop. This means we will reject narrowing/widening
-  // operations, and don't want to predicate the vector loop, which is
-  // the main prep step for tail-predicated loops.
-
-  return false;
+  return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
 
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 4db8ab17c49..36dcde250e3 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
 #define DEBUG_TYPE "mve-tail-predication"
 #define DESC "Transform predicated vector loops to use MVE tail predication"
 
-static cl::opt<bool>
+cl::opt<bool>
 DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
                        cl::init(true),
                        cl::desc("Disable MVE Tail Predication"));
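To make the new checks concrete, here are two hypothetical loops (my own illustration, not taken from the patch or its tests). The first is the kind of single-block, stride-1, uniform-element-size loop that canTailPredicateLoop is written to accept; the second works on 64-bit elements, so the 32-bit scalar-size check above rejects it.

```cpp
#include <cstdint>

// Likely acceptable: a single-block loop, consecutive (stride-1) accesses,
// and a uniform element size of 32 bits or less.
void ok_loop(int32_t *a, const int32_t *b, const int32_t *c, int n) {
  for (int i = 0; i < n; ++i)
    a[i] = b[i] + c[i];
}

// Likely rejected: the loop works on 64-bit elements (the i64 load, add and
// store of 'a[i]'), which exceeds the 32-bit element-size limit checked via
// getScalarSizeInBits().
void rejected_loop(int64_t *a, const int32_t *b, int n) {
  for (int i = 0; i < n; ++i)
    a[i] += static_cast<int64_t>(b[i]);
}
```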

