author    Sjoerd Meijer <sjoerd.meijer@arm.com>  2019-11-13 13:02:16 +0000
committer Sjoerd Meijer <sjoerd.meijer@arm.com>  2019-11-13 13:24:33 +0000
commit    d90804d26befeda36641fade3edba107682cc5cf (patch)
tree      e7ba1ef2b13433a4871b6a1313095d13c0de1ff4 /llvm/lib/Target/ARM
parent    a5ce8bd715ad9e1d7dfc150f4eba9d24921ca5ba (diff)
[ARM][MVE] canTailPredicateLoop
This implements the TTI hook 'preferPredicateOverEpilogue' for MVE. This is a first version and it operates on single-block loops only. With this change, the vectoriser will now determine whether tail-folding the scalar remainder loop is possible/desired, which is the first step towards generating MVE tail-predicated vector loops.

This is disabled by default for now. I.e., it depends on option -disable-mve-tail-predication, which defaults to true, so the feature is off by default. I will follow up on this soon with a patch for the vectoriser to respect the loop hint 'vectorize.predicate.enable'. I.e., with this loop hint set to disabled, we don't want to tail-fold and we shouldn't query this TTI hook; that is done in D70125.

Differential Revision: https://reviews.llvm.org/D69845
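For illustration only (not part of this patch), a minimal sketch of the kind of loop this hook targets: a single-block loop with a uniform element size (i32) and consecutive, stride-1 memory accesses. The function name is made up, and the use of clang's vectorize_predicate loop pragma, which lowers to the 'vectorize.predicate.enable' hint mentioned above, is my own addition:

    // Hypothetical example: with tail-predication, the final partial vector
    // iteration runs with the excess lanes masked off, so no scalar
    // remainder (epilogue) loop is needed.
    void add_mul(int *dst, const int *a, const int *b, int k, int n) {
    #pragma clang loop vectorize_predicate(enable)
      for (int i = 0; i < n; i++)
        dst[i] = a[i] + k * b[i];
    }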
Diffstat (limited to 'llvm/lib/Target/ARM')
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp  107
-rw-r--r--  llvm/lib/Target/ARM/MVETailPredication.cpp        2
2 files changed, 100 insertions, 9 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index eb698375985..df3057d62c7 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -44,6 +44,8 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
+extern cl::opt<bool> DisableTailPredication;
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -1000,18 +1002,114 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
return true;
}
+static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
+ // We don't allow icmp's, and because we only look at single block loops,
+ // we simply count the icmps, i.e. there should only be 1 for the backedge.
+ if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
+ return false;
+
+ // We could allow extending/narrowing FP loads/stores, but codegen is
+ // too inefficient so reject this for now.
+ if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
+ return false;
+
+ // Extends have to be extending-loads
+ if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
+ if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
+ return false;
+
+ // Truncs have to be narrowing-stores
+ if (isa<TruncInst>(&I) )
+ if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
+ return false;
+
+ return true;
+}
+
+// To set up a tail-predicated loop, we need to know the total number of
+// elements processed by that loop. Thus, we need to determine the element
+// size and:
+// 1) it should be uniform for all operations in the vector loop, so we
+// e.g. don't want any widening/narrowing operations.
+// 2) it should be smaller than i64s because we don't have vector operations
+// that work on i64s.
+// 3) we don't want elements to be reversed or shuffled, to make sure the
+// tail-predication masks/predicates the right lanes.
+//
+static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const LoopAccessInfo *LAI) {
+ PredicatedScalarEvolution PSE = LAI->getPSE();
+ int ICmpCount = 0;
+ int Stride = 0;
+
+ LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
+ SmallVector<Instruction *, 16> LoadStores;
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (isa<PHINode>(&I))
+ continue;
+ if (!canTailPredicateInstruction(I, ICmpCount)) {
+ LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
+ return false;
+ }
+
+ Type *T = I.getType();
+ if (T->isPointerTy())
+ T = T->getPointerElementType();
+
+ if (T->getScalarSizeInBits() > 32) {
+ LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
+ return false;
+ }
+
+ if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
+ Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
+ int64_t NextStride = getPtrStride(PSE, Ptr, L);
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple for
+ // now.
+ if (Stride == 0 && NextStride == 1) {
+ Stride = NextStride;
+ continue;
+ }
+ if (Stride != NextStride) {
+ LLVM_DEBUG(dbgs() << "Different strides found, can't "
+ "tail-predicate\n.");
+ return false;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
+ return true;
+}
+
bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *TLI,
DominatorTree *DT,
const LoopAccessInfo *LAI) {
+ if (DisableTailPredication)
+ return false;
+
// Creating a predicated vector loop is the first step for generating a
// tail-predicated hardware loop, for which we need the MVE masked
// load/stores instructions:
if (!ST->hasMVEIntegerOps())
return false;
+ // For now, restrict this to single block loops.
+ if (L->getNumBlocks() > 1) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+ "loop.\n");
+ return false;
+ }
+
+ assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
@@ -1033,14 +1131,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- // TODO: to set up a tail-predicated loop, which works by setting up
- // the total number of elements processed by the loop, we need to
- // determine the element size here, and if it is uniform for all operations
- // in the vector loop. This means we will reject narrowing/widening
- // operations, and don't want to predicate the vector loop, which is
- // the main prep step for tail-predicated loops.
-
- return false;
+ return canTailPredicateLoop(L, LI, SE, DL, LAI);
}
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 4db8ab17c49..36dcde250e3 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
#define DEBUG_TYPE "mve-tail-predication"
#define DESC "Transform predicated vector loops to use MVE tail predication"
-static cl::opt<bool>
+cl::opt<bool>
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
cl::init(true),
cl::desc("Disable MVE Tail Predication"));