summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp 26
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/optsize.ll 60
-rw-r--r-- llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll 54
3 files changed, 139 insertions, 1 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23d4a6b2166..c9c70b5c536 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2557,7 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
if (C->isZero())
return;
- assert(!Cost->foldTailByMasking() && "Cannot check stride when folding tail");
+ assert(!Cost->foldTailByMasking() &&
+ "Cannot SCEV check stride or overflow when folding tail");
// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -4637,6 +4638,29 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
+ if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ // FIXME: Avoid specializing for stride==1 instead of bailing out.
+ if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+ ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+ << "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+ return None;
+ }
+
// If we optimize the program for size, avoid creating the tail loop.
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 508823475ea..9fa65534f32 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -3,6 +3,7 @@
; will produce a tail loop with the optimize for size or the minimize size
; attributes. This is a target-dependent version of the test.
; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF
target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
@@ -136,3 +137,62 @@ for.end: ; preds = %for.body
attributes #1 = { minsize }
+
+; We can't vectorize this one because we version for stride==1, even though
+; TC is a multiple of VF.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+; AUTOVF-LABEL: @scev4stride1
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+for.body.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = mul nsw i32 %i.07, %k
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+ store i32 %0, i32* %arrayidx1, align 4
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 256
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ ret void
+}
+
+attributes #2 = { optsize }
+
+
+; PR39497
+; We can't vectorize this one because we version for an overflow check, and
+; the tiny trip count leads to opt-for-size (which otherwise could fold the
+; tail by masking).
+; CHECK-LABEL: @main
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.cond:
+; AUTOVF-LABEL: @main
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.cond:
+define i32 @main() local_unnamed_addr {
+while.cond:
+ br label %for.cond
+
+for.cond:
+ %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
+ %conv = and i32 %d.0, 65535
+ %cmp = icmp ult i32 %conv, 4
+ %add = add nuw nsw i32 %conv, 1
+ br i1 %cmp, label %for.cond, label %while.cond.loopexit
+
+while.cond.loopexit:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
new file mode 100644
index 00000000000..6032fb18a38
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR39417
+; Check that the need for overflow check prevents vectorizing a loop with tiny
+; trip count (which implies opt for size).
+; CHECK-LABEL: @func_34
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: bb67:
+define void @func_34() {
+bb1:
+ br label %bb67
+
+bb67:
+ %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ]
+ %sext = shl i32 %storemerge2, 16
+ %_tmp2299 = ashr exact i32 %sext, 16
+ %_tmp2300 = add nsw i32 %_tmp2299, 1
+ %_tmp2310 = trunc i32 %_tmp2300 to i16
+ %_tmp2312 = icmp slt i16 %_tmp2310, 3
+ br i1 %_tmp2312, label %bb67, label %bb68
+
+bb68:
+ ret void
+}
+
+; Check that the need for stride==1 check prevents vectorizing a loop under opt
+; for size.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = mul nsw i32 %i.07, %k
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+ store i32 %0, i32* %arrayidx1, align 4
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ ret void
+}
+
+attributes #0 = { optsize }
OpenPOWER on IntegriCloud