Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h |  2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/small-size.ll   | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/tripcount.ll        | 91
4 files changed, 111 insertions, 55 deletions
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index 73d1f264c37..57d10c4c747 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -87,8 +87,6 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
OptimizationRemarkEmitter *ORE;
- BlockFrequency ColdEntryFreq;
-
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Shim for old PM.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1abdb248485..eac2867233b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5702,14 +5702,14 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// We should not collect Uniforms more than once per VF. Right now,
- // this function is called from collectUniformsAndScalars(), which
+ // this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.
assert(VF >= 2 && !Uniforms.count(VF) &&
"This function should not be visited twice for the same VF");
- // Visit the list of Uniforms. If we'll not find any uniform value, we'll
+ // Visit the list of Uniforms. If we'll not find any uniform value, we'll
// not analyze again. Uniforms.count(VF) will return 1.
Uniforms[VF].clear();
@@ -5988,10 +5988,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
continue;
Value *Ptr = getPointerOperand(&I);
- // We don't check wrapping here because we don't know yet if Ptr will be
- // part of a full group or a group with gaps. Checking wrapping for all
+ // We don't check wrapping here because we don't know yet if Ptr will be
+ // part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
- // conservative. For full groups, wrapping should be ok since if we would
+ // conservative. For full groups, wrapping should be ok since if we would
// wrap around the address space we would do a memory access at nullptr
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
@@ -6244,7 +6244,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
Value *LastMemberPtr = getPointerOperand(LastMember);
- if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
"last group member potentially pointer-wrapping.\n");
@@ -6252,9 +6252,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
} else {
// Case 3: A non-reversed interleaved load group with gaps: We need
- // to execute at least one scalar epilogue iteration. This will ensure
+ // to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
- // to look for a member at index factor - 1, since every group must have
+ // to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
releaseGroup(Group);
@@ -7789,8 +7789,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the loop for a trip count threshold:
// do not vectorize loops with a tiny trip count.
- const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
- if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
+ unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ bool HasExpectedTC = (ExpectedTC > 0);
+
+ if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+ auto EstimatedTC = getLoopEstimatedTripCount(L);
+ if (EstimatedTC) {
+ ExpectedTC = *EstimatedTC;
+ HasExpectedTC = true;
+ }
+ }
+
+ if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is not worth vectorizing.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
@@ -7822,18 +7832,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
- // Compute the weighted frequency of this loop being executed and see if it
- // is less than 20% of the function entry baseline frequency. Note that we
- // always have a canonical loop here because we think we *can* vectorize.
- // FIXME: This is hidden behind a flag due to pervasive problems with
- // exactly what block frequency models.
- if (LoopVectorizeWithBlockFrequency) {
- BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- LoopEntryFreq < ColdEntryFreq)
- OptForSize = true;
- }
-
// Check the function attributes to see if implicit floats are allowed.
// FIXME: This check doesn't seem possibly correct -- what if the loop is
// an integer loop and the vector instructions selected are purely integer
@@ -8015,11 +8013,6 @@ bool LoopVectorizePass::runImpl(
DB = &DB_;
ORE = &ORE_;
- // Compute some weights outside of the loop over the loops. Compute this
- // using a BranchProbability to re-use its scaling math.
- const BranchProbability ColdProb(1, 5); // 20%
- ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
-
// Don't attempt if
// 1. the target claims to have no vector registers, and
// 2. interleaving won't help ILP.
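
When SCEV cannot prove a small constant maximum trip count, the new code in
processLoop() above falls back to getLoopEstimatedTripCount() (declared in
llvm/include/llvm/Transforms/Utils/LoopUtils.h), which derives an estimate from
the !prof branch_weights metadata on the loop latch. The sketch below is an
approximation of that derivation, not the helper's actual implementation:
estimateTripCountFromProfile is a hypothetical name used only for illustration,
and the real function guards more corner cases.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Optional<unsigned> estimateTripCountFromProfile(Loop *L) {
  // Only handle the common shape: a loop latch ending in a two-way
  // conditional branch that carries branch_weights metadata.
  BasicBlock *Latch = L->getLoopLatch();
  if (!Latch)
    return None;
  auto *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBR || LatchBR->getNumSuccessors() != 2)
    return None;

  uint64_t TrueWeight, FalseWeight;
  if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight))
    return None;

  // Sort the two weights into a backedge weight and an exit weight.
  bool TrueIsBackedge = LatchBR->getSuccessor(0) == L->getHeader();
  uint64_t BackedgeWeight = TrueIsBackedge ? TrueWeight : FalseWeight;
  uint64_t ExitWeight = TrueIsBackedge ? FalseWeight : TrueWeight;
  if (ExitWeight == 0)
    return None;

  // The loop exits once per entry, so on average the backedge is taken
  // BackedgeWeight / ExitWeight times; the trip count is one more than that.
  return unsigned(BackedgeWeight / ExitWeight) + 1;
}
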
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 47c262b11b4..89d69e232f5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -115,32 +115,6 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
ret void
}
-; N is unknown, we need a tail. Can't vectorize because the loop is cold.
-;CHECK-LABEL: @example4(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
-define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
- %1 = icmp eq i32 %n, 0
- br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
-
-.lr.ph: ; preds = %0, %.lr.ph
- %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
- %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
- %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
- %2 = add nsw i32 %.05, -1
- %3 = getelementptr inbounds i32, i32* %.023, i64 1
- %4 = load i32, i32* %.023, align 16
- %5 = getelementptr inbounds i32, i32* %.014, i64 1
- store i32 %4, i32* %.014, align 16
- %6 = icmp eq i32 %2, 0
- br i1 %6, label %._crit_edge, label %.lr.ph
-
-._crit_edge: ; preds = %.lr.ph, %0
- ret void
-}
-
-!0 = !{!"branch_weights", i32 64, i32 4}
-
; We can't vectorize this one because we need a runtime ptr check.
;CHECK-LABEL: @example23(
;CHECK-NOT: <4 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
new file mode 100644
index 00000000000..03b3aa171d4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -0,0 +1,91 @@
+; This test verifies that the loop vectorizer will not vectorize low trip count
+; loops that require runtime checks (the trip count is computed from profile info).
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_low_trip_count1(i32 %bound) {
+; Simple loop with a low trip count. Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count1(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
+; The loop has the same invocation count as the function, but has a low
+; trip count per invocation, so it is not worth vectorizing.
+
+; CHECK-LABEL: @foo_low_trip_count2(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
+; The loop has a low invocation count compared to the function invocation count,
+; but has a high trip count per invocation. Vectorize it.
+
+; CHECK-LABEL: @foo_low_trip_count3(
+; CHECK: vector.body:
+
+entry:
+ br i1 %cond, label %for.preheader, label %for.end, !prof !2
+
+for.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.preheader
+ %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !3
+
+for.end: ; preds = %for.body, %entry
+ ret i32 0
+}
+
+
+!0 = !{!"function_entry_count", i64 100}
+!1 = !{!"branch_weights", i32 100, i32 0}
+!2 = !{!"branch_weights", i32 10, i32 90}
+!3 = !{!"branch_weights", i32 10, i32 10000}
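
As a back-of-the-envelope check of the metadata above (assuming the estimate is
backedge weight / exit weight + 1, as in the sketch after the LoopVectorize.cpp
diff): in @foo_low_trip_count1 and @foo_low_trip_count2 the latch branch uses
!1, putting weight 100 on the exit edge and 0 on the backedge, so the estimated
trip count is 0/100 + 1 = 1, far below the tiny-trip-count threshold, and
neither loop is vectorized. In @foo_low_trip_count3 the latch uses !3, putting
10 on the exit edge and 10000 on the backedge, for roughly 10000/10 + 1 = 1001
iterations per invocation; that loop is vectorized even though !2 marks its
guard as taken only 10 times in 100.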