summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/include/llvm/Support/MathExtras.h | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9
-rw-r--r-- llvm/test/Transforms/LoopVectorize/unroll_novec.ll | 12
3 files changed, 24 insertions, 4 deletions
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 13c2f72e1d9..30a1ad45844 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -552,6 +552,13 @@ inline uint64_t NextPowerOf2(uint64_t A) {
return A + 1;
}
+/// Returns the largest power of two that is less than or equal to \p A, or 0
+/// if \p A is 0; in effect a floor operation over the domain of powers of two.
+inline uint64_t PowerOf2Floor(uint64_t A) {
+ if (!A) return 0;
+ return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
+}
+
/// Returns the next integer (mod 2**64) that is greater than or equal to
/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23dc8f39eea..5c9933a2c72 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5004,8 +5004,11 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
// registers. These registers are used by all of the unrolled instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
- // fit without causing spills.
- unsigned UF = (TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two unroll factors to simplify any
+ // addressing operations or alignment considerations.
+ unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ R.MaxLocalUsers);
// Clamp the unroll factor ranges to reasonable factors.
unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
@@ -5045,7 +5048,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (LoopCost < SmallLoopCost) {
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
- unsigned NewUF = SmallLoopCost / (LoopCost + 1);
+ unsigned NewUF = PowerOf2Floor(SmallLoopCost / LoopCost);
return std::min(NewUF, UF);
}
diff --git a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
index 33f128da905..be5bbb68e65 100644
--- a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
+++ b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-unroll=8 -small-loop-cost=20 -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -12,10 +12,20 @@ target triple = "x86_64-apple-macosx10.8.0"
;CHECK-LABEL: @inc(
;CHECK: load i32*
;CHECK: load i32*
+;CHECK: load i32*
+;CHECK: load i32*
+;CHECK-NOT: load i32*
+;CHECK: add nsw i32
;CHECK: add nsw i32
;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK-NOT: add nsw i32
+;CHECK: store i32
+;CHECK: store i32
;CHECK: store i32
;CHECK: store i32
+;CHECK-NOT: store i32
+;CHECK: add i64 %{{.*}}, 4
;CHECK: ret void
define void @inc(i32 %n) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %n, 0
OpenPOWER on IntegriCloud