3 files changed, 24 insertions, 4 deletions
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 13c2f72e1d9..30a1ad45844 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -552,6 +552,13 @@ inline uint64_t NextPowerOf2(uint64_t A) {
   return A + 1;
 }
 
+/// Returns the power of two which is less than or equal to the given value.
+/// Essentially, it is a floor operation across the domain of powers of two.
+inline uint64_t PowerOf2Floor(uint64_t A) {
+  if (!A) return 0;
+  return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
+}
+
 /// Returns the next integer (mod 2**64) that is greater than or equal to
 /// \p Value and is a multiple of \p Align. \p Align must be non-zero.
 ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23dc8f39eea..5c9933a2c72 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5004,8 +5004,11 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // registers. These registers are used by all of the unrolled instances.
   // Next, divide the remaining registers by the number of registers that is
   // required by the loop, in order to estimate how many parallel instances
-  // fit without causing spills.
-  unsigned UF = (TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+  // fit without causing spills. All of this is rounded down if necessary to be
+  // a power of two. We want power of two unroll factors to simplify any
+  // addressing operations or alignment considerations.
+  unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+                              R.MaxLocalUsers);
 
   // Clamp the unroll factor ranges to reasonable factors.
   unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
@@ -5045,7 +5048,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
   if (LoopCost < SmallLoopCost) {
     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
+    unsigned NewUF = PowerOf2Floor(SmallLoopCost / LoopCost);
     return std::min(NewUF, UF);
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
index 33f128da905..be5bbb68e65 100644
--- a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
+++ b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-unroll=8 -small-loop-cost=20 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -12,10 +12,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK-LABEL: @inc(
 ;CHECK: load i32*
 ;CHECK: load i32*
+;CHECK: load i32*
+;CHECK: load i32*
+;CHECK-NOT: load i32*
+;CHECK: add nsw i32
 ;CHECK: add nsw i32
 ;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK-NOT: add nsw i32
+;CHECK: store i32
+;CHECK: store i32
 ;CHECK: store i32
 ;CHECK: store i32
+;CHECK-NOT: store i32
+;CHECK: add i64 %{{.*}}, 4
 ;CHECK: ret void
 define void @inc(i32 %n) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %n, 0