diff options
| -rw-r--r-- | llvm/include/llvm/Support/MathExtras.h | 7 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/unroll_novec.ll | 12 |
3 files changed, 24 insertions, 4 deletions
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 13c2f72e1d9..30a1ad45844 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -552,6 +552,13 @@ inline uint64_t NextPowerOf2(uint64_t A) {
   return A + 1;
 }
 
+/// Returns the power of two which is less than or equal to the given value.
+/// Essentially, it is a floor operation across the domain of powers of two.
+inline uint64_t PowerOf2Floor(uint64_t A) {
+  if (!A) return 0;
+  return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
+}
+
 /// Returns the next integer (mod 2**64) that is greater than or equal to
 /// \p Value and is a multiple of \p Align. \p Align must be non-zero.
 ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 23dc8f39eea..5c9933a2c72 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5004,8 +5004,11 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // registers. These registers are used by all of the unrolled instances.
   // Next, divide the remaining registers by the number of registers that is
   // required by the loop, in order to estimate how many parallel instances
-  // fit without causing spills.
-  unsigned UF = (TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+  // fit without causing spills. All of this is rounded down if necessary to be
+  // a power of two. We want power of two unroll factors to simplify any
+  // addressing operations or alignment considerations.
+  unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+                              R.MaxLocalUsers);
 
   // Clamp the unroll factor ranges to reasonable factors.
   unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
@@ -5045,7 +5048,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
   if (LoopCost < SmallLoopCost) {
     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
+    unsigned NewUF = PowerOf2Floor(SmallLoopCost / LoopCost);
     return std::min(NewUF, UF);
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
index 33f128da905..be5bbb68e65 100644
--- a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
+++ b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-unroll=8 -small-loop-cost=20 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -12,10 +12,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK-LABEL: @inc(
 ;CHECK: load i32*
 ;CHECK: load i32*
+;CHECK: load i32*
+;CHECK: load i32*
+;CHECK-NOT: load i32*
+;CHECK: add nsw i32
 ;CHECK: add nsw i32
 ;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK-NOT: add nsw i32
+;CHECK: store i32
+;CHECK: store i32
 ;CHECK: store i32
 ;CHECK: store i32
+;CHECK-NOT: store i32
+;CHECK: add i64 %{{.*}}, 4
 ;CHECK: ret void
 define void @inc(i32 %n) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %n, 0

