[LV] Optimize for size when vectorizing loops with tiny trip count

It may be detrimental to vectorize loops with a very small trip count, as various costs of the vectorized loop body as well as enclosing overheads, including runtime tests and scalar iterations, may outweigh the gains of vectorizing. The current cost model measures the cost of the vectorized loop body only, expecting it will amortize other costs, and loops with known or expected very small trip counts are not vectorized at all. This patch allows loops with very small trip counts to be vectorized, but under OptForSize constraints, which ensure the cost of the loop body is dominant, with neither runtime guards nor scalar iterations. Patch inspired by D32451. Differential Revision: https://reviews.llvm.org/D34373 llvm-svn: 306803
author: Ayal Zaks <ayal.zaks@intel.com> 2017-06-30 08:02:35 +0000
committer: Ayal Zaks <ayal.zaks@intel.com> 2017-06-30 08:02:35 +0000
commit: 8d26f0a602f8f21f99d38d12fd0f2fb21da1409f (patch)
tree: 3b7d574d9c900ee3620820778230ae72dce516de /llvm/test/Transforms
parent: cc78ea6985707d8be08be362d859aeb4044b39c2 (diff)
download: bcm5719-llvm-8d26f0a602f8f21f99d38d12fd0f2fb21da1409f.tar.gz
bcm5719-llvm-8d26f0a602f8f21f99d38d12fd0f2fb21da1409f.zip
2 files changed, 32 insertions, 5 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 8d139ac7e5a..46fd022af66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -3,10 +3,11 @@
 
 ; CHECK: LV: Loop hints: force=enabled
 ; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Loop hints: force=?
 ; No more loops in the module
 ; CHECK-NOT: LV: Loop hints: force=
-; CHECK: 2 loop-vectorize               - Number of loops analyzed for vectorization
-; CHECK: 1 loop-vectorize               - Number of loops vectorized
+; CHECK: 3 loop-vectorize               - Number of loops analyzed for vectorization
+; CHECK: 2 loop-vectorize               - Number of loops vectorized
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -71,3 +72,29 @@ for.end:
 
 !3 = !{!3}
 
+;
+; This loop will be vectorized even though its trip count is below the
+; threshold, because no scalar iterations are needed.
+;
+define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) { ; A[i] = B[i] + A[i] for i = 0..15
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; i, starting at 0
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 ; B[i]
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 ; A[i]
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 ; A[i] = B[i] + A[i]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16 ; leave the loop once 16 iterations have run
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 ; !4 is defined after the function as !{!4}
+
+for.end:
+  ret void
+}
+
+!4 = !{!4}
+
diff --git a/llvm/test/Transforms/LoopVectorize/small-loop.ll b/llvm/test/Transforms/LoopVectorize/small-loop.ll
index 9a5dc4aa1b7..378283b464b 100644
--- a/llvm/test/Transforms/LoopVectorize/small-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/small-loop.ll
@@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 @c = common global [2048 x i32] zeroinitializer, align 16
 
 ;CHECK-LABEL: @example1(
-;CHECK-NOT: load <4 x i32>
+;CHECK: load <4 x i32>
 ;CHECK: ret void
 define void @example1() nounwind uwtable ssp {
   br label %1
@@ -23,8 +23,8 @@ define void @example1() nounwind uwtable ssp {
   store i32 %6, i32* %7, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count.
-  br i1 %exitcond, label %8, label %1
+  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count
+  br i1 %exitcond, label %8, label %1      ;           w/o scalar iteration overhead.
 
 ; <label>:8                                       ; preds = %1
   ret void
author	Ayal Zaks <ayal.zaks@intel.com>	2017-06-30 08:02:35 +0000
committer	Ayal Zaks <ayal.zaks@intel.com>	2017-06-30 08:02:35 +0000
commit	8d26f0a602f8f21f99d38d12fd0f2fb21da1409f (patch)
tree	3b7d574d9c900ee3620820778230ae72dce516de /llvm/test/Transforms
parent	cc78ea6985707d8be08be362d859aeb4044b39c2 (diff)
download	bcm5719-llvm-8d26f0a602f8f21f99d38d12fd0f2fb21da1409f.tar.gz bcm5719-llvm-8d26f0a602f8f21f99d38d12fd0f2fb21da1409f.zip