Revert "r306473 - re-commit r306336: Enable vectorizer-maximize-bandwidth by default."

This still breaks PPC tests we have. I'll forward reproduction instructions to dehao.

llvm-svn: 306792
author: Daniel Jasper <djasper@google.com> 2017-06-30 06:32:21 +0000
committer: Daniel Jasper <djasper@google.com> 2017-06-30 06:32:21 +0000
commit: 5ce1ce742ed24d215eed7d7f93373d484477979c (patch)
tree: c0443f441348f075459079fb6793d8119656ba26 /llvm/test/Transforms/LoopVectorize/X86
parent: 37c8ee76116616d4964cba2f1aa3317914c83eb8 (diff)
download: bcm5719-llvm-5ce1ce742ed24d215eed7d7f93373d484477979c.tar.gz
bcm5719-llvm-5ce1ce742ed24d215eed7d7f93373d484477979c.zip
8 files changed, 40 insertions, 49 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
index f002dd8adec..c066afcfa63 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
@@ -9,9 +9,7 @@ target triple = "x86_64-apple-macosx"
 
 ; If we need to scalarize the fptoui and then use inserts to build up the
 ; vector again, then there is certainly no value in going 256-bit wide.
-; But as we default to maximize bandwidth, we should convert it to 256-bit
-; anyway.
-; CHECK: vpinsrd
+; CHECK-NOT: vpinsrd
 
 define void @convert() {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index 2e3dfa0a15f..c581f4bf2a6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -44,16 +44,17 @@ define void @example1() nounwind uwtable ssp {
   ret void
 }
 
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
 ;CHECK-LABEL: @example10b(
-;CHECK: load <8 x i16>
-;CHECK: sext <8 x i16>
-;CHECK: store <8 x i32>
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
 ;CHECK: ret void
 ;UNROLL-LABEL: @example10b(
-;UNROLL: load <8 x i16>
-;UNROLL: load <8 x i16>
-;UNROLL: store <8 x i32>
-;UNROLL: store <8 x i32>
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
 ;UNROLL: ret void
 define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
   br label %1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index e15c707587f..0377ae1c24d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -260,28 +260,20 @@ for.end:                                          ; preds = %for.cond
 ;  }
 ;}
 
-;AVX1-LABEL: @foo3
-;AVX1: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
-;AVX1: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
-;AVX1: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX1: fadd <4 x double>
-;AVX1: call void @llvm.masked.store.v4f64.p0v4f64
-;AVX1: ret void
-
-;AVX2-LABEL: @foo3
-;AVX2: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
-;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX2: sitofp <8 x i32> %wide.load to <8 x double>
-;AVX2: fadd <8 x double>
-;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
-;AVX2: ret void
+;AVX-LABEL: @foo3
+;AVX: icmp slt <4 x i32> %wide.load, <i32 100, i32 100,
+;AVX: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
+;AVX: sitofp <4 x i32> %wide.load to <4 x double>
+;AVX: fadd <4 x double>
+;AVX: call void @llvm.masked.store.v4f64.p0v4f64
+;AVX: ret void
 
 ;AVX512-LABEL: @foo3
-;AVX512: icmp slt <16 x i32> %wide.load, <i32 100, i32 100,
-;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
-;AVX512: sitofp <16 x i32> %wide.load to <16 x double>
-;AVX512: fadd <16 x double>
-;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
+;AVX512: icmp slt <8 x i32> %wide.load, <i32 100, i32 100,
+;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
+;AVX512: fadd <8 x double>
+;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
 ;AVX512: ret void
 
 
@@ -510,19 +502,19 @@ for.end:                                          ; preds = %for.cond
 ;  }
 ;}
 ;AVX2-LABEL: @foo6
-;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer
-;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
-;AVX2: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
-;AVX2: fadd <8 x double>
-;AVX2: call void @llvm.masked.store.v8f64.p0v8f64
+;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer
+;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64
+;AVX2: fadd <4 x double>
+;AVX2: call void @llvm.masked.store.v4f64.p0v4f64
 ;AVX2: ret void
 
 ;AVX512-LABEL: @foo6
-;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer
-;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32> <i32 15, i32 14, i32 13, i32 12
-;AVX512: call <16 x double> @llvm.masked.load.v16f64.p0v16f64
-;AVX512: fadd <16 x double>
-;AVX512: call void @llvm.masked.store.v16f64.p0v16f64
+;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer
+;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> <i32 7, i32 6, i32 5, i32 4
+;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64
+;AVX512: fadd <8 x double>
+;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
 ;AVX512: ret void
 
 
@@ -590,8 +582,8 @@ for.end:                                          ; preds = %for.cond
 ; }
 
 ;AVX512-LABEL: @foo7
-;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>*
-;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
+;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>*
+;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
 ;AVX512: ret void
 
 define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
@@ -662,8 +654,8 @@ for.end:                                          ; preds = %for.cond
 ;}
 
 ;AVX512-LABEL: @foo8
-;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* %
-;AVX512: call void @llvm.masked.store.v64f64.p0v64f64
+;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
+;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
 ;AVX512: ret void
 
 define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
index 04ecaeb4e4e..2efe928f0f4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
 ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index 569c50d4fd6..86b40dc613b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
 ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
-; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300)
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.10.0"
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index c9761c3b31d..6393002d507 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -7,7 +7,7 @@ target triple = "i386-apple-darwin"
 define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
 ; CHECK-LABEL: @test1(
 ; CHECK: preheader
-; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
 ; CHECK: vector.memcheck
 
 bb:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
index 65ff9b72e2d..1d51b9c4bea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
@@ -6,7 +6,7 @@
 ; DEBUG-OUTPUT-NOT: .loc
 ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
 
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
 ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
 ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 22cbc4a931e..c14a2cb91b6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -6,7 +6,7 @@
 ; DEBUG-OUTPUT-NOT: .loc
 ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
 
-; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1)
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
 ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
 ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
author	Daniel Jasper <djasper@google.com>	2017-06-30 06:32:21 +0000
committer	Daniel Jasper <djasper@google.com>	2017-06-30 06:32:21 +0000
commit	5ce1ce742ed24d215eed7d7f93373d484477979c (patch)
tree	c0443f441348f075459079fb6793d8119656ba26 /llvm/test/Transforms/LoopVectorize/X86
parent	37c8ee76116616d4964cba2f1aa3317914c83eb8 (diff)
download	bcm5719-llvm-5ce1ce742ed24d215eed7d7f93373d484477979c.tar.gz bcm5719-llvm-5ce1ce742ed24d215eed7d7f93373d484477979c.zip