[X86] Enable interleaved memory access by default

This lets the loop vectorizer generate interleaved memory accesses on x86.

Differential Revision: https://reviews.llvm.org/D25350
llvm-svn: 284779
author: Michael Kuperstein <mkuper@google.com> 2016-10-20 21:04:31 +0000
committer: Michael Kuperstein <mkuper@google.com> 2016-10-20 21:04:31 +0000
commit: b2443ed62bcf393693eedb9e789e4198f4e460cd (patch)
tree: 19fa5386500e9b6a671e8c605e7dd621d736428d /llvm/test/Transforms/LoopVectorize
parent: 2b81f42a76a58a23c358f6d72b65385c0073f94f (diff)
download: bcm5719-llvm-b2443ed62bcf393693eedb9e789e4198f4e460cd.tar.gz
bcm5719-llvm-b2443ed62bcf393693eedb9e789e4198f4e460cd.zip
4 files changed, 46 insertions, 11 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 699dd5bf035..0ee2660ed31 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -67,7 +67,7 @@ for:
   %t2 = load float, float* %arrayidx3, align 4
   %add = fadd fast float %t1, %s.02
   %add4 = fadd fast float %add, %t2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
   %cmp1 = icmp slt i64 %indvars.iv.next, %t0
   br i1 %cmp1, label %for, label %loopexit
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 222dd7eef6b..2ce357540d0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -85,7 +85,7 @@ for.end:                                          ; preds = %for.cond
 ; The source code
 ;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
 ;
-;  for (int i=0; i<SIZE; ++i) {
+;  for (int i=0; i<SIZE; i += 16) {
 ;    if (trigger[i] > 0) {
 ;      out[i] = in[i].b + (float) 0.5;
 ;    }
@@ -95,9 +95,9 @@ for.end:                                          ; preds = %for.cond
 %struct.In = type { float, float }
 
 ;AVX512-LABEL: @foo2
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.scatter.v16f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@@ -147,7 +147,7 @@ if.end:                                           ; preds = %if.then, %for.body
 
 for.inc:                                          ; preds = %if.end
   %9 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %9, 1
+  %inc = add nsw i32 %9, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
 
@@ -162,7 +162,7 @@ for.end:                                          ; preds = %for.cond
 ;};
 ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
 ;
-;  for (int i=0; i<SIZE; ++i) {
+;  for (int i=0; i<SIZE; i += 16) {
 ;    if (trigger[i] > 0) {
 ;      out[i].b = in[i].b + (float) 0.5;
 ;    }
@@ -170,10 +170,10 @@ for.end:                                          ; preds = %for.cond
 ;}
 
 ;AVX512-LABEL: @foo3
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
 ;AVX512: fadd <16 x float>
-;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.scatter.v16f32
 ;AVX512: ret void
 
@@ -226,7 +226,7 @@ if.end:                                           ; preds = %if.then, %for.body
 
 for.inc:                                          ; preds = %if.end
   %9 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %9, 1
+  %inc = add nsw i32 %9, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
new file mode 100644
index 00000000000..de5db532438
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
+
+; NORMAL-LABEL: foo
+; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
+
+; ATOM-LABEL: foo
+; ATOM: load i32
+; ATOM: load i32
+; ATOM: store i32
+define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %0
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx3, align 4
+  %add4 = add nsw i32 %3, %1
+  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add4, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 1227344daff..0377ae1c24d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -341,7 +341,7 @@ for.end:                                          ; preds = %for.cond
 ;
 ;void foo4(double *A, double *B, int *trigger) {
 ;
-;  for (int i=0; i<10000; i++) {
+;  for (int i=0; i<10000; i += 16) {
 ;    if (trigger[i] < 100) {
 ;          A[i] = B[i*2] + trigger[i]; << non-cosecutive access
 ;    }
@@ -410,7 +410,7 @@ if.end:                                           ; preds = %if.then, %for.body
 
 for.inc:                                          ; preds = %if.end
   %12 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %12, 1
+  %inc = add nsw i32 %12, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
author	Michael Kuperstein <mkuper@google.com>	2016-10-20 21:04:31 +0000
committer	Michael Kuperstein <mkuper@google.com>	2016-10-20 21:04:31 +0000
commit	b2443ed62bcf393693eedb9e789e4198f4e460cd (patch)
tree	19fa5386500e9b6a671e8c605e7dd621d736428d /llvm/test/Transforms/LoopVectorize
parent	2b81f42a76a58a23c358f6d72b65385c0073f94f (diff)
download	bcm5719-llvm-b2443ed62bcf393693eedb9e789e4198f4e460cd.tar.gz bcm5719-llvm-b2443ed62bcf393693eedb9e789e4198f4e460cd.zip