author    Matthew Simpson <mssimpso@codeaurora.org>  2016-04-27 18:21:36 +0000
committer Matthew Simpson <mssimpso@codeaurora.org>  2016-04-27 18:21:36 +0000
commit    622b95be7b0b49e6e428cff3bc7759bc544994aa (patch)
tree      1c53075e5328b3bc17f91aa0d720bab2bb06b874 /llvm/test/Transforms
parent    ccd318dc7ec4365ea03a68d7fe13929ea0b1b3c8 (diff)
[LV] Reallow positive-stride interleaved load groups with gaps
We previously disallowed interleaved load groups that may cause us to
speculatively access memory out-of-bounds (r261331). We did this by ensuring
each load group had an access corresponding to the first and last member.
Instead of bailing out for these interleaved groups, this patch enables us to
peel off the last vector iteration, ensuring that we execute at least one
iteration of the scalar remainder loop. This solution was proposed in the
review of the previous patch.

Differential Revision: http://reviews.llvm.org/D19487

llvm-svn: 267751
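
To illustrate the pattern (a sketch for exposition, not part of the patch):

    // Sketch of the loop shape this patch re-enables; it mirrors the
    // even_load tests in the diff below. The scalar loop reads only the
    // even elements A[0], A[2], ..., A[1022]. With VF = 4, the vectorizer
    // turns four iterations into one <8 x i32> wide load of A[i..i+7]
    // plus a shufflevector keeping the even lanes, so the final wide load
    // would also touch A[1023] -- an element the scalar loop never
    // accesses and which may not be allocated. Peeling the last vector
    // iteration into the scalar remainder loop avoids that read.
    void even_load(int *A, int *B) {
      for (unsigned i = 0; i < 1024; i += 2)
        B[i / 2] = A[i] * 2; // odd elements of A form the "gap"
    }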
Diffstat (limited to 'llvm/test/Transforms')
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 105
1 file changed, 99 insertions(+), 6 deletions(-)
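
The new CHECK lines below verify how the vector trip count is adjusted. A
minimal self-checking sketch of that computation (names are ours; LLVM emits
it in IR as the %n.mod.vf / %n.vec sequence, with VF = 4 in these tests):

    #include <assert.h>

    /* Shrink the vector trip count so that between 1 and VF scalar
     * iterations always remain for the scalar remainder loop. */
    static unsigned long vector_trip_count(unsigned long n, unsigned long vf) {
      unsigned long rem = n % vf;  /* %n.mod.vf ("and i64 %N, 3" for VF = 4) */
      if (rem == 0)                /* n divides evenly: peel a whole vector */
        rem = vf;                  /* iteration so the scalar loop still runs */
      return n - rem;              /* %n.vec */
    }

    int main(void) {
      /* Static case: 1024 elements stepped by 2 is 512 iterations, and
       * 512 - 4 = 508 matches "icmp eq i64 %index.next, 508" below. */
      assert(vector_trip_count(512, 4) == 508);
      assert(vector_trip_count(510, 4) == 508); /* 2 iterations left over */
      return 0;
    }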
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 54ce3e29293..1cce7931936 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -284,18 +284,24 @@ for.body: ; preds = %for.body, %entry
}
; Check vectorization on an interleaved load group of factor 2 with 1 gap
-; (missing the load of odd elements).
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop. The loop executes 1024 / 2 = 512 iterations,
+; so with VF = 4 the last vector iteration is peeled off, leaving a vector
+; trip count of 512 - 4 = 508.
-; void even_load(int *A, int *B) {
+; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }
-; CHECK-LABEL: @even_load(
-; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
-; CHECK-NOT: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-LABEL: @even_load_static_tc(
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, 508
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
-define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
br label %for.body
@@ -315,6 +321,93 @@ for.body: ; preds = %for.body, %entry
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop. With a dynamic trip count, the vector trip
+; count is reduced by N mod VF, or by a full VF when N mod VF is zero, so
+; that at least one scalar iteration always remains.
+
+; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
+;   for (unsigned i = 0; i < N; i+=2)
+;     B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_dynamic_tc(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %tmp1 = lshr exact i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 with 1
+; gap and a reverse interleaved store group of factor 2. The interleaved load
+; group should be removed since it has a gap and is reverse.
+
+; struct pair {
+;   long x;
+;   long y;
+; };
+;
+; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
+;   for (int i = 1023; i >= 0; i--) {
+;     long a = X + i;
+;     long b = P2[i].y - i;
+;     P1[i].x = a;
+;     P2[i].y = b;
+;   }
+; }
+
+; CHECK-LABEL: @load_gap_reverse(
+; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
+; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+%pair = type { i64, i64 }
+define void @load_gap_reverse(%pair* noalias nocapture %P1, %pair* noalias nocapture %P2, i64 %X) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
+ %0 = add nsw i64 %X, %i
+ %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
+ %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
+ %3 = load i64, i64* %2, align 8
+ %4 = sub nsw i64 %3, %i
+ store i64 %0, i64* %1, align 8
+ store i64 %4, i64* %2, align 8
+ %i.next = add nsw i64 %i, -1
+ %cond = icmp sgt i64 %i, 0
+ br i1 %cond, label %for.body, label %for.exit
+
+for.exit:
+ ret void
+}
+
; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load2_store2(int *A, int *B) {