author    Matthew Simpson <mssimpso@codeaurora.org>  2016-06-24 15:33:25 +0000
committer Matthew Simpson <mssimpso@codeaurora.org>  2016-06-24 15:33:25 +0000
commit    e794678404abc41ce3b22625da39c1379c1d7a1d (patch)
tree      09b28fc16111f5883eacf49399c5999ee3517175 /llvm/test/Transforms/LoopVectorize
parent    6c7a8abf5c78005bd329f389db8d36820bbee819 (diff)
[LV] Preserve order of dependences in interleaved accesses analysis
The interleaved access analysis currently assumes that the inserted run-time
pointer aliasing checks ensure the absence of dependences that would prevent
its instruction reordering. However, this is not the case. Issues can arise
from how code generation is performed for interleaved groups. For a load
group, all loads in the group are essentially moved to the location of the
first load in program order, and for a store group, all stores in the group
are moved to the location of the last store. For groups having members
involved in a dependence relation with any other instruction in the loop,
this reordering can violate the dependence.

This patch teaches the interleaved access analysis how to avoid breaking such
dependences, and should fix PR27626.

An assumption of the original analysis was that the accesses had been
collected in "program order". The analysis was then simplified by visiting
the accesses bottom-up. However, this ordering was never guaranteed for
anything other than single basic block loops. Thus, this patch also enforces
the desired ordering.

Reference: https://llvm.org/bugs/show_bug.cgi?id=27626

Differential Revision: http://reviews.llvm.org/D19984

llvm-svn: 273687
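To make the hazard concrete, the following is a minimal C sketch (an illustration for this write-up, not part of the commit; the struct and function names are hypothetical). It mirrors the PR27626_0 test added below and assumes the code-generation behavior described above, namely that every store of an interleaved store group is emitted at the position of the group's last store:

/* Illustrative sketch only -- not part of this commit. */
struct pair { int x, y; };

/* Source loop: p[i].y must receive the freshly stored value z. */
void source_loop(struct pair *p, int z, int n) {
  for (int i = 0; i < n; i++) {
    p[i].x = z;
    p[i].y = p[i].x;   /* reads the value just stored, i.e. z */
  }
}

/* What grouping both stores (and so sinking the store to p[i].x down to
 * the position of the store to p[i].y) would effectively execute.  The
 * load of p[i].x now runs before the store of z, so p[i].y receives the
 * stale contents of p[i].x instead of z. */
void after_illegal_reordering(struct pair *p, int z, int n) {
  for (int i = 0; i < n; i++) {
    int stale = p[i].x;  /* load now precedes the dependent store */
    p[i].x = z;
    p[i].y = stale;
  }
}

The tests added below check that, rather than performing this reordering, the vectorizer scalarizes the dependent stores and leaves gaps in the corresponding load groups.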
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 305
1 file changed, 305 insertions(+), 0 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 1cce7931936..868c3a2cdab 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -555,4 +555,309 @@ for.body: ; preds = %for.body, %entry
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. Stores should be scalarized because the required code motion would
+; break dependences, and the remaining interleaved load groups should have
+; gaps.
+
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+; distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+;   for (int i = 0; i < n; i++) {
+;     p[i].x = z;
+;     p[i].y = p[i - 1].x;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_minus_1 = add nuw nsw i64 %i, -1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i_minus_1.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+;     p[i + 1].y = p[i].x;
+;     s += p[i].y;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add nuw nsw i64 %i, 1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i_plus_1.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+; presence of a write-after-write dependence. We create a group for
+; (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+;   for (int i = 0; i < n; i += 2) {
+;     a[i] = x;      // (1)
+;     a[i] = y;      // (2)
+;     a[i + 1] = z;  // (3)
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add i64 %i, 1
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+ store i32 %x, i32* %a_i, align 4
+ store i32 %y, i32* %a_i, align 4
+ store i32 %z, i32* %a_i_plus_1, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores in
+; the presence of a write-after-write dependence.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+;   for (int i = 3; i < n; i += 2) {
+;     a[i - 1] = x;
+;     a[i - 3] = y;
+;     a[i] = z;
+;   }
+; }
+
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+ %i_minus_1 = sub i64 %i, 1
+ %i_minus_3 = sub i64 %i_minus_1, 2
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+ %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+ store i32 %x, i32* %a_i_minus_1, align 4
+ store i32 %y, i32* %a_i_minus_3, align 4
+ store i32 %z, i32* %a_i, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
attributes #0 = { "unsafe-fp-math"="true" }