diff options
author | Diego Caballero <diego.caballero@intel.com> | 2018-04-24 17:04:17 +0000 |
---|---|---|
committer | Diego Caballero <diego.caballero@intel.com> | 2018-04-24 17:04:17 +0000 |
commit | 60f2776b2f68698ab342174c122c26bf3bf4f198 (patch) | |
tree | 73fd8e165324ad2104b55213c463ad9f2844f8ee /llvm/test/Transforms/LoopVectorize | |
parent | ceee7889472456e370f4e943ca587c03f2de16be (diff) | |
download | bcm5719-llvm-60f2776b2f68698ab342174c122c26bf3bf4f198.tar.gz bcm5719-llvm-60f2776b2f68698ab342174c122c26bf3bf4f198.zip |
[LV][VPlan] Detect outer loops for explicit vectorization.
Patch #2 from VPlan Outer Loop Vectorization Patch Series #1
(RFC: http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html).
This patch introduces the basic infrastructure to detect, legality-check,
and process outer loops annotated with hints for explicit vectorization.
All these changes are protected under the feature flag
-enable-vplan-native-path. This should make this patch NFC for the existing
inner loop vectorizer.
Reviewers: hfinkel, mkuper, rengolin, fhahn, aemerson, mssimpso.
Differential Revision: https://reviews.llvm.org/D42447
llvm-svn: 330739
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
3 files changed, 553 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll new file mode 100644 index 00000000000..0bbeb40ac15 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll @@ -0,0 +1,238 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that outer loops annotated only with the expected explicit +; vectorization hints are collected for vectorization instead of inner loops. + +; Root C/C++ source code for all the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) +; for (i = 0; i < N; i++) { +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1: Annotated outer loop WITH vector width information must be collected. + +; CHECK-LABEL: vector_width +; CHECK: LV: Loop hints: force=enabled width=4 unroll=0 +; CHECK: LV: We can vectorize this outer loop! +; CHECK: LV: Using user VF 4. +; CHECK-NOT: LV: Loop hints: force=? 
+; CHECK-NOT: LV: Found a loop: inner.body + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 2: Annotated outer loop WITHOUT vector width information doesn't have to +; be collected. + +; CHECK-LABEL: case2 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=? 
+; CHECK: LV: Found a loop: inner.body + +define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 3: Annotated outer loop WITH vector width and interleave information +; doesn't have to be collected. + +; CHECK-LABEL: case3 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=? 
+; CHECK: LV: Found a loop: inner.body + +define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 4: Outer loop without any explicit vectorization annotation doesn't have +; to be collected. + +; CHECK-LABEL: case4 +; CHECK-NOT: LV: Loop hints: force=enabled +; CHECK-NOT: LV: We can vectorize this outer loop! +; CHECK: LV: Loop hints: force=? 
+; CHECK: LV: Found a loop: inner.body + +define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp230 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count38 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ] + br i1 %cmp230, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv35, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38 + br i1 %exitcond39, label %for.end15, label %outer.body + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +; Case 1 +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 4} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} +; Case 2 +!9 = distinct !{!9, !8} +; Case 3 +!10 = 
!{!"llvm.loop.interleave.count", i32 2} +!11 = distinct !{!11, !7, !10, !8} diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll new file mode 100644 index 00000000000..d0ab58b30e3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll @@ -0,0 +1,177 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that LV bails out on explicit vectorization outer loops that contain +; divergent inner loops. + +; Root C/C++ source code for all the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) vectorize_width(8) +; for (i = 0; i < N; i++) { +; // Tested inner loop. It will be replaced per test. +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start. + +; CHECK-LABEL: iv_start +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp33 = icmp sgt i32 %N, 0 + br i1 %cmp33, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %M to i64 + %wide.trip.count41 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ] + %cmp231 = icmp slt i64 %indvars.iv38, %0 + br i1 %cmp231, label %inner.ph, label %outer.inc + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv38, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ] + %2 = add nsw i64 %indvars.iv35, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1 + %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41 + br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + + +; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound. + +; CHECK-LABEL: loop_ub +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp32 = icmp sgt i32 %N, 0 + br i1 %cmp32, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %0 = sext i32 %M to i64 + %wide.trip.count41 = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ] + %cmp230 = icmp eq i64 %indvars.iv38, 0 + br i1 %cmp230, label %outer.inc, label %inner.ph + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv38, %0 + br label %inner.body + +inner.body: ; preds = %inner.body, %inner.ph + %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ] + %2 = add nsw i64 %indvars.iv, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1 + %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41 + br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %outer.inc, %entry + ret void +} + +; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step. + +; CHECK-LABEL: iv_step +; CHECK: LV: Not vectorizing: Outer loop contains divergent loops. +; CHECK: LV: Not vectorizing: Unsupported outer loop. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp33 = icmp sgt i32 %N, 0 + br i1 %cmp33, label %outer.ph, label %for.end15 + +outer.ph: ; preds = %entry + %cmp231 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %N to i64 + br label %outer.body + +outer.body: ; preds = %for.inc14, %outer.ph + %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ] + br i1 %cmp231, label %inner.ph, label %for.inc14 + +inner.ph: ; preds = %outer.body + %1 = mul nsw i64 %indvars.iv39, %0 + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ] + %2 = add nsw i64 %indvars.iv36, %1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %mul8 = mul nsw i32 %3, %3 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2 + store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2 + %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39 + %cmp2 = icmp slt i64 %indvars.iv.next37, %0 + br i1 %cmp2, label %inner.body, label %for.inc14 + +for.inc14: ; preds = %inner.body, %outer.body + %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1 + %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count + br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6 + +for.end15: ; preds = %for.inc14, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 8} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git 
a/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll new file mode 100644 index 00000000000..e05e9dd813b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll @@ -0,0 +1,138 @@ +; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Verify that LV can handle explicit vectorization outer loops with uniform branches +; but bails out on outer loops with divergent branches. + +; Root C/C++ source code for the test cases +; void foo(int *a, int *b, int N, int M) +; { +; int i, j; +; #pragma clang loop vectorize(enable) vectorize_width(8) +; for (i = 0; i < N; i++) { +; // Tested conditional branch. COND will be replaced per test. +; if (COND) +; for (j = 0; j < M; j++) { +; a[i*M+j] = b[i*M+j] * b[i*M+j]; +; } +; } +; } + +; Case 1 (COND => M == N): Outer loop with uniform conditional branch. + +; CHECK-LABEL: uniform_branch +; CHECK: LV: We can vectorize this outer loop! 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp39 = icmp sgt i32 %N, 0 + br i1 %cmp39, label %outer.ph, label %for.end19 + +outer.ph: ; preds = %entry + %cmp337 = icmp slt i32 %M, 1 + %0 = sext i32 %M to i64 + %N64 = zext i32 %N to i64 + %M64 = zext i32 %M to i64 + %cmp1 = icmp ne i32 %M, %N ; Uniform condition + %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ] + %1 = mul nsw i64 %indvars.iv42, %0 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1 + %2 = load i32, i32* %arrayidx, align 4, !tbaa !2 + br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch + +inner.ph: ; preds = %outer.body + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ] + %3 = add nsw i64 %indvars.iv, %1 + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3 + %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2 + %mul12 = mul nsw i32 %4, %4 + %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3 + store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %M64 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64 + br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6 + +for.end19: ; preds = %outer.inc, %entry + ret void +} + + +; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch. + +; CHECK-LABEL: divergent_branch +; CHECK: Unsupported conditional branch. 
+; CHECK: LV: Not vectorizing: Unsupported outer loop. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr { +entry: + %cmp39 = icmp sgt i32 %N, 0 + br i1 %cmp39, label %outer.ph, label %for.end19 + +outer.ph: ; preds = %entry + %cmp337 = icmp slt i32 %M, 1 + %0 = sext i32 %M to i64 + %N64 = zext i32 %N to i64 + %M64 = zext i32 %M to i64 + br label %outer.body + +outer.body: ; preds = %outer.inc, %outer.ph + %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ] + %1 = mul nsw i64 %indvars.iv42, %0 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1 + %2 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %cmp1 = icmp ne i32 %2, 0 ; Divergent condition + %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition + br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch. + +inner.ph: ; preds = %outer.body + br label %inner.body + +inner.body: ; preds = %inner.ph, %inner.body + %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ] + %3 = add nsw i64 %indvars.iv, %1 + %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3 + %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2 + %mul12 = mul nsw i32 %4, %4 + %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3 + store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %M64 + br i1 %exitcond, label %outer.inc, label %inner.body + +outer.inc: ; preds = %inner.body, %outer.body + %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1 + %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64 + br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6 + +for.end19: ; preds = %outer.inc, %entry + ret void +} + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0"} +!2 = 
!{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.width", i32 8} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} |