diff options
author | Matthew Simpson <mssimpso@codeaurora.org> | 2016-07-06 14:26:59 +0000 |
---|---|---|
committer | Matthew Simpson <mssimpso@codeaurora.org> | 2016-07-06 14:26:59 +0000 |
commit | 433cb1dfe31a85e5e39743032a18c96bf12ce955 (patch) | |
tree | fc9919257eaf7d0e5944e0c4057d99727de3c1dd /llvm/test/Transforms | |
parent | ad0a56f3da287000ba0b64642db99b17186c3a5b (diff) | |
download | bcm5719-llvm-433cb1dfe31a85e5e39743032a18c96bf12ce955.tar.gz bcm5719-llvm-433cb1dfe31a85e5e39743032a18c96bf12ce955.zip |
[LV] Don't widen trivial induction variables
We currently always vectorize induction variables. However, if an induction
variable is only used for counting loop iterations or computing addresses with
getelementptr instructions, we don't need to do this. Vectorizing these trivial
induction variables can create vector code that is difficult to simplify later
on. This is especially true when the unroll factor is greater than one, and we
create vector arithmetic when computing step vectors. With this patch, we check
if an induction variable is only used for counting iterations or computing
addresses, and if so, scalarize the arithmetic when computing step vectors
instead. This allows for greater simplification.
This patch addresses the suboptimal pointer arithmetic sequence seen in
PR27881.
Reference: https://llvm.org/bugs/show_bug.cgi?id=27881
Differential Revision: http://reviews.llvm.org/D21620
llvm-svn: 274627
Diffstat (limited to 'llvm/test/Transforms')
4 files changed, 198 insertions, 16 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll index fb12e172f54..e73b6eacbe1 100644 --- a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll +++ b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll @@ -12,11 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK-LABEL: @foo ; CHECK: vector.body -; CHECK: %0 = phi -; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0 -; CHECK: %3 = bitcast double** %2 to <4 x i64>* -; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8 -; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer +; CHECK: %[[IV:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %[[v0:.+]] = getelementptr inbounds double*, double** %in, i64 %[[IV]] +; CHECK: %[[v1:.+]] = bitcast double** %[[v0]] to <4 x i64>* +; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %[[v1]], align 8 +; CHECK: icmp eq <4 x i64> %wide.load, zeroinitializer ; CHECK: br i1 define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index b193a5b4a85..beee3978abb 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -66,6 +67,137 @@ loopexit: ret void } +; Make sure we don't create a vector induction phi node that is unused. +; Scalarize the step vectors instead. +; +; for (int i = 0; i < n; ++i) +; sum += a[i]; +; +; IND-LABEL: @scalarize_induction_variable_01( +; IND: vector.body: +; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; IND-NOT: add i64 {{.*}}, 2 +; IND: getelementptr inbounds i64, i64* %a, i64 %index +; +; UNROLL-LABEL: @scalarize_induction_variable_01( +; UNROLL: vector.body: +; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL-NOT: add i64 {{.*}}, 4 +; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index +; UNROLL: getelementptr i64, i64* %[[g1]], i64 2 + +define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %sum = phi i64 [ %2, %for.body ], [ 0, %entry ] + %0 = getelementptr inbounds i64, i64* %a, i64 %i + %1 = load i64, i64* %0, align 8 + %2 = add i64 %1, %sum + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i64 [ %2, %for.body ] + ret i64 %3 +} + +; Make sure we scalarize the step vectors used for the pointer arithmetic. We +; can't easily simplify vectorized step vectors. 
+; +; float s = 0; +; for (int i = 0; i < n; i += 8) +; s += (a[i] + b[i] + 1.0f); +; +; IND-LABEL: @scalarize_induction_variable_02( +; IND: vector.body: +; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; IND: %[[i0:.+]] = shl i64 %index, 3 +; IND: %[[i1:.+]] = or i64 %[[i0]], 8 +; IND: getelementptr inbounds float, float* %a, i64 %[[i0]] +; IND: getelementptr inbounds float, float* %a, i64 %[[i1]] +; +; UNROLL-LABEL: @scalarize_induction_variable_02( +; UNROLL: vector.body: +; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; UNROLL: %[[i0:.+]] = shl i64 %index, 3 +; UNROLL: %[[i1:.+]] = or i64 %[[i0]], 8 +; UNROLL: %[[i2:.+]] = or i64 %[[i0]], 16 +; UNROLL: %[[i3:.+]] = or i64 %[[i0]], 24 +; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i0]] +; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i1]] +; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i2]] +; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i3]] + +define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %s = phi float [ 0.0, %entry ], [ %6, %for.body ] + %0 = getelementptr inbounds float, float* %a, i64 %i + %1 = load float, float* %0, align 4 + %2 = getelementptr inbounds float, float* %b, i64 %i + %3 = load float, float* %2, align 4 + %4 = fadd fast float %s, 1.0 + %5 = fadd fast float %4, %1 + %6 = fadd fast float %5, %3 + %i.next = add nuw nsw i64 %i, 8 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %s.lcssa = phi float [ %6, %for.body ] + ret float %s.lcssa +} + +; Make sure we scalarize the step vectors used for the pointer arithmetic. We +; can't easily simplify vectorized step vectors. (Interleaved accesses.) 
+; +; for (int i = 0; i < n; ++i) +; a[i].f ^= y; +; +; INTERLEAVE-LABEL: @scalarize_induction_variable_03( +; INTERLEAVE: vector.body: +; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1 +; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2 +; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3 +; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4 +; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5 +; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6 +; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1 +; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1 + +%pair = type { i32, i32 } +define void @scalarize_induction_variable_03(%pair *%p, i32 %y, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %f = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 + %0 = load i32, i32* %f, align 8 + %1 = xor i32 %0, %y + store i32 %1, i32* %f, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} ; Make sure that the loop exit count computation does not overflow for i8 and ; i16. The exit count of these loops is i8/i16 max + 1. 
If we don't cast the @@ -114,9 +246,11 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; CHECK-LABEL: max_i32_backedgetaken ; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: middle.block: +; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: -; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ] +; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index d1a7f4c92d7..4e345f3517b 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -22,8 +22,8 @@ for.end: ; CHECK-LABEL: @preinc ; CHECK-LABEL: middle.block: -; CHECK: %3 = sub i32 %n.vec, 1 -; CHECK: %ind.escape = add i32 0, %3 +; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1 +; CHECK: %ind.escape = add i32 0, %[[v3]] ; CHECK-LABEL: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ] ; CHECK-LABEL: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll index c19e438bc71..7eb35100c75 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll @@ -5,9 +5,24 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Make sure consecutive vector generates correct negative indices. 
; PR15882 -; CHECK-LABEL: @reverse_induction_i64( -; CHECK: %step.add = add <4 x i64> %vec.ind, <i64 -4, i64 -4, i64 -4, i64 -4> -; CHECK: %step.add2 = add <4 x i64> %step.add, <i64 -4, i64 -4, i64 -4, i64 -4> +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i64 %startval, %index +; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) { entry: @@ -30,8 +45,25 @@ loopend: } ; CHECK-LABEL: @reverse_induction_i128( -; CHECK: %step.add = add <4 x i128> %vec.ind, <i128 -4, i128 -4, i128 -4, i128 -4> -; CHECK: %step.add2 = add <4 x i128> %step.add, <i128 -4, i128 -4, i128 -4, i128 -4> +; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i128 %startval, %index +; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1 +; 
CHECK: %[[a2:.+]] = add i128 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3 + define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) { entry: br label %for.body @@ -53,8 +85,24 @@ loopend: } ; CHECK-LABEL: @reverse_induction_i16( -; CHECK: add <4 x i16> %[[SPLAT:.*]], <i16 0, i16 -1, i16 -2, i16 -3> -; CHECK: add <4 x i16> %[[SPLAT]], <i16 -4, i16 -5, i16 -6, i16 -7> +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i16 %startval, {{.*}} +; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i16> 
%[[v5]], i16 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) { entry: |