| author | Matthew Simpson <mssimpso@codeaurora.org> | 2016-01-15 18:51:51 +0000 |
|---|---|---|
| committer | Matthew Simpson <mssimpso@codeaurora.org> | 2016-01-15 18:51:51 +0000 |
| commit | 57fe1b10db94080f7acd932578b103ac793eb28a (patch) | |
| tree | 947a6fe51c42bd93a244ddbed336aa42dc486315 /llvm/test | |
| parent | b4adf55e0f050a494cc0c9a98b0bd46e8e30070a (diff) | |
| download | bcm5719-llvm-57fe1b10db94080f7acd932578b103ac793eb28a.tar.gz, bcm5719-llvm-57fe1b10db94080f7acd932578b103ac793eb28a.zip | |
Reapply r257800 with fix
The fix uniques the bundle of getelementptr indices we are about to vectorize
since it's possible for the same index to be used by multiple instructions.
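For illustration only, here is a minimal sketch of what such uniquing could look like in LLVM's C++ style; the helper name `uniqueBundle` is hypothetical and is not the function added by the patch.

```cpp
// Hypothetical sketch: drop duplicate values from a candidate bundle while
// preserving the order of first occurrence. The patch performs an equivalent
// uniquing before vectorizing getelementptr index computations.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static SmallVector<Value *, 8> uniqueBundle(ArrayRef<Value *> Bundle) {
  SmallPtrSet<Value *, 8> Seen;
  SmallVector<Value *, 8> Unique;
  for (Value *V : Bundle)
    if (Seen.insert(V).second) // insert().second is true only on first occurrence
      Unique.push_back(V);
  return Unique;
}
```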
The original commit message is below.
[SLP] Vectorize the index computations of getelementptr instructions.
This patch seeds the SLP vectorizer with getelementptr indices. The primary
motivation in doing so is to vectorize gather-like idioms beginning with
consecutive loads (e.g., g[a[0] - b[0]] + g[a[1] - b[1]] + ...). While these
cases could be vectorized with a top-down phase, seeding the existing bottom-up
phase with the index computations avoids the complexity, compile-time, and
phase ordering issues associated with a full top-down pass. Only bundles of
single-index getelementptrs with non-constant differences are considered for
vectorization.
llvm-svn: 257918
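As a rough illustration of the seeding described above (a sketch under stated assumptions, not the committed implementation), the code below collects the index operands of single-index getelementptrs with non-constant indices from a basic block; bundles whose members are known to differ only by a constant would additionally be filtered out, as the getelementptr.ll test below exercises. The function name `collectGEPIndexSeeds` is illustrative.

```cpp
// Hypothetical sketch of gathering seed bundles for the bottom-up SLP phase:
// only single-index GEPs with a non-constant index contribute a seed value.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static SmallVector<Value *, 8> collectGEPIndexSeeds(BasicBlock &BB) {
  SmallVector<Value *, 8> Seeds;
  for (Instruction &I : BB) {
    auto *GEP = dyn_cast<GetElementPtrInst>(&I);
    if (!GEP || GEP->getNumIndices() != 1)
      continue;                       // only single-index GEPs are considered
    Value *Idx = GEP->idx_begin()->get();
    if (!isa<ConstantInt>(Idx))       // constant indices are not useful seeds
      Seeds.push_back(Idx);
  }
  return Seeds;
}
```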
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll | 258 |
| -rw-r--r-- | llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll | 111 |
2 files changed, 369 insertions, 0 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
new file mode 100644
index 00000000000..59ceba1717a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -0,0 +1,258 @@
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
+;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
+;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
+;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @gather_reduce_8x16_i32
+;
+; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; CHECK: zext <8 x i16> [[L]] to <8 x i32>
+; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i32
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i32
+  %sub8 = sub nsw i32 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i32
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i32
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i32
+  %sub26 = sub nsw i32 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i32
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i32
+  %sub35 = sub nsw i32 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i32
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i32
+  %sub44 = sub nsw i32 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i32
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i32
+  %sub53 = sub nsw i32 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i32
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i32
+  %sub62 = sub nsw i32 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @gather_reduce_8x16_i64
+;
+; CHECK-NOT: load <8 x i16>
+;
+; FIXME: We are currently unable to vectorize the case with i64 subtraction
+; because the zero extensions are too expensive. The solution here is to
+; convert the i64 subtractions to i32 subtractions during vectorization.
+; This would then match the case above.
+;
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i64
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i64
+  %sub = sub nsw i64 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i64
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i64
+  %sub8 = sub nsw i64 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i64
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i64
+  %sub17 = sub nsw i64 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i64
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i64
+  %sub26 = sub nsw i64 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i64
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i64
+  %sub35 = sub nsw i64 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i64
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i64
+  %sub44 = sub nsw i64 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i64
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i64
+  %sub53 = sub nsw i64 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i64
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i64
+  %sub62 = sub nsw i64 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
new file mode 100644
index 00000000000..e9b71963530
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -0,0 +1,111 @@
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we remove from consideration pairs of seed
+; getelementptrs when they are known to have a constant difference. Such pairs
+; are likely not good candidates for vectorization since one can be computed
+; from the other. We use an unprofitable threshold to force vectorization.
+;
+; int getelementptr(int *g, int n, int w, int x, int y, int z) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[2*i + w]; sum += g[2*i + x];
+;     sum += g[2*i + y]; sum += g[2*i + z];
+;   }
+;   return sum;
+; }
+;
+
+; CHECK-LABEL: @getelementptr_4x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, %x
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @getelementptr_2x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
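The getelementptr.ll test above exercises the rejection of seed pairs with a known constant difference. Below is a hedged sketch of one way such a check could be phrased using ScalarEvolution; the helper name and the use of SCEV here are illustrative assumptions, not the committed code.

```cpp
// Hypothetical check: two single-index GEP indices whose difference folds to
// a constant make a poor seed pair, since one index is computable from the
// other with cheap scalar arithmetic.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static bool haveConstantDifference(ScalarEvolution &SE, Value *A, Value *B) {
  const SCEV *Diff = SE.getMinusSCEV(SE.getSCEV(A), SE.getSCEV(B));
  return isa<SCEVConstant>(Diff);
}
```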