path: root/llvm/test/Transforms/LoopVectorize
author    James Molloy <james.molloy@arm.com>    2015-10-12 12:34:45 +0000
committer James Molloy <james.molloy@arm.com>    2015-10-12 12:34:45 +0000
commit    55d633bd602358c68ada16c9197ae444d363507d (patch)
tree      d4bdbca6ec273df345d17af5458f066e94a2fe98 /llvm/test/Transforms/LoopVectorize
parent    fb677dfd73c02bce99fabbdff2fab1e98acb32c0 (diff)
[LoopVectorize] Shrink integer operations into the smallest type possible
C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int type (e.g. i32) whenever arithmetic is performed on them. For targets with native i8 or i16 operations, InstCombine can usually shrink the arithmetic type down again. However, InstCombine refuses to create illegal types, so for targets without i8 or i16 registers the lengthening and shrinking remain. Most SIMD ISAs (e.g. NEON), however, support vectors of i8 or i16 even when their scalar equivalents do not, so during vectorization it is important to remove these extensions and truncations when deciding the profitability of vectorization.

The algorithm starts at truncs and icmps, trawling their use-def chains until they terminate, reach instructions outside the loop, or reach unsafe instructions such as inttoptr casts. If the use-def chains starting from different root instructions (truncs/icmps) meet, they are unioned. The demanded bits of each node in the graph are ORed together to form an overall mask of the demanded bits in the entire graph. The minimum bitwidth that the graph can be truncated to is the original bitwidth minus the number of leading zeroes in that mask.

The intention is that this algorithm should "first do no harm", so it never inserts extra cast instructions. This is why the use-def graphs are unioned: subgraphs with different minimum bitwidths do not need casts inserted between them.

The algorithm also works hard to limit compile-time impact. DemandedBits is only queried if there are extends of illegal types and a truncate to an illegal type is seen. In the general case this results in a simple linear scan of the instructions in the loop. No above-noise compile-time impact was seen on a clang bootstrap build.

llvm-svn: 250032
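For illustration, a hypothetical C-level loop of the shape that produces the IR in @add_a below; the signature and variable names are assumptions of this sketch, not part of the commit:

    // Hypothetical source for @add_a: the i8 operands are promoted to int
    // for the add, and the result is truncated back to i8 for the store.
    // With this patch the vectorizer can operate directly on <16 x i8>
    // instead of widening to <16 x i32>.
    void add_a(const unsigned char *p, unsigned char *q, int len) {
      for (int i = 0; i < len; ++i)
        q[i] = (unsigned char)(p[i] + 2); // promote, add, truncate
    }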
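And a minimal C++ sketch of the final bitwidth computation described above, assuming the demanded-bits masks of every node in the unioned graph have already been ORed together; the helper name and the rounding to a power of two (so the result maps onto a real element type) are assumptions of this sketch, not the in-tree implementation:

    #include <bit>
    #include <cstdint>

    // Given the OR of the demanded-bits masks of every node in a unioned
    // use-def graph, compute the smallest width the graph can shrink to:
    // the original width minus the leading zeroes of the combined mask.
    unsigned minimumBitwidth(uint32_t CombinedMask, unsigned OrigWidth) {
      // Leading zeroes counted within the original width of the value.
      unsigned LeadingZeros =
          std::countl_zero(CombinedMask) - (32 - OrigWidth);
      unsigned MinBW = OrigWidth - LeadingZeros; // e.g. mask 0xFF -> 8 bits
      return std::bit_ceil(MinBW ? MinBW : 1u);  // assumed rounding step
    }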
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll | 243
1 file changed, 243 insertions(+), 0 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
new file mode 100644
index 00000000000..f5b6a643c07
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,243 @@
+; RUN: opt -S < %s -basicaa -loop-vectorize -simplifycfg -instsimplify -instcombine -licm -force-vector-interleave=1 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i8
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv1, i8* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp9 = icmp sgt i32 %len, 0
+ br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv8 = zext i16 %0 to i32
+ %add = add nuw nsw i32 %conv8, 2
+ %conv1 = trunc i32 %add to i16
+ %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+ store i16 %conv1, i16* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add nuw nsw <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i16
+ %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+ store i16 %conv1, i16* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_d(
+; CHECK: load <4 x i16>
+; CHECK: add nsw <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp7 = icmp sgt i32 %len, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv = sext i16 %0 to i32
+ %add = add nsw i32 %conv, 2
+ %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_e(
+; CHECK: load <16 x i8>
+; CHECK: shl <16 x i8>
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul nuw nsw <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+ %cmp.32 = icmp sgt i32 %len, 0
+ br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv11 = zext i8 %arg2 to i32
+ %conv13 = zext i8 %arg1 to i32
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = shl i32 %conv, 4
+ %conv2 = add nuw nsw i32 %add, 32
+ %or = or i32 %conv, 51
+ %mul = mul nuw nsw i32 %or, 60
+ %and = and i32 %conv2, %conv13
+ %mul.masked = and i32 %mul, 252
+ %conv17 = xor i32 %mul.masked, %conv11
+ %mul18 = mul nuw nsw i32 %conv17, %and
+ %conv19 = trunc i32 %mul18 to i8
+ %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv19, i8* %arrayidx21
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_f
+; CHECK: load <8 x i16>
+; CHECK: trunc <8 x i16>
+; CHECK: shl <8 x i8>
+; CHECK: add nsw <8 x i8>
+; CHECK: or <8 x i8>
+; CHECK: mul nuw nsw <8 x i8>
+; CHECK: and <8 x i8>
+; CHECK: xor <8 x i8>
+; CHECK: mul nuw nsw <8 x i8>
+; CHECK: store <8 x i8>
+define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+ %cmp.32 = icmp sgt i32 %len, 0
+ br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv11 = zext i8 %arg2 to i32
+ %conv13 = zext i8 %arg1 to i32
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv = sext i16 %0 to i32
+ %add = shl i32 %conv, 4
+ %conv2 = add nsw i32 %add, 32
+ %or = and i32 %conv, 204
+ %conv8 = or i32 %or, 51
+ %mul = mul nuw nsw i32 %conv8, 60
+ %and = and i32 %conv2, %conv13
+ %mul.masked = and i32 %mul, 252
+ %conv17 = xor i32 %mul.masked, %conv11
+ %mul18 = mul nuw nsw i32 %conv17, %and
+ %conv19 = trunc i32 %mul18 to i8
+ %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv19, i8* %arrayidx21
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_g
+; CHECK: load <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: icmp ult <16 x i8>
+; CHECK: select <16 x i1> {{.*}}, <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 {
+ %1 = icmp sgt i32 %len, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i8 %arg1 to i64
+ br label %3
+
+._crit_edge: ; preds = %3, %0
+ ret void
+
+; <label>:3 ; preds = %3, %.lr.ph
+ %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ]
+ %x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %x5 = load i8, i8* %x4
+ %x7 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ %x8 = load i8, i8* %x7
+ %x9 = zext i8 %x5 to i32
+ %x10 = xor i32 %x9, 255
+ %x11 = icmp ult i32 %x10, 24
+ %x12 = select i1 %x11, i32 %x10, i32 24
+ %x13 = trunc i32 %x12 to i8
+ store i8 %x13, i8* %x4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %._crit_edge, label %3
+}
+
+attributes #0 = { nounwind }