summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/LoopVectorize/ARM
diff options
context:
space:
mode:
authorEric Christopher <echristo@gmail.com>2019-04-17 04:52:47 +0000
committerEric Christopher <echristo@gmail.com>2019-04-17 04:52:47 +0000
commitcee313d288a4faf0355d76fb6e0e927e211d08a5 (patch)
treed386075318d761197779a96e5d8fc0dc7b06342b /llvm/test/Transforms/LoopVectorize/ARM
parentc3d6a929fdd92fd06d4304675ade8d7210ee711a (diff)
downloadbcm5719-llvm-cee313d288a4faf0355d76fb6e0e927e211d08a5.tar.gz
bcm5719-llvm-cee313d288a4faf0355d76fb6e0e927e211d08a5.zip
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory. Will be re-reverting again. llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/ARM')
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll330
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll71
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll88
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll60
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll147
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg3
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll114
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll165
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll37
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll52
10 files changed, 1067 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
new file mode 100644
index 00000000000..369568f6dfa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,330 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; regarding IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
+; because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, and all optimizations
+; are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "sumf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "redf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %add = fadd float %Red.06, %mul
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX: Checking a loop in "fabs"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call float @fabsf(float %1) #1
+ %conv3 = fmul float %0, %fabsf
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %add = fadd fast float %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call fast float @fabsf(float %1) #2
+ %conv3 = fmul fast float %fabsf, %0
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
new file mode 100644
index 00000000000..7b09913636f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK-LABEL: @foo(
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT-LABEL: @foo(
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+ %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, %sum.01
+ %5 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %5, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+; Verify the register limit. On arm we don't have 16 allocatable registers.
+;SWIFTUNROLL-LABEL: @register_limit(
+;SWIFTUNROLL: load i32
+;SWIFTUNROLL-NOT: load i32
+define i32 @register_limit(i32* nocapture %A, i32 %n) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:
+ %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+ %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+ %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
+ %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
+ %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, %sum.01
+ %5 = add nsw i32 %i.02, 1
+ %6 = add nsw i32 %3, %sum.02
+ %7 = add nsw i32 %3, %sum.03
+ %8 = add nsw i32 %3, %sum.04
+ %9 = add nsw i32 %3, %sum.05
+ %10 = add nsw i32 %3, %sum.05
+ %exitcond = icmp eq i32 %5, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+ %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
+ %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
+ %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
+ %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
new file mode 100644
index 00000000000..6d1fa6f36a9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
@@ -0,0 +1,88 @@
+; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+@kernel = global [512 x float] zeroinitializer, align 4
+@kernel2 = global [512 x float] zeroinitializer, align 4
+@kernel3 = global [512 x float] zeroinitializer, align 4
+@kernel4 = global [512 x float] zeroinitializer, align 4
+@src_data = global [1536 x float] zeroinitializer, align 4
+@r_ = global i8 0, align 4
+@g_ = global i8 0, align 4
+@b_ = global i8 0, align 4
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: <2 x float>
+; CHECK-NOT: <4 x float>
+
+define void @_Z4testmm(i32 %size, i32 %offset) {
+entry:
+ %cmp53 = icmp eq i32 %size, 0
+ br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+ %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+ %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+ %add = add i32 %v.055, %offset
+ %mul = mul i32 %add, 3
+ %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
+ %1 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %1
+ %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
+ %2 = load float, float* %arrayidx4, align 4
+ %mul5 = fmul fast float %mul3, %2
+ %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
+ %3 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %mul5, %3
+ %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
+ %4 = load float, float* %arrayidx8, align 4
+ %mul9 = fmul fast float %mul7, %4
+ %add10 = fadd fast float %r.057, %mul9
+ %arrayidx.sum = add i32 %mul, 1
+ %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
+ %5 = load float, float* %arrayidx11, align 4
+ %mul13 = fmul fast float %1, %5
+ %mul15 = fmul fast float %2, %mul13
+ %mul17 = fmul fast float %3, %mul15
+ %mul19 = fmul fast float %4, %mul17
+ %add20 = fadd fast float %g.056, %mul19
+ %arrayidx.sum52 = add i32 %mul, 2
+ %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
+ %6 = load float, float* %arrayidx21, align 4
+ %mul23 = fmul fast float %1, %6
+ %mul25 = fmul fast float %2, %mul23
+ %mul27 = fmul fast float %3, %mul25
+ %mul29 = fmul fast float %4, %mul27
+ %add30 = fadd fast float %b.054, %mul29
+ %inc = add i32 %v.055, 1
+ %exitcond = icmp ne i32 %inc, %size
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ %add30.lcssa = phi float [ %add30, %for.body ]
+ %add20.lcssa = phi float [ %add20, %for.body ]
+ %add10.lcssa = phi float [ %add10, %for.body ]
+ %phitmp = fptoui float %add10.lcssa to i8
+ %phitmp60 = fptoui float %add20.lcssa to i8
+ %phitmp61 = fptoui float %add30.lcssa to i8
+ br label %for.end
+
+for.end:
+ %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ store i8 %r.0.lcssa, i8* @r_, align 4
+ store i8 %g.0.lcssa, i8* @g_, align 4
+ store i8 %b.0.lcssa, i8* @b_, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
new file mode 100644
index 00000000000..783156d7399
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 8;
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %3 = load i16, i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
new file mode 100644
index 00000000000..29adec049f6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -0,0 +1,147 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "armv8--linux-gnueabihf"
+
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+ %tmp2 = load i8, i8* %tmp0, align 1
+ %tmp3 = load i8, i8* %tmp1, align 1
+ store i8 0, i8* %tmp0, align 1
+ store i8 0, i8* %tmp1, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+ %tmp2 = load i16, i16* %tmp0, align 2
+ %tmp3 = load i16, i16* %tmp1, align 2
+ store i16 0, i16* %tmp0, align 2
+ store i16 0, i16* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp3 = load i32, i32* %tmp1, align 4
+ store i32 0, i32* %tmp0, align 4
+ store i32 0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%half.2 = type {half, half}
+define void @half_factor_2(%half.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
+ %tmp2 = load half, half* %tmp0, align 2
+ %tmp3 = load half, half* %tmp1, align 2
+ store half 0., half* %tmp0, align 2
+ store half 0., half* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
new file mode 100644
index 00000000000..98c6700c209
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'ARM' in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
new file mode 100644
index 00000000000..e88fcca1225
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
+; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
+
+; ModuleID = 'arm.ll'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%T216 = type <2 x i16>
+%T232 = type <2 x i32>
+%T264 = type <2 x i64>
+
+%T416 = type <4 x i16>
+%T432 = type <4 x i32>
+%T464 = type <4 x i64>
+
+define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'direct':
+ %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+ %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+ %r3 = mul %T432 %v0, %v1
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmul.i32
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'ups1632':
+ %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+ %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+ %r1 = sext %T416 %v0 to %T432
+ %r2 = sext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
+ %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.s16
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'upu1632':
+ %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+ %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+ %r1 = zext %T416 %v0 to %T432
+ %r2 = zext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
+ %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.u16
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'ups3264':
+ %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+ %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+ %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+ %st = sext %T232 %r3 to %T264
+; ASM: vmovl.s32
+; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
+ store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'upu3264':
+ %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+ %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+ %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+ %st = zext %T232 %r3 to %T264
+; ASM: vmovl.u32
+; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
+ store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
+; COST: function 'dn3216':
+ %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+ %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+ %r3 = mul %T432 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+ %st = trunc %T432 %r3 to %T416
+; ASM: vmovn.i32
+; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
+ store %T416 %st, %T416* %storeaddr
+; ASM: vstr
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
new file mode 100644
index 00000000000..a1cf4b318f3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
+
+; This test is reduced from SPECFP 2006 482.sphinx.
+; We expect vectorization with <2 x double> and <2 x float> ops.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@a = external global i32
+@v = external global i32
+@mm = external global float**
+@vv = external global float**
+@ll = external global float*
+
+define i32 @test(float* nocapture readonly %x) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T:%.*]] = load i32, i32* @v, align 8
+; CHECK-NEXT: [[T1:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: br label [[OUTERLOOP:%.*]]
+; CHECK: outerloop:
+; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
+; CHECK-NEXT: [[T3:%.*]] = load float**, float*** @mm, align 4
+; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
+; CHECK-NEXT: [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float**, float*** @vv, align 4
+; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
+; CHECK-NEXT: [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float*, float** @ll, align 4
+; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
+; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
+; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[INNERLOOP:%.*]]
+; CHECK: innerloop:
+; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
+; CHECK-NEXT: [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
+; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
+; CHECK-NEXT: [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
+; CHECK-NEXT: [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
+; CHECK-NEXT: [[CONV122:%.*]] = fpext float [[SUB121]] to double
+; CHECK-NEXT: [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
+; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
+; CHECK-NEXT: [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
+; CHECK-NEXT: [[CONV125:%.*]] = fpext float [[T11]] to double
+; CHECK-NEXT: [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
+; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
+; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
+; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2
+; CHECK: outerend:
+; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
+; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
+; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1
+; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
+; CHECK-NEXT: [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
+; CHECK-NEXT: [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
+; CHECK-NEXT: br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[CALL142]]
+;
+entry:
+ %t = load i32, i32* @v, align 8
+ %t1 = load i32, i32* @a, align 4
+ br label %outerloop
+
+outerloop:
+ %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
+ %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
+ %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
+ %t3 = load float**, float*** @mm, align 4
+ %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
+ %t4 = load float*, float** %arrayidx109, align 4
+ %t5 = load float**, float*** @vv, align 4
+ %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
+ %t6 = load float*, float** %arrayidx111, align 4
+ %t7 = load float*, float** @ll, align 4
+ %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
+ %t8 = load float, float* %arrayidx113, align 4
+ %conv114 = fpext float %t8 to double
+ br label %innerloop
+
+innerloop:
+ %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
+ %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
+ %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
+ %t9 = load float, float* %arrayidx119, align 4
+ %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
+ %t10 = load float, float* %arrayidx120, align 4
+ %sub121 = fsub fast float %t9, %t10
+ %conv122 = fpext float %sub121 to double
+ %mul123 = fmul fast double %conv122, %conv122
+ %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
+ %t11 = load float, float* %arrayidx124, align 4
+ %conv125 = fpext float %t11 to double
+ %mul126 = fmul fast double %mul123, %conv125
+ %sub127 = fsub fast double %dval1.4131, %mul126
+ %inc129 = add nuw nsw i32 %i.2132, 1
+ %exitcond143 = icmp eq i32 %inc129, %t
+ br i1 %exitcond143, label %outerend, label %innerloop
+
+outerend:
+ %sub127.lcssa = phi double [ %sub127, %innerloop ]
+ %conv138 = fptosi double %sub127.lcssa to i32
+ %call142 = add nuw nsw i32 %score.1135, %conv138
+ %inc144 = add nuw nsw i32 %j.0136, 1
+ %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
+ %v17 = load i32, i32* %arrayidx102, align 4
+ %cmp103 = icmp sgt i32 %v17, -1
+ br i1 %cmp103, label %outerloop, label %exit
+
+exit:
+ ret i32 %call142
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
new file mode 100644
index 00000000000..3be22d708da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabi"
+
+; This requires the loop vectorizer to create an interleaved access group
+; for the stores to the struct. Here we need to perform a bitcast from a vector
+; of pointers to a vector i32s.
+
+%class.A = type { i8*, i32 }
+
+; CHECK-LABEL: test0
+define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
+entry:
+ br label %for.body.i
+
+for.body.i:
+ %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
+ %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
+ store i8* null, i8** %Data.i.i, align 4, !tbaa !8
+ %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
+ store i32 0, i32* %Length.i.i, align 4, !tbaa !11
+ %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
+ %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
+ br i1 %cmp.i, label %exit, label %for.body.i
+
+exit:
+ ret void
+}
+
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"some struct", !5, i64 0, !10, i64 4}
+!10 = !{!"int", !6, i64 0}
+!11 = !{!9, !10, i64 4}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 00000000000..66d2556dfb8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F32
+;CHECK: <4 x float>
+;CHECK:ret
+define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+ %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %3 = load float, float* %2, align 8
+ %4 = fmul fast float %prod.01, %3
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+ ret float %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+ %3 = load i8, i8* %2, align 1
+ %4 = xor i8 %3, %red.01
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+ ret i8 %red.0.lcssa
+}
+
+
OpenPOWER on IntegriCloud