Temporarily Revert "Add basic loop fusion pass."

As it's causing some bot failures (and per request from kbarton). This reverts commit r358543/ab70da07286e618016e78247e4a24fcb84077fda. llvm-svn: 358546
author: Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
committer: Eric Christopher <echristo@gmail.com> 2019-04-17 02:12:23 +0000
commit: a86343512845c9c1fdbac865fea88aa5fce7142a (patch)
tree: 666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoopVectorize/ARM
parent: 7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff)
download: bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz
bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip
10 files changed, 0 insertions, 1067 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
deleted file mode 100644
index 369568f6dfa..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
+++ /dev/null
@@ -1,330 +0,0 @@
-; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
-; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
-; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
-; REQUIRES: asserts
-
-; Tests the loop vectorizer's ability to tell whether using SIMD is safe
-; with respect to the IEEE 754 standard.
-; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
-; because NEON is not IEEE compliant.
-; Darwin, on the other hand, doesn't support subnormals, and all optimizations
-; are allowed, even without -ffast-math.
-
-; Integer loops are always vectorizable
-; CHECK: Checking a loop in "sumi"
-; CHECK: We can vectorize this loop!
-define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; C[i] = B[i] * A[i] for i in [0, N); integer-only loop
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0 ; integer multiply; the loop contains no floating-point operation
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
-  store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Floating-point loops need fast-math to be vectorizable
-; LINUX: Checking a loop in "sumf"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "sumf"
-; DARWIN: We can vectorize this loop!
-define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = A[i] * B[i] for i in [0, N); float loop without fast-math flags
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul float %0, %1 ; plain fmul (no 'fast' flag): the FP op the expectations above are about
-  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
-  store float %mul, float* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable
-; CHECK: Checking a loop in "redi"
-; CHECK: We can vectorize this loop!
-define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction: accumulates b[i] * a[i] over i in [0, N) and returns the accumulator
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] ; running accumulator; its initial value is undef
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0
-  %add = add nsw i32 %mul, %Red.06 ; integer add feeding the accumulator phi
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped
-  ret i32 %Red.0.lcssa
-}
-
-; Floating-point loops need fast-math to be vectorizable
-; LINUX: Checking a loop in "redf"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "redf"
-; DARWIN: We can vectorize this loop!
-define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction: accumulates a[i] * b[i] over i in [0, N), no fast-math flags
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] ; running accumulator; its initial value is undef
-  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul float %0, %1 ; plain fmul (no 'fast' flag)
-  %add = fadd float %Red.06, %mul ; plain fadd (no 'fast' flag) feeding the accumulator phi
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi float [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped
-  ret float %Red.0.lcssa
-}
-
-; Make sure calls that turn into builtins are also covered
-; LINUX: Checking a loop in "fabs"
-; LINUX: Potentially unsafe FP op prevents vectorization
-; DARWIN: Checking a loop in "fabs"
-; DARWIN: We can vectorize this loop!
-define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = A[i] * fabsf(B[i]) for i in [0, N), no fast-math flags
-entry:
-  %cmp10 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp10, label %for.end, label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i; the loop is entered straight from %entry
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
-  %1 = load float, float* %arrayidx1, align 4
-  %fabsf = tail call float @fabsf(float %1) #1 ; library call without a 'fast' flag, using attribute group #1
-  %conv3 = fmul float %0, %fabsf ; plain fmul (no 'fast' flag)
-  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
-  store float %conv3, float* %arrayidx4, align 4
-  %inc = add nuw nsw i32 %i.011, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable
-; CHECK: Checking a loop in "sumi_fast"
-; CHECK: We can vectorize this loop!
-define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { ; C[i] = B[i] * A[i] for i in [0, N); same integer loop body as @sumi
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0 ; integer multiply; the loop contains no floating-point operation
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
-  store i32 %mul, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Floating-point loops can be vectorized with fast-math
-; CHECK: Checking a loop in "sumf_fast"
-; CHECK: We can vectorize this loop!
-define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = B[i] * A[i] for i in [0, N), with the multiply marked 'fast'
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul fast float %1, %0 ; the 'fast' flag is what distinguishes this loop from @sumf
-  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
-  store float %mul, float* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.06, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  ret void
-}
-
-; Integer loops are always vectorizable
-; CHECK: Checking a loop in "redi_fast"
-; CHECK: We can vectorize this loop!
-define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { ; integer reduction over b[i] * a[i] for i in [0, N); same loop body as @redi
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] ; running accumulator; its initial value is undef
-  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
-  %1 = load i32, i32* %arrayidx1, align 4
-  %mul = mul nsw i32 %1, %0
-  %add = add nsw i32 %mul, %Red.06 ; integer add feeding the accumulator phi
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi i32 [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped
-  ret i32 %Red.0.lcssa
-}
-
-; Floating-point loops can be vectorized with fast-math
-; CHECK: Checking a loop in "redf_fast"
-; CHECK: We can vectorize this loop!
-define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { ; float reduction over b[i] * a[i] for i in [0, N), with both FP ops marked 'fast'
-entry:
-  %cmp5 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] ; loop index i, starting at 0
-  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] ; running accumulator; its initial value is undef
-  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
-  %1 = load float, float* %arrayidx1, align 4
-  %mul = fmul fast float %1, %0 ; 'fast' multiply
-  %add = fadd fast float %mul, %Red.06 ; 'fast' add feeding the accumulator phi
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
-
-for.end.loopexit:                                 ; preds = %for.body
-  %add.lcssa = phi float [ %add, %for.body ]
-  br label %for.end
-
-for.end:                                          ; preds = %for.end.loopexit, %entry
-  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] ; undef when the loop was skipped
-  ret float %Red.0.lcssa
-}
-
-; Make sure calls that turn into builtins are also covered
-; CHECK: Checking a loop in "fabs_fast"
-; CHECK: We can vectorize this loop!
-define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { ; C[i] = fabsf(B[i]) * A[i] for i in [0, N), with the call and multiply marked 'fast'
-entry:
-  %cmp10 = icmp eq i32 %N, 0 ; N == 0: bypass the loop
-  br i1 %cmp10, label %for.end, label %for.body
-
-for.body:                                         ; preds = %entry, %for.body
-  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; loop index i; the loop is entered straight from %entry
-  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
-  %1 = load float, float* %arrayidx1, align 4
-  %fabsf = tail call fast float @fabsf(float %1) #2 ; 'fast' library call, using attribute group #2
-  %conv3 = fmul fast float %fabsf, %0 ; 'fast' multiply
-  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
-  store float %conv3, float* %arrayidx4, align 4
-  %inc = add nuw nsw i32 %i.011, 1
-  %exitcond = icmp eq i32 %inc, %N ; leave the loop once i + 1 == N
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-declare float @fabsf(float)
-
-attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
deleted file mode 100644
index 7b09913636f..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
-; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK-NOT: load <4 x i32>
-;CHECK: ret
-;SWIFT-LABEL: @foo(
-;SWIFT: load <4 x i32>
-;SWIFT: load <4 x i32>
-;SWIFT: ret
-define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { ; returns A[0] + ... + A[n-1], or 0 when n <= 0
-  %1 = icmp sgt i32 %n, 0 ; the unnamed entry block is %0; the loop runs only when n > 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] ; loop index, starting at 0
-  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] ; running sum, starting at 0
-  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
-  %3 = load i32, i32* %2, align 4
-  %4 = add nsw i32 %3, %sum.01 ; sum += A[i]
-  %5 = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %5, %n ; leave the loop once i + 1 == n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] ; 0 when the loop was skipped, final sum otherwise
-  ret i32 %sum.0.lcssa
-}
-
-; Verify the register limit. On arm we don't have 16 allocatable registers.
-;SWIFTUNROLL-LABEL: @register_limit(
-;SWIFTUNROLL: load i32
-;SWIFTUNROLL-NOT: load i32
-define i32 @register_limit(i32* nocapture %A, i32 %n) { ; single-load loop carrying an index phi plus six accumulator phis; returns only the first accumulator
-  %1 = icmp sgt i32 %n, 0 ; the unnamed entry block is %0; the loop runs only when n > 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:
-  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] ; loop index, starting at 0
-  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] ; accumulators sum.01 .. sum.06, each starting at 0
-  %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
-  %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
-  %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
-  %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
-  %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
-  %3 = load i32, i32* %2, align 4 ; the only load in the loop; every add below reuses it
-  %4 = add nsw i32 %3, %sum.01
-  %5 = add nsw i32 %i.02, 1
-  %6 = add nsw i32 %3, %sum.02
-  %7 = add nsw i32 %3, %sum.03
-  %8 = add nsw i32 %3, %sum.04
-  %9 = add nsw i32 %3, %sum.05
-  %10 = add nsw i32 %3, %sum.05 ; NOTE(review): reads %sum.05 again, so the %sum.06 phi has no user; %sum.06 may have been intended - confirm
-  %exitcond = icmp eq i32 %5, %n ; leave the loop once i + 1 == n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] ; the only exit phi that is used (returned below)
-  %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ] ; the remaining exit phis have no users in this function
-  %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
-  %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
-  %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
-  %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
-  ret i32 %sum.0.lcssa
-}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
deleted file mode 100644
index 6d1fa6f36a9..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
+++ /dev/null
@@ -1,88 +0,0 @@
-; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-
-@kernel = global [512 x float] zeroinitializer, align 4
-@kernel2 = global [512 x float] zeroinitializer, align 4
-@kernel3 = global [512 x float] zeroinitializer, align 4
-@kernel4 = global [512 x float] zeroinitializer, align 4
-@src_data = global [1536 x float] zeroinitializer, align 4
-@r_ = global i8 0, align 4
-@g_ = global i8 0, align 4
-@b_ = global i8 0, align 4
-
-; We don't want to vectorize most loops containing gathers because they are
-; expensive. This function represents a point where vectorization starts to
-; become beneficial.
-; Make sure we are conservative and don't vectorize it.
-; CHECK-NOT: <2 x float>
-; CHECK-NOT: <4 x float>
-
-define void @_Z4testmm(i32 %size, i32 %offset) { ; accumulates three float sums from @src_data weighted by the four kernel arrays, then stores them as i8 to @r_, @g_, @b_
-entry:
-  %cmp53 = icmp eq i32 %size, 0 ; size == 0: bypass the loop and store zeros
-  br i1 %cmp53, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:
-  br label %for.body
-
-for.body:
-  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] ; three running sums (r, g, b), each starting at 0.0
-  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
-  %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] ; loop index v, starting at 0
-  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
-  %add = add i32 %v.055, %offset
-  %mul = mul i32 %add, 3 ; src_data index = (v + offset) * 3, so consecutive iterations are 3 elements apart
-  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
-  %0 = load float, float* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055 ; the kernel arrays are indexed by v directly
-  %1 = load float, float* %arrayidx2, align 4
-  %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
-  %2 = load float, float* %arrayidx4, align 4
-  %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
-  %3 = load float, float* %arrayidx6, align 4
-  %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
-  %4 = load float, float* %arrayidx8, align 4
-  %mul9 = fmul fast float %mul7, %4
-  %add10 = fadd fast float %r.057, %mul9 ; r += src[3k] * product of the four kernel values
-  %arrayidx.sum = add i32 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
-  %5 = load float, float* %arrayidx11, align 4
-  %mul13 = fmul fast float %1, %5
-  %mul15 = fmul fast float %2, %mul13
-  %mul17 = fmul fast float %3, %mul15
-  %mul19 = fmul fast float %4, %mul17
-  %add20 = fadd fast float %g.056, %mul19 ; g += src[3k + 1] * the same kernel values
-  %arrayidx.sum52 = add i32 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
-  %6 = load float, float* %arrayidx21, align 4
-  %mul23 = fmul fast float %1, %6
-  %mul25 = fmul fast float %2, %mul23
-  %mul27 = fmul fast float %3, %mul25
-  %mul29 = fmul fast float %4, %mul27
-  %add30 = fadd fast float %b.054, %mul29 ; b += src[3k + 2] * the same kernel values
-  %inc = add i32 %v.055, 1
-  %exitcond = icmp ne i32 %inc, %size ; keep looping while v + 1 != size
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:
-  %add30.lcssa = phi float [ %add30, %for.body ]
-  %add20.lcssa = phi float [ %add20, %for.body ]
-  %add10.lcssa = phi float [ %add10, %for.body ]
-  %phitmp = fptoui float %add10.lcssa to i8 ; convert each final sum to an unsigned 8-bit value
-  %phitmp60 = fptoui float %add20.lcssa to i8
-  %phitmp61 = fptoui float %add30.lcssa to i8
-  br label %for.end
-
-for.end:
-  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] ; 0 when the loop was skipped
-  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
-  store i8 %r.0.lcssa, i8* @r_, align 4
-  store i8 %g.0.lcssa, i8* @g_, align 4
-  store i8 %b.0.lcssa, i8* @b_, align 4
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
deleted file mode 100644
index 783156d7399..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-@b = common global [2048 x i32] zeroinitializer, align 16
-@c = common global [2048 x i32] zeroinitializer, align 16
-@a = common global [2048 x i32] zeroinitializer, align 16
-
-; The expectations below match 4-wide vectors (<4 x i32>), i.e. VF = 4.
-;CHECK-LABEL: @example1(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret void
-define void @example1() nounwind uwtable ssp { ; a[i] = c[i] + b[i] over the globals @a, @b, @c for i in [0, 256)
-  br label %1
-
-; <label>:1                                       ; preds = %1, %0
-  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] ; 64-bit loop index, starting at 0
-  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32, i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32, i32* %4, align 4
-  %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
-  store i32 %6, i32* %7, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test compares the truncated 32-bit index
-  %exitcond = icmp eq i32 %lftr.wideiv, 256 ; leave the loop after 256 iterations
-  br i1 %exitcond, label %8, label %1
-
-; <label>:8                                       ; preds = %1
-  ret void
-}
-
-;CHECK-LABEL: @example10b(
-;CHECK: load <4 x i16>
-;CHECK: sext <4 x i16>
-;CHECK: store <4 x i32>
-;CHECK: ret void
-define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { ; ia[i] = sext(sb[i]) for i in [0, 1024); the body uses only %sb and %ia
-  br label %1
-
-; <label>:1                                       ; preds = %1, %0
-  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] ; 64-bit loop index, starting at 0
-  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
-  %3 = load i16, i16* %2, align 2
-  %4 = sext i16 %3 to i32 ; widen the 16-bit element to 32 bits before storing
-  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
-  store i32 %4, i32* %5, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32 ; the exit test compares the truncated 32-bit index
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024 ; leave the loop after 1024 iterations
-  br i1 %exitcond, label %6, label %1
-
-; <label>:6                                       ; preds = %1
-  ret void
-}
-
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
deleted file mode 100644
index 29adec049f6..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
-; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
-; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
-; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
-; REQUIRES: asserts
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "armv8--linux-gnueabihf"
-
-%i8.2 = type {i8, i8}
-define void @i8_factor_2(%i8.2* %data, i64 %n) { ; per element of %data: loads both i8 fields, then stores 0 to both
-entry:
-  br label %for.body ; no guard on %n: the body always runs at least once
-
-; VF_8-LABEL:  Checking a loop in "i8_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-LABEL: Checking a loop in "i8_factor_2"
-; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; element index, starting at 0
-  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 ; address of field 1 of element i
-  %tmp2 = load i8, i8* %tmp0, align 1 ; value names and instruction text here are matched by the expectations above
-  %tmp3 = load i8, i8* %tmp1, align 1
-  store i8 0, i8* %tmp0, align 1
-  store i8 0, i8* %tmp1, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n ; continue while i + 1 < n (signed)
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i16.2 = type {i16, i16}
-define void @i16_factor_2(%i16.2* %data, i64 %n) { ; per element of %data: loads both i16 fields, then stores 0 to both
-entry:
-  br label %for.body ; no guard on %n: the body always runs at least once
-
-; VF_4-LABEL:  Checking a loop in "i16_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL:  Checking a loop in "i16_factor_2"
-; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-LABEL: Checking a loop in "i16_factor_2"
-; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; element index, starting at 0
-  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 ; address of field 1 of element i
-  %tmp2 = load i16, i16* %tmp0, align 2 ; value names and instruction text here are matched by the expectations above
-  %tmp3 = load i16, i16* %tmp1, align 2
-  store i16 0, i16* %tmp0, align 2
-  store i16 0, i16* %tmp1, align 2
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n ; continue while i + 1 < n (signed)
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%i32.2 = type {i32, i32}
-define void @i32_factor_2(%i32.2* %data, i64 %n) { ; per element of %data: loads both i32 fields, then stores 0 to both
-entry:
-  br label %for.body ; no guard on %n: the body always runs at least once
-
-; VF_2-LABEL:  Checking a loop in "i32_factor_2"
-; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL:  Checking a loop in "i32_factor_2"
-; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-LABEL:  Checking a loop in "i32_factor_2"
-; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-LABEL: Checking a loop in "i32_factor_2"
-; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; element index, starting at 0
-  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 ; address of field 1 of element i
-  %tmp2 = load i32, i32* %tmp0, align 4 ; value names and instruction text here are matched by the expectations above
-  %tmp3 = load i32, i32* %tmp1, align 4
-  store i32 0, i32* %tmp0, align 4
-  store i32 0, i32* %tmp1, align 4
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n ; continue while i + 1 < n (signed)
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-%half.2 = type {half, half}
-define void @half_factor_2(%half.2* %data, i64 %n) { ; per element of %data: loads both half fields, then stores zero to both
-entry:
-  br label %for.body ; no guard on %n: the body always runs at least once
-
-; VF_4-LABEL: Checking a loop in "half_factor_2"
-; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "half_factor_2"
-; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; element index, starting at 0
-  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0 ; address of field 0 of element i
-  %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1 ; address of field 1 of element i
-  %tmp2 = load half, half* %tmp0, align 2 ; value names and instruction text here are matched by the expectations above
-  %tmp3 = load half, half* %tmp1, align 2
-  store half 0., half* %tmp0, align 2 ; the expectations above spell this zero constant as 0xH0000
-  store half 0., half* %tmp1, align 2
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, %n ; continue while i + 1 < n (signed)
-  br i1 %cond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
deleted file mode 100644
index 98c6700c209..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'ARM' in config.root.targets:
-    config.unsupported = True
-
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
deleted file mode 100644
index e88fcca1225..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
+++ /dev/null
@@ -1,114 +0,0 @@
-; RUN: opt < %s  -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
-; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
-; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
-
-; ModuleID = 'arm.ll'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7--linux-gnueabihf"
-
-%T216 = type <2 x i16>
-%T232 = type <2 x i32>
-%T264 = type <2 x i64>
-
-%T416 = type <4 x i16>
-%T432 = type <4 x i32>
-%T464 = type <4 x i64>
-
-define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'direct':
-  %v0 = load %T432, %T432* %loadaddr
-; ASM: vld1.64
-  %v1 = load %T432, %T432* %loadaddr2
-; ASM: vld1.64
-  %r3 = mul %T432 %v0, %v1 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmul.i32
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'ups1632':
-  %v0 = load %T416, %T416* %loadaddr
-; ASM: vldr
-  %v1 = load %T416, %T416* %loadaddr2
-; ASM: vldr
-  %r1 = sext %T416 %v0 to %T432
-  %r2 = sext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = mul %T432 %r1, %r2 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmull.s16
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
-; COST: function 'upu1632':
-  %v0 = load %T416, %T416* %loadaddr
-; ASM: vldr
-  %v1 = load %T416, %T416* %loadaddr2
-; ASM: vldr
-  %r1 = zext %T416 %v0 to %T432
-  %r2 = zext %T416 %v1 to %T432
-; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
-  %r3 = mul %T432 %r1, %r2 
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-; ASM: vmull.u16
-  store %T432 %r3, %T432* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
-; COST: function 'ups3264':
-  %v0 = load %T232, %T232* %loadaddr
-; ASM: vldr
-  %v1 = load %T232, %T232* %loadaddr2
-; ASM: vldr
-  %r3 = mul %T232 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
-  %st = sext %T232 %r3 to %T264
-; ASM: vmovl.s32
-; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
-  store %T264 %st, %T264* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
-; COST: function 'upu3264':
-  %v0 = load %T232, %T232* %loadaddr
-; ASM: vldr
-  %v1 = load %T232, %T232* %loadaddr2
-; ASM: vldr
-  %r3 = mul %T232 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
-  %st = zext %T232 %r3 to %T264
-; ASM: vmovl.u32
-; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
-  store %T264 %st, %T264* %storeaddr
-; ASM: vst1.64
-  ret void
-}
-
-define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
-; COST: function 'dn3216':
-  %v0 = load %T432, %T432* %loadaddr
-; ASM: vld1.64
-  %v1 = load %T432, %T432* %loadaddr2
-; ASM: vld1.64
-  %r3 = mul %T432 %v0, %v1 
-; ASM: vmul.i32
-; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
-  %st = trunc %T432 %r3 to %T416
-; ASM: vmovn.i32
-; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
-  store %T416 %st, %T416* %storeaddr
-; ASM: vstr
-  ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
deleted file mode 100644
index a1cf4b318f3..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ /dev/null
@@ -1,165 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
-
-; This test is reduced from SPECFP 2006 482.sphinx.
-; We expect vectorization with <2 x double> and <2 x float> ops.
-; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
-
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-
-@a = external global i32
-@v = external global i32
-@mm = external global float**
-@vv = external global float**
-@ll = external global float*
-
-define i32 @test(float* nocapture readonly %x) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[T:%.*]] = load i32, i32* @v, align 8
-; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* @a, align 4
-; CHECK-NEXT:    br label [[OUTERLOOP:%.*]]
-; CHECK:       outerloop:
-; CHECK-NEXT:    [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
-; CHECK-NEXT:    [[T3:%.*]] = load float**, float*** @mm, align 4
-; CHECK-NEXT:    [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float**, float*** @vv, align 4
-; CHECK-NEXT:    [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
-; CHECK-NEXT:    [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float*, float** @ll, align 4
-; CHECK-NEXT:    [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
-; CHECK-NEXT:    [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
-; CHECK-NEXT:    [[CONV114:%.*]] = fpext float [[T8]] to double
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    br label [[INNERLOOP:%.*]]
-; CHECK:       innerloop:
-; CHECK-NEXT:    [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
-; CHECK-NEXT:    [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
-; CHECK-NEXT:    [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
-; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
-; CHECK-NEXT:    [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
-; CHECK-NEXT:    [[CONV122:%.*]] = fpext float [[SUB121]] to double
-; CHECK-NEXT:    [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
-; CHECK-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
-; CHECK-NEXT:    [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
-; CHECK-NEXT:    [[CONV125:%.*]] = fpext float [[T11]] to double
-; CHECK-NEXT:    [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
-; CHECK-NEXT:    [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
-; CHECK-NEXT:    [[INC129]] = add nuw nsw i32 [[I_2132]], 1
-; CHECK-NEXT:    [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT:    br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2
-; CHECK:       outerend:
-; CHECK-NEXT:    [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
-; CHECK-NEXT:    [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
-; CHECK-NEXT:    [[INC144]] = add nuw nsw i32 [[J_0136]], 1
-; CHECK-NEXT:    [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
-; CHECK-NEXT:    [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
-; CHECK-NEXT:    [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
-; CHECK-NEXT:    br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
-; CHECK:       exit:
-; CHECK-NEXT:    ret i32 [[CALL142]]
-;
-entry:
-  %t = load i32, i32* @v, align 8
-  %t1 = load i32, i32* @a, align 4
-  br label %outerloop
-
-outerloop:
-  %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
-  %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
-  %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
-  %t3 = load float**, float*** @mm, align 4
-  %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
-  %t4 = load float*, float** %arrayidx109, align 4
-  %t5 = load float**, float*** @vv, align 4
-  %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
-  %t6 = load float*, float** %arrayidx111, align 4
-  %t7 = load float*, float** @ll, align 4
-  %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
-  %t8 = load float, float* %arrayidx113, align 4
-  %conv114 = fpext float %t8 to double
-  br label %innerloop
-
-innerloop:
-  %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
-  %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
-  %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
-  %t9 = load float, float* %arrayidx119, align 4
-  %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
-  %t10 = load float, float* %arrayidx120, align 4
-  %sub121 = fsub fast float %t9, %t10
-  %conv122 = fpext float %sub121 to double
-  %mul123 = fmul fast double %conv122, %conv122
-  %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
-  %t11 = load float, float* %arrayidx124, align 4
-  %conv125 = fpext float %t11 to double
-  %mul126 = fmul fast double %mul123, %conv125
-  %sub127 = fsub fast double %dval1.4131, %mul126
-  %inc129 = add nuw nsw i32 %i.2132, 1
-  %exitcond143 = icmp eq i32 %inc129, %t
-  br i1 %exitcond143, label %outerend, label %innerloop
-
-outerend:
-  %sub127.lcssa = phi double [ %sub127, %innerloop ]
-  %conv138 = fptosi double %sub127.lcssa to i32
-  %call142 = add nuw nsw i32 %score.1135, %conv138
-  %inc144 = add nuw nsw i32 %j.0136, 1
-  %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
-  %v17 = load i32, i32* %arrayidx102, align 4
-  %cmp103 = icmp sgt i32 %v17, -1
-  br i1 %cmp103, label %outerloop, label %exit
-
-exit:
-  ret i32 %call142
-}
-
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
deleted file mode 100644
index 3be22d708da..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv7--linux-gnueabi"
-
-; This requires the loop vectorizer to create an interleaved access group
-; for the stores to the struct. Here we need to perform a bitcast from a vector
-; of pointers to a vector i32s.
-
-%class.A = type { i8*, i32 }
-
-; CHECK-LABEL: test0
-define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
-entry:
-  br label %for.body.i
-
-for.body.i:
-  %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
-  %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
-  store i8* null, i8** %Data.i.i, align 4, !tbaa !8
-  %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
-  store i32 0, i32* %Length.i.i, align 4, !tbaa !11
-  %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
-  %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
-  br i1 %cmp.i, label %exit, label %for.body.i
-
-exit:
-  ret void
-}
-
-!5 = !{!"any pointer", !6, i64 0}
-!6 = !{!"omnipotent char", !7, i64 0}
-!7 = !{!"Simple C/C++ TBAA"}
-!8 = !{!9, !5, i64 0}
-!9 = !{!"some struct", !5, i64 0, !10, i64 4}
-!10 = !{!"int", !6, i64 0}
-!11 = !{!9, !10, i64 4}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
deleted file mode 100644
index 66d2556dfb8..00000000000
--- a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-target triple = "thumbv7-apple-ios3.0.0"
-
-;CHECK:foo_F32
-;CHECK: <4 x float>
-;CHECK:ret
-define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
-  %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %3 = load float, float* %2, align 8
-  %4 = fmul fast float %prod.01, %3
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
-  ret float %prod.0.lcssa
-}
-
-;CHECK:foo_I8
-;CHECK: xor <16 x i8>
-;CHECK:ret
-define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
-  %3 = load i8, i8* %2, align 1
-  %4 = xor i8 %3, %red.01
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
-  ret i8 %red.0.lcssa
-}
-
-
author	Eric Christopher <echristo@gmail.com>	2019-04-17 02:12:23 +0000
committer	Eric Christopher <echristo@gmail.com>	2019-04-17 02:12:23 +0000
commit	a86343512845c9c1fdbac865fea88aa5fce7142a (patch)
tree	666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoopVectorize/ARM
parent	7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff)
download	bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip