diff options
author | Eric Christopher <echristo@gmail.com> | 2019-04-17 02:12:23 +0000 |
---|---|---|
committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 02:12:23 +0000 |
commit | a86343512845c9c1fdbac865fea88aa5fce7142a (patch) | |
tree | 666fc6353de19ad8b00e56b67edd33f24104e4a7 /llvm/test/Transforms/LoopVectorize/ARM | |
parent | 7f8ca6e3679b3af951cb7a4b1377edfaa3244b93 (diff) | |
download | bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.tar.gz bcm5719-llvm-a86343512845c9c1fdbac865fea88aa5fce7142a.zip |
Temporarily Revert "Add basic loop fusion pass."
As it's causing some bot failures (and per request from kbarton).
This reverts commit r358543/ab70da07286e618016e78247e4a24fcb84077fda.
llvm-svn: 358546
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/ARM')
10 files changed, 0 insertions, 1067 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll deleted file mode 100644 index 369568f6dfa..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll +++ /dev/null @@ -1,330 +0,0 @@ -; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX -; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX -; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN -; REQUIRES: asserts - -; Testing the ability of the loop vectorizer to tell when SIMD is safe or not -; regarding IEEE 754 standard. -; On Linux, we only want the vectorizer to work when -ffast-math flag is set, -; because NEON is not IEEE compliant. -; Darwin, on the other hand, doesn't support subnormals, and all optimizations -; are allowed, even without -ffast-math. - -; Integer loops are always vectorizeable -; CHECK: Checking a loop in "sumi" -; CHECK: We can vectorize this loop! -define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06 - %1 = load i32, i32* %arrayidx1, align 4 - %mul = mul nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06 - store i32 %mul, i32* %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.06, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} - -; Floating-point loops need fast-math to be vectorizeable -; LINUX: Checking a loop in "sumf" -; LINUX: Potentially unsafe FP op prevents vectorization -; DARWIN: Checking a loop in "sumf" -; DARWIN: We can vectorize this loop! -define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06 - %1 = load float, float* %arrayidx1, align 4 - %mul = fmul float %0, %1 - %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06 - store float %mul, float* %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.06, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} - -; Integer loops are always vectorizeable -; CHECK: Checking a loop in "redi" -; CHECK: We can vectorize this loop! -define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07 - %1 = load i32, i32* %arrayidx1, align 4 - %mul = mul nsw i32 %1, %0 - %add = add nsw i32 %mul, %Red.06 - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - %add.lcssa = phi i32 [ %add, %for.body ] - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret i32 %Red.0.lcssa -} - -; Floating-point loops need fast-math to be vectorizeable -; LINUX: Checking a loop in "redf" -; LINUX: Potentially unsafe FP op prevents vectorization -; DARWIN: Checking a loop in "redf" -; DARWIN: We can vectorize this loop! -define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07 - %1 = load float, float* %arrayidx1, align 4 - %mul = fmul float %0, %1 - %add = fadd float %Red.06, %mul - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - %add.lcssa = phi float [ %add, %for.body ] - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa -} - -; Make sure calls that turn into builtins are also covered -; LINUX: Checking a loop in "fabs" -; LINUX: Potentially unsafe FP op prevents vectorization -; DARWIN: Checking a loop in "fabs" -; DARWIN: We can vectorize this loop! -define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { -entry: - %cmp10 = icmp eq i32 %N, 0 - br i1 %cmp10, label %for.end, label %for.body - -for.body: ; preds = %entry, %for.body - %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011 - %1 = load float, float* %arrayidx1, align 4 - %fabsf = tail call float @fabsf(float %1) #1 - %conv3 = fmul float %0, %fabsf - %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011 - store float %conv3, float* %arrayidx4, align 4 - %inc = add nuw nsw i32 %i.011, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; Integer loops are always vectorizeable -; CHECK: Checking a loop in "sumi_fast" -; CHECK: We can vectorize this loop! -define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06 - %1 = load i32, i32* %arrayidx1, align 4 - %mul = mul nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06 - store i32 %mul, i32* %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.06, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} - -; Floating-point loops can be vectorizeable with fast-math -; CHECK: Checking a loop in "sumf_fast" -; CHECK: We can vectorize this loop! -define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06 - %1 = load float, float* %arrayidx1, align 4 - %mul = fmul fast float %1, %0 - %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06 - store float %mul, float* %arrayidx2, align 4 - %inc = add nuw nsw i32 %i.06, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - ret void -} - -; Integer loops are always vectorizeable -; CHECK: Checking a loop in "redi_fast" -; CHECK: We can vectorize this loop! -define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07 - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07 - %1 = load i32, i32* %arrayidx1, align 4 - %mul = mul nsw i32 %1, %0 - %add = add nsw i32 %mul, %Red.06 - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - %add.lcssa = phi i32 [ %add, %for.body ] - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret i32 %Red.0.lcssa -} - -; Floating-point loops can be vectorizeable with fast-math -; CHECK: Checking a loop in "redf_fast" -; CHECK: We can vectorize this loop! -define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) { -entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] - %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07 - %1 = load float, float* %arrayidx1, align 4 - %mul = fmul fast float %1, %0 - %add = fadd fast float %mul, %Red.06 - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: ; preds = %for.body - %add.lcssa = phi float [ %add, %for.body ] - br label %for.end - -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa -} - -; Make sure calls that turn into builtins are also covered -; CHECK: Checking a loop in "fabs_fast" -; CHECK: We can vectorize this loop! -define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) { -entry: - %cmp10 = icmp eq i32 %N, 0 - br i1 %cmp10, label %for.end, label %for.body - -for.body: ; preds = %entry, %for.body - %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011 - %0 = load float, float* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011 - %1 = load float, float* %arrayidx1, align 4 - %fabsf = tail call fast float @fabsf(float %1) #2 - %conv3 = fmul fast float %fabsf, %0 - %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011 - store float %conv3, float* %arrayidx4, align 4 - %inc = add nuw nsw i32 %i.011, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -declare float @fabsf(float) - -attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll deleted file mode 100644 index 7b09913636f..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT -; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL - -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -target triple = "thumbv7-apple-ios3.0.0" - -;CHECK-LABEL: @foo( -;CHECK: load <4 x i32> -;CHECK-NOT: load <4 x i32> -;CHECK: ret -;SWIFT-LABEL: @foo( -;SWIFT: load <4 x i32> -;SWIFT: load <4 x i32> -;SWIFT: ret -define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0, %.lr.ph - %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] - %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] - %2 = getelementptr inbounds i32, i32* %A, i32 %i.02 - %3 = load i32, i32* %2, align 4 - %4 = add nsw i32 %3, %sum.01 - %5 = add nsw i32 %i.02, 1 - %exitcond = icmp eq i32 %5, %n - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] - ret i32 %sum.0.lcssa -} - -; Verify the register limit. On arm we don't have 16 allocatable registers. -;SWIFTUNROLL-LABEL: @register_limit( -;SWIFTUNROLL: load i32 -;SWIFTUNROLL-NOT: load i32 -define i32 @register_limit(i32* nocapture %A, i32 %n) { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: - %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] - %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] - %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ] - %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ] - %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ] - %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] - %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] - %2 = getelementptr inbounds i32, i32* %A, i32 %i.02 - %3 = load i32, i32* %2, align 4 - %4 = add nsw i32 %3, %sum.01 - %5 = add nsw i32 %i.02, 1 - %6 = add nsw i32 %3, %sum.02 - %7 = add nsw i32 %3, %sum.03 - %8 = add nsw i32 %3, %sum.04 - %9 = add nsw i32 %3, %sum.05 - %10 = add nsw i32 %3, %sum.05 - %exitcond = icmp eq i32 %5, %n - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] - %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ] - %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ] - %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ] - %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] - %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ] - ret i32 %sum.0.lcssa -} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll deleted file mode 100644 index 6d1fa6f36a9..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll +++ /dev/null @@ -1,88 +0,0 @@ -; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" - -@kernel = global [512 x float] zeroinitializer, align 4 -@kernel2 = global [512 x float] zeroinitializer, align 4 -@kernel3 = global [512 x float] zeroinitializer, align 4 -@kernel4 = global [512 x float] zeroinitializer, align 4 -@src_data = global [1536 x float] zeroinitializer, align 4 -@r_ = global i8 0, align 4 -@g_ = global i8 0, align 4 -@b_ = global i8 0, align 4 - -; We don't want to vectorize most loops containing gathers because they are -; expensive. This function represents a point where vectorization starts to -; become beneficial. -; Make sure we are conservative and don't vectorize it. -; CHECK-NOT: <2 x float> -; CHECK-NOT: <4 x float> - -define void @_Z4testmm(i32 %size, i32 %offset) { -entry: - %cmp53 = icmp eq i32 %size, 0 - br i1 %cmp53, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: - br label %for.body - -for.body: - %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] - %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] - %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] - %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] - %add = add i32 %v.055, %offset - %mul = mul i32 %add, 3 - %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul - %0 = load float, float* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055 - %1 = load float, float* %arrayidx2, align 4 - %mul3 = fmul fast float %0, %1 - %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055 - %2 = load float, float* %arrayidx4, align 4 - %mul5 = fmul fast float %mul3, %2 - %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055 - %3 = load float, float* %arrayidx6, align 4 - %mul7 = fmul fast float %mul5, %3 - %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055 - %4 = load float, float* %arrayidx8, align 4 - %mul9 = fmul fast float %mul7, %4 - %add10 = fadd fast float %r.057, %mul9 - %arrayidx.sum = add i32 %mul, 1 - %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum - %5 = load float, float* %arrayidx11, align 4 - %mul13 = fmul fast float %1, %5 - %mul15 = fmul fast float %2, %mul13 - %mul17 = fmul fast float %3, %mul15 - %mul19 = fmul fast float %4, %mul17 - %add20 = fadd fast float %g.056, %mul19 - %arrayidx.sum52 = add i32 %mul, 2 - %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52 - %6 = load float, float* %arrayidx21, align 4 - %mul23 = fmul fast float %1, %6 - %mul25 = fmul fast float %2, %mul23 - %mul27 = fmul fast float %3, %mul25 - %mul29 = fmul fast float %4, %mul27 - %add30 = fadd fast float %b.054, %mul29 - %inc = add i32 %v.055, 1 - %exitcond = icmp ne i32 %inc, %size - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: - %add30.lcssa = phi float [ %add30, %for.body ] - %add20.lcssa = phi float [ %add20, %for.body ] - %add10.lcssa = phi float [ %add10, %for.body ] - %phitmp = fptoui float %add10.lcssa to i8 - %phitmp60 = fptoui float %add20.lcssa to i8 - %phitmp61 = fptoui float %add30.lcssa to i8 - br label %for.end - -for.end: - %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] - %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ] - %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ] - store i8 %r.0.lcssa, i8* @r_, align 4 - store i8 %g.0.lcssa, i8* @g_, align 4 - store i8 %b.0.lcssa, i8* @b_, align 4 - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll deleted file mode 100644 index 783156d7399..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -target triple = "thumbv7-apple-ios3.0.0" - -@b = common global [2048 x i32] zeroinitializer, align 16 -@c = common global [2048 x i32] zeroinitializer, align 16 -@a = common global [2048 x i32] zeroinitializer, align 16 - -; Select VF = 8; -;CHECK-LABEL: @example1( -;CHECK: load <4 x i32> -;CHECK: add nsw <4 x i32> -;CHECK: store <4 x i32> -;CHECK: ret void -define void @example1() nounwind uwtable ssp { - br label %1 - -; <label>:1 ; preds = %1, %0 - %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] - %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv - %3 = load i32, i32* %2, align 4 - %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv - %5 = load i32, i32* %4, align 4 - %6 = add nsw i32 %5, %3 - %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv - store i32 %6, i32* %7, align 4 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 256 - br i1 %exitcond, label %8, label %1 - -; <label>:8 ; preds = %1 - ret void -} - -;CHECK-LABEL: @example10b( -;CHECK: load <4 x i16> -;CHECK: sext <4 x i16> -;CHECK: store <4 x i32> -;CHECK: ret void -define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { - br label %1 - -; <label>:1 ; preds = %1, %0 - %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] - %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv - %3 = load i16, i16* %2, align 2 - %4 = sext i16 %3 to i32 - %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv - store i32 %4, i32* %5, align 4 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 1024 - br i1 %exitcond, label %6, label %1 - -; <label>:6 ; preds = %1 - ret void -} - diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll deleted file mode 100644 index 29adec049f6..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ /dev/null @@ -1,147 +0,0 @@ -; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 -; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 -; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 -; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 -; REQUIRES: asserts - -target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -target triple = "armv8--linux-gnueabihf" - -%i8.2 = type {i8, i8} -define void @i8_factor_2(%i8.2* %data, i64 %n) { -entry: - br label %for.body - -; VF_8-LABEL: Checking a loop in "i8_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 -; VF_16-LABEL: Checking a loop in "i8_factor_2" -; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 - %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 - %tmp2 = load i8, i8* %tmp0, align 1 - %tmp3 = load i8, i8* %tmp1, align 1 - store i8 0, i8* %tmp0, align 1 - store i8 0, i8* %tmp1, align 1 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: - ret void -} - -%i16.2 = type {i16, i16} -define void @i16_factor_2(%i16.2* %data, i64 %n) { -entry: - br label %for.body - -; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_16-LABEL: Checking a loop in "i16_factor_2" -; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 - %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1 - %tmp2 = load i16, i16* %tmp0, align 2 - %tmp3 = load i16, i16* %tmp1, align 2 - store i16 0, i16* %tmp0, align 2 - store i16 0, i16* %tmp1, align 2 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: - ret void -} - -%i32.2 = type {i32, i32} -define void @i32_factor_2(%i32.2* %data, i64 %n) { -entry: - br label %for.body - -; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_8-LABEL: Checking a loop in "i32_factor_2" -; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_16-LABEL: Checking a loop in "i32_factor_2" -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 - %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1 - %tmp2 = load i32, i32* %tmp0, align 4 - %tmp3 = load i32, i32* %tmp1, align 4 - store i32 0, i32* %tmp0, align 4 - store i32 0, i32* %tmp1, align 4 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: - ret void -} - -%half.2 = type {half, half} -define void @half_factor_2(%half.2* %data, i64 %n) { -entry: - br label %for.body - -; VF_4-LABEL: Checking a loop in "half_factor_2" -; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "half_factor_2" -; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0 - %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1 - %tmp2 = load half, half* %tmp0, align 2 - %tmp3 = load half, half* %tmp1, align 2 - store half 0., half* %tmp0, align 2 - store half 0., half* %tmp1, align 2 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg deleted file mode 100644 index 98c6700c209..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -if not 'ARM' in config.root.targets: - config.unsupported = True - diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll deleted file mode 100644 index e88fcca1225..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll +++ /dev/null @@ -1,114 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s -; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s -; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM - -; ModuleID = 'arm.ll' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" -target triple = "armv7--linux-gnueabihf" - -%T216 = type <2 x i16> -%T232 = type <2 x i32> -%T264 = type <2 x i64> - -%T416 = type <4 x i16> -%T432 = type <4 x i32> -%T464 = type <4 x i64> - -define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) { -; COST: function 'direct': - %v0 = load %T432, %T432* %loadaddr -; ASM: vld1.64 - %v1 = load %T432, %T432* %loadaddr2 -; ASM: vld1.64 - %r3 = mul %T432 %v0, %v1 -; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> -; ASM: vmul.i32 - store %T432 %r3, %T432* %storeaddr -; ASM: vst1.64 - ret void -} - -define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { -; COST: function 'ups1632': - %v0 = load %T416, %T416* %loadaddr -; ASM: vldr - %v1 = load %T416, %T416* %loadaddr2 -; ASM: vldr - %r1 = sext %T416 %v0 to %T432 - %r2 = sext %T416 %v1 to %T432 -; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32> - %r3 = mul %T432 %r1, %r2 -; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> -; ASM: vmull.s16 - store %T432 %r3, %T432* %storeaddr -; ASM: vst1.64 - ret void -} - -define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { -; COST: function 'upu1632': - %v0 = load %T416, %T416* %loadaddr -; ASM: vldr - %v1 = load %T416, %T416* %loadaddr2 -; ASM: vldr - %r1 = zext %T416 %v0 to %T432 - %r2 = zext %T416 %v1 to %T432 -; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32> - %r3 = mul %T432 %r1, %r2 -; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> -; ASM: vmull.u16 - store %T432 %r3, %T432* %storeaddr -; ASM: vst1.64 - ret void -} - -define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { -; COST: function 'ups3264': - %v0 = load %T232, %T232* %loadaddr -; ASM: vldr - %v1 = load %T232, %T232* %loadaddr2 -; ASM: vldr - %r3 = mul %T232 %v0, %v1 -; ASM: vmul.i32 -; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> - %st = sext %T232 %r3 to %T264 -; ASM: vmovl.s32 -; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64> - store %T264 %st, %T264* %storeaddr -; ASM: vst1.64 - ret void -} - -define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { -; COST: function 'upu3264': - %v0 = load %T232, %T232* %loadaddr -; ASM: vldr - %v1 = load %T232, %T232* %loadaddr2 -; ASM: vldr - %r3 = mul %T232 %v0, %v1 -; ASM: vmul.i32 -; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> - %st = zext %T232 %r3 to %T264 -; ASM: vmovl.u32 -; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64> - store %T264 %st, %T264* %storeaddr -; ASM: vst1.64 - ret void -} - -define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) { -; COST: function 'dn3216': - %v0 = load %T432, %T432* %loadaddr -; ASM: vld1.64 - %v1 = load %T432, %T432* %loadaddr2 -; ASM: vld1.64 - %r3 = mul %T432 %v0, %v1 -; ASM: vmul.i32 -; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> - %st = trunc %T432 %r3 to %T416 -; ASM: vmovn.i32 -; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16> - store %T416 %st, %T416* %storeaddr -; ASM: vstr - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll deleted file mode 100644 index a1cf4b318f3..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ /dev/null @@ -1,165 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s - -; This test is reduced from SPECFP 2006 482.sphinx. -; We expect vectorization with <2 x double> and <2 x float> ops. -; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details. - - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" - -@a = external global i32 -@v = external global i32 -@mm = external global float** -@vv = external global float** -@ll = external global float* - -define i32 @test(float* nocapture readonly %x) { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[T:%.*]] = load i32, i32* @v, align 8 -; CHECK-NEXT: [[T1:%.*]] = load i32, i32* @a, align 4 -; CHECK-NEXT: br label [[OUTERLOOP:%.*]] -; CHECK: outerloop: -; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ] -; CHECK-NEXT: [[T3:%.*]] = load float**, float*** @mm, align 4 -; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]] -; CHECK-NEXT: [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4 -; CHECK-NEXT: [[T5:%.*]] = load float**, float*** @vv, align 4 -; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]] -; CHECK-NEXT: [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4 -; CHECK-NEXT: [[T7:%.*]] = load float*, float** @ll, align 4 -; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]] -; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4 -; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1> -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>* -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 -; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[INNERLOOP:%.*]] -; CHECK: innerloop: -; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ] -; CHECK-NEXT: [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ] -; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]] -; CHECK-NEXT: [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4 -; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]] -; CHECK-NEXT: [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]] -; CHECK-NEXT: [[CONV122:%.*]] = fpext float [[SUB121]] to double -; CHECK-NEXT: [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]] -; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]] -; CHECK-NEXT: [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4 -; CHECK-NEXT: [[CONV125:%.*]] = fpext float [[T11]] to double -; CHECK-NEXT: [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]] -; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]] -; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1 -; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]] -; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2 -; CHECK: outerend: -; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32 -; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]] -; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1 -; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]] -; CHECK-NEXT: [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4 -; CHECK-NEXT: [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1 -; CHECK-NEXT: br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: ret i32 [[CALL142]] -; -entry: - %t = load i32, i32* @v, align 8 - %t1 = load i32, i32* @a, align 4 - br label %outerloop - -outerloop: - %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ] - %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ] - %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ] - %t3 = load float**, float*** @mm, align 4 - %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2 - %t4 = load float*, float** %arrayidx109, align 4 - %t5 = load float**, float*** @vv, align 4 - %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2 - %t6 = load float*, float** %arrayidx111, align 4 - %t7 = load float*, float** @ll, align 4 - %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2 - %t8 = load float, float* %arrayidx113, align 4 - %conv114 = fpext float %t8 to double - br label %innerloop - -innerloop: - %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ] - %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ] - %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132 - %t9 = load float, float* %arrayidx119, align 4 - %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132 - %t10 = load float, float* %arrayidx120, align 4 - %sub121 = fsub fast float %t9, %t10 - %conv122 = fpext float %sub121 to double - %mul123 = fmul fast double %conv122, %conv122 - %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132 - %t11 = load float, float* %arrayidx124, align 4 - %conv125 = fpext float %t11 to double - %mul126 = fmul fast double %mul123, %conv125 - %sub127 = fsub fast double %dval1.4131, %mul126 - %inc129 = add nuw nsw i32 %i.2132, 1 - %exitcond143 = icmp eq i32 %inc129, %t - br i1 %exitcond143, label %outerend, label %innerloop - -outerend: - %sub127.lcssa = phi double [ %sub127, %innerloop ] - %conv138 = fptosi double %sub127.lcssa to i32 - %call142 = add nuw nsw i32 %score.1135, %conv138 - %inc144 = add nuw nsw i32 %j.0136, 1 - %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144 - %v17 = load i32, i32* %arrayidx102, align 4 - %cmp103 = icmp sgt i32 %v17, -1 - br i1 %cmp103, label %outerloop, label %exit - -exit: - ret i32 %call142 -} - diff --git a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll deleted file mode 100644 index 3be22d708da..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv7--linux-gnueabi" - -; This requires the loop vectorizer to create an interleaved access group -; for the stores to the struct. Here we need to perform a bitcast from a vector -; of pointers to a vector i32s. - -%class.A = type { i8*, i32 } - -; CHECK-LABEL: test0 -define void @test0(%class.A* %StartPtr, %class.A* %APtr) { -entry: - br label %for.body.i - -for.body.i: - %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ] - %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0 - store i8* null, i8** %Data.i.i, align 4, !tbaa !8 - %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1 - store i32 0, i32* %Length.i.i, align 4, !tbaa !11 - %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1 - %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr - br i1 %cmp.i, label %exit, label %for.body.i - -exit: - ret void -} - -!5 = !{!"any pointer", !6, i64 0} -!6 = !{!"omnipotent char", !7, i64 0} -!7 = !{!"Simple C/C++ TBAA"} -!8 = !{!9, !5, i64 0} -!9 = !{!"some struct", !5, i64 0, !10, i64 4} -!10 = !{!"int", !6, i64 0} -!11 = !{!9, !10, i64 4} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll deleted file mode 100644 index 66d2556dfb8..00000000000 --- a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll +++ /dev/null @@ -1,52 +0,0 @@ -; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -target triple = "thumbv7-apple-ios3.0.0" - -;CHECK:foo_F32 -;CHECK: <4 x float> -;CHECK:ret -define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ] - %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv - %3 = load float, float* %2, align 8 - %4 = fmul fast float %prod.01, %3 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ] - ret float %prod.0.lcssa -} - -;CHECK:foo_I8 -;CHECK: xor <16 x i8> -;CHECK:ret -define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0, %.lr.ph - %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] - %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ] - %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv - %3 = load i8, i8* %2, align 1 - %4 = xor i8 %3, %red.01 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %n - br i1 %exitcond, label %._crit_edge, label %.lr.ph - -._crit_edge: ; preds = %.lr.ph, %0 - %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ] - ret i8 %red.0.lcssa -} - - |