| field | value | date |
|---|---|---|
| author | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
| committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
| commit | cee313d288a4faf0355d76fb6e0e927e211d08a5 | |
| tree | d386075318d761197779a96e5d8fc0dc7b06342b | |
| parent | c3d6a929fdd92fd06d4304675ade8d7210ee711a | |
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory.
Will be re-reverting again.
llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/ARM')

| file | insertions |
|---|---|
| llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll | 330 |
| llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll | 71 |
| llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll | 88 |
| llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll | 60 |
| llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll | 147 |
| llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg | 3 |
| llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll | 114 |
| llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll | 165 |
| llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll | 37 |
| llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll | 52 |

10 files changed, 1067 insertions, 0 deletions
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
new file mode 100644
index 00000000000..369568f6dfa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,330 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; regarding IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
+; because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, and all optimizations
+; are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "sumf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX: Checking a loop in "redf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %add = fadd float %Red.06, %mul
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX: Checking a loop in "fabs"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call float @fabsf(float %1) #1
+  %conv3 = fmul float %0, %fabsf
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %add = fadd fast float %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call fast float @fabsf(float %1) #2
+  %conv3 = fmul fast float %fabsf, %0
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
```
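The whole file hinges on a single IR detail: whether the floating-point operations carry fast-math flags (@sumf and @sumf_fast differ only in `fmul` vs. `fmul fast`, plus the function attributes). A minimal sketch of that contrast, hand-written here rather than taken from the commit, that can be fed to the same `opt` invocation the RUN lines use:

```llvm
; Sketch only -- not part of the commit. To try one configuration outside
; of lit (assuming an assertions-enabled build of opt, per REQUIRES: asserts):
;   opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S \
;       arm-ieee-vectorize.ll -debug-only=loop-vectorize -o /dev/null

define float @strict_mul(float %a, float %b) {
  ; No fast-math flags: on ARM/Linux the vectorizer treats NEON FP as
  ; potentially IEEE-unsafe, so loops built from ops like this are rejected.
  %m = fmul float %a, %b
  ret float %m
}

define float @relaxed_mul(float %a, float %b) {
  ; The `fast` flag waives IEEE strictness; a loop built from this op
  ; vectorizes on the Linux triples as well.
  %m = fmul fast float %a, %b
  ret float %m
}
```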
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
new file mode 100644
index 00000000000..7b09913636f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK-LABEL: @foo(
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT-LABEL: @foo(
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+; Verify the register limit. On arm we don't have 16 allocatable registers.
+;SWIFTUNROLL-LABEL: @register_limit(
+;SWIFTUNROLL: load i32
+;SWIFTUNROLL-NOT: load i32
+define i32 @register_limit(i32* nocapture %A, i32 %n) {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+  %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
+  %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
+  %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+  %3 = load i32, i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %6 = add nsw i32 %3, %sum.02
+  %7 = add nsw i32 %3, %sum.03
+  %8 = add nsw i32 %3, %sum.04
+  %9 = add nsw i32 %3, %sum.05
+  %10 = add nsw i32 %3, %sum.05
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
+  %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
+  %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
+  %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
```
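The SWIFT prefix above expects two `<4 x i32>` loads in the vector body, i.e. an interleave count of 2 on top of VF 4. A hand-written sketch of that shape (not compiler output; names and the fixed trip count of 1024 are illustrative):

```llvm
define i32 @foo_interleaved_sketch(i32* %A) {
entry:
  br label %vector.body

vector.body:
  ; Two independent partial sums stay in flight per iteration, which is
  ; what makes interleaving profitable on a core like Swift.
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %sum0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %add0, %vector.body ]
  %sum1 = phi <4 x i32> [ zeroinitializer, %entry ], [ %add1, %vector.body ]
  %gep0 = getelementptr inbounds i32, i32* %A, i32 %index
  %bc0 = bitcast i32* %gep0 to <4 x i32>*
  %ld0 = load <4 x i32>, <4 x i32>* %bc0, align 4
  %idx1 = add i32 %index, 4
  %gep1 = getelementptr inbounds i32, i32* %A, i32 %idx1
  %bc1 = bitcast i32* %gep1 to <4 x i32>*
  %ld1 = load <4 x i32>, <4 x i32>* %bc1, align 4
  %add0 = add nsw <4 x i32> %ld0, %sum0
  %add1 = add nsw <4 x i32> %ld1, %sum1
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, 1024
  br i1 %done, label %exit, label %vector.body

exit:
  ; Combine the two partial sums, then reduce horizontally.
  %rdx = add <4 x i32> %add0, %add1
  %shuf1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx1 = add <4 x i32> %rdx, %shuf1
  %shuf2 = shufflevector <4 x i32> %rdx1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx2 = add <4 x i32> %rdx1, %shuf2
  %sum = extractelement <4 x i32> %rdx2, i32 0
  ret i32 %sum
}
```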
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
new file mode 100644
index 00000000000..6d1fa6f36a9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
@@ -0,0 +1,88 @@
+; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+@kernel = global [512 x float] zeroinitializer, align 4
+@kernel2 = global [512 x float] zeroinitializer, align 4
+@kernel3 = global [512 x float] zeroinitializer, align 4
+@kernel4 = global [512 x float] zeroinitializer, align 4
+@src_data = global [1536 x float] zeroinitializer, align 4
+@r_ = global i8 0, align 4
+@g_ = global i8 0, align 4
+@b_ = global i8 0, align 4
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: <2 x float>
+; CHECK-NOT: <4 x float>
+
+define void @_Z4testmm(i32 %size, i32 %offset) {
+entry:
+  %cmp53 = icmp eq i32 %size, 0
+  br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+  %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+  %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+  %add = add i32 %v.055, %offset
+  %mul = mul i32 %add, 3
+  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
+  %1 = load float, float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %1
+  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
+  %2 = load float, float* %arrayidx4, align 4
+  %mul5 = fmul fast float %mul3, %2
+  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
+  %3 = load float, float* %arrayidx6, align 4
+  %mul7 = fmul fast float %mul5, %3
+  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
+  %4 = load float, float* %arrayidx8, align 4
+  %mul9 = fmul fast float %mul7, %4
+  %add10 = fadd fast float %r.057, %mul9
+  %arrayidx.sum = add i32 %mul, 1
+  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
+  %5 = load float, float* %arrayidx11, align 4
+  %mul13 = fmul fast float %1, %5
+  %mul15 = fmul fast float %2, %mul13
+  %mul17 = fmul fast float %3, %mul15
+  %mul19 = fmul fast float %4, %mul17
+  %add20 = fadd fast float %g.056, %mul19
+  %arrayidx.sum52 = add i32 %mul, 2
+  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
+  %6 = load float, float* %arrayidx21, align 4
+  %mul23 = fmul fast float %1, %6
+  %mul25 = fmul fast float %2, %mul23
+  %mul27 = fmul fast float %3, %mul25
+  %mul29 = fmul fast float %4, %mul27
+  %add30 = fadd fast float %b.054, %mul29
+  %inc = add i32 %v.055, 1
+  %exitcond = icmp ne i32 %inc, %size
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+  %add30.lcssa = phi float [ %add30, %for.body ]
+  %add20.lcssa = phi float [ %add20, %for.body ]
+  %add10.lcssa = phi float [ %add10, %for.body ]
+  %phitmp = fptoui float %add10.lcssa to i8
+  %phitmp60 = fptoui float %add20.lcssa to i8
+  %phitmp61 = fptoui float %add30.lcssa to i8
+  br label %for.end
+
+for.end:
+  %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  store i8 %r.0.lcssa, i8* @r_, align 4
+  store i8 %g.0.lcssa, i8* @g_, align 4
+  store i8 %b.0.lcssa, i8* @b_, align 4
+  ret void
+}
```
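The "gathers" here are the stride-3 accesses to @src_data (indices 3*v, 3*v+1, 3*v+2). ARMv7 NEON has no hardware gather, and this RUN line disables interleaved-access groups, so at VF 4 each such load would have to be scalarized: four scalar loads plus lane inserts, as in this hand-written sketch (not part of the commit):

```llvm
; What the vectorizer would have to emit for one stride-3 load at VF 4;
; the cost of this expansion is why the test expects no vector code.
define <4 x float> @stride3_load_sketch(float* %base) {
  %p1 = getelementptr inbounds float, float* %base, i32 3
  %p2 = getelementptr inbounds float, float* %base, i32 6
  %p3 = getelementptr inbounds float, float* %base, i32 9
  %a0 = load float, float* %base, align 4
  %a1 = load float, float* %p1, align 4
  %a2 = load float, float* %p2, align 4
  %a3 = load float, float* %p3, align 4
  %v0 = insertelement <4 x float> undef, float %a0, i32 0
  %v1 = insertelement <4 x float> %v0, float %a1, i32 1
  %v2 = insertelement <4 x float> %v1, float %a2, i32 2
  %v3 = insertelement <4 x float> %v2, float %a3, i32 3
  ret <4 x float> %v3
}
```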
"e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK-LABEL: @example1( +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK-LABEL: @example10b( +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv + %3 = load i16, i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll new file mode 100644 index 00000000000..29adec049f6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -0,0 +1,147 @@ +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "armv8--linux-gnueabihf" + +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { +entry: + br label %for.body + +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an 
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
new file mode 100644
index 00000000000..29adec049f6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -0,0 +1,147 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "armv8--linux-gnueabihf"
+
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%half.2 = type {half, half}
+define void @half_factor_2(%half.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
+  %tmp2 = load half, half* %tmp0, align 2
+  %tmp3 = load half, half* %tmp1, align 2
+  store half 0., half* %tmp0, align 2
+  store half 0., half* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
```
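A reading aid for the CHECK lines above: the cost of an interleave group is attached to the first load and the last store of the group, and every other member reports 0. The group itself is modeled as one wide access plus strided shuffles, roughly as in this hand-written factor-2 sketch at VF 8 (my illustration, not compiler output from this test):

```llvm
define void @factor2_group_sketch(<16 x i8>* %p) {
  ; Load side: one wide load covers both struct members, then two strided
  ; shuffles split them apart; on ARM this maps well to vld2.8.
  %wide = load <16 x i8>, <16 x i8>* %p, align 1
  %even = shufflevector <16 x i8> %wide, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <16 x i8> %wide, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; Store side: re-interleave the two members and emit one wide store
  ; (the vst2.8 pattern).
  %ileave = shufflevector <8 x i8> %even, <8 x i8> %odd, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %ileave, <16 x i8>* %p, align 1
  ret void
}
```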
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
new file mode 100644
index 00000000000..98c6700c209
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
new file mode 100644
index 00000000000..e88fcca1225
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
+; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
+
+; ModuleID = 'arm.ll'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%T216 = type <2 x i16>
+%T232 = type <2 x i32>
+%T264 = type <2 x i64>
+
+%T416 = type <4 x i16>
+%T432 = type <4 x i32>
+%T464 = type <4 x i64>
+
+define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'direct':
+  %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmul.i32
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'ups1632':
+  %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+  %r1 = sext %T416 %v0 to %T432
+  %r2 = sext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.s16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'upu1632':
+  %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+  %r1 = zext %T416 %v0 to %T432
+  %r2 = zext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.u16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'ups3264':
+  %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = sext %T232 %r3 to %T264
+; ASM: vmovl.s32
+; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'upu3264':
+  %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = zext %T232 %r3 to %T264
+; ASM: vmovl.u32
+; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
+; COST: function 'dn3216':
+  %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+  %st = trunc %T432 %r3 to %T416
+; ASM: vmovn.i32
+; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
+  store %T416 %st, %T416* %storeaddr
+; ASM: vstr
+  ret void
+}
```
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
new file mode 100644
index 00000000000..a1cf4b318f3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
+
+; This test is reduced from SPECFP 2006 482.sphinx.
+; We expect vectorization with <2 x double> and <2 x float> ops.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@a = external global i32
+@v = external global i32
+@mm = external global float**
+@vv = external global float**
+@ll = external global float*
+
+define i32 @test(float* nocapture readonly %x) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T:%.*]] = load i32, i32* @v, align 8
+; CHECK-NEXT: [[T1:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: br label [[OUTERLOOP:%.*]]
+; CHECK: outerloop:
+; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
+; CHECK-NEXT: [[T3:%.*]] = load float**, float*** @mm, align 4
+; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
+; CHECK-NEXT: [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float**, float*** @vv, align 4
+; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
+; CHECK-NEXT: [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float*, float** @ll, align 4
+; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
+; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
+; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[INNERLOOP:%.*]]
+; CHECK: innerloop:
+; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
+; CHECK-NEXT: [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
+; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
+; CHECK-NEXT: [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
+; CHECK-NEXT: [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
+; CHECK-NEXT: [[CONV122:%.*]] = fpext float [[SUB121]] to double
+; CHECK-NEXT: [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
+; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
+; CHECK-NEXT: [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
+; CHECK-NEXT: [[CONV125:%.*]] = fpext float [[T11]] to double
+; CHECK-NEXT: [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
+; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
+; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
+; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2
+; CHECK: outerend:
+; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
+; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
+; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1
+; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
+; CHECK-NEXT: [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
+; CHECK-NEXT: [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
+; CHECK-NEXT: br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[CALL142]]
+;
+entry:
+  %t = load i32, i32* @v, align 8
+  %t1 = load i32, i32* @a, align 4
+  br label %outerloop
+
+outerloop:
+  %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
+  %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
+  %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
+  %t3 = load float**, float*** @mm, align 4
+  %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
+  %t4 = load float*, float** %arrayidx109, align 4
+  %t5 = load float**, float*** @vv, align 4
+  %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
+  %t6 = load float*, float** %arrayidx111, align 4
+  %t7 = load float*, float** @ll, align 4
+  %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
+  %t8 = load float, float* %arrayidx113, align 4
+  %conv114 = fpext float %t8 to double
+  br label %innerloop
+
+innerloop:
+  %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
+  %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
+  %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
+  %t9 = load float, float* %arrayidx119, align 4
+  %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
+  %t10 = load float, float* %arrayidx120, align 4
+  %sub121 = fsub fast float %t9, %t10
+  %conv122 = fpext float %sub121 to double
+  %mul123 = fmul fast double %conv122, %conv122
+  %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
+  %t11 = load float, float* %arrayidx124, align 4
+  %conv125 = fpext float %t11 to double
+  %mul126 = fmul fast double %mul123, %conv125
+  %sub127 = fsub fast double %dval1.4131, %mul126
+  %inc129 = add nuw nsw i32 %i.2132, 1
+  %exitcond143 = icmp eq i32 %inc129, %t
+  br i1 %exitcond143, label %outerend, label %innerloop
+
+outerend:
+  %sub127.lcssa = phi double [ %sub127, %innerloop ]
+  %conv138 = fptosi double %sub127.lcssa to i32
+  %call142 = add nuw nsw i32 %score.1135, %conv138
+  %inc144 = add nuw nsw i32 %j.0136, 1
+  %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
+  %v17 = load i32, i32* %arrayidx102, align 4
+  %cmp103 = icmp sgt i32 %v17, -1
+  br i1 %cmp103, label %outerloop, label %exit
+
+exit:
+  ret i32 %call142
+}
+
```
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
new file mode 100644
index 00000000000..3be22d708da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabi"
+
+; This requires the loop vectorizer to create an interleaved access group
+; for the stores to the struct. Here we need to perform a bitcast from a vector
+; of pointers to a vector i32s.
+
+%class.A = type { i8*, i32 }
+
+; CHECK-LABEL: test0
+define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
+entry:
+  br label %for.body.i
+
+for.body.i:
+  %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
+  %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
+  store i8* null, i8** %Data.i.i, align 4, !tbaa !8
+  %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
+  store i32 0, i32* %Length.i.i, align 4, !tbaa !11
+  %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
+  %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
+  br i1 %cmp.i, label %exit, label %for.body.i
+
+exit:
+  ret void
+}
+
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"some struct", !5, i64 0, !10, i64 4}
+!10 = !{!"int", !6, i64 0}
+!11 = !{!9, !10, i64 4}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 00000000000..66d2556dfb8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F32
+;CHECK: <4 x float>
+;CHECK:ret
+define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %3 = load float, float* %2, align 8
+  %4 = fmul fast float %prod.01, %3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret float %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %3 = load i8, i8* %2, align 1
+  %4 = xor i8 %3, %red.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i8 %red.0.lcssa
+}
+
+
```