summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/LoopVectorize
diff options
context:
space:
mode:
authorRenato Golin <renato.golin@linaro.org>2016-04-14 20:42:18 +0000
committerRenato Golin <renato.golin@linaro.org>2016-04-14 20:42:18 +0000
commit5cb666add7d47962c82db4da224a7978fb69f403 (patch)
tree2d94cd96392937da81a6b9708c399ac1d8b5c8a8 /llvm/test/Transforms/LoopVectorize
parente998b91d865e6d0db389b026d7b85f65ef5a2cad (diff)
downloadbcm5719-llvm-5cb666add7d47962c82db4da224a7978fb69f403.tar.gz
bcm5719-llvm-5cb666add7d47962c82db4da224a7978fb69f403.zip
[ARM] Adding IEEE-754 SIMD detection to loop vectorizer
Some SIMD implementations are not IEEE-754 compliant, for example ARM's NEON. This patch teaches the loop vectorizer to only allow transformations of loops that either contain no floating-point operations or have enough allowance flags supporting lack of precision (ex. -ffast-math, Darwin). For that, the target description now has a method which tells us if the vectorizer is allowed to handle FP math without falling into unsafe representations, plus a check on every FP instruction in the candidate loop to check for the safety flags. This commit makes LLVM behave like GCC with respect to ARM NEON support, but it stops short of fixing the underlying problem: sub-normals. Neither GCC nor LLVM has a flag for allowing sub-normal operations. Before this patch, GCC only allows it using unsafe-math flags and LLVM allows it by default with no way to turn it off (short of not using NEON at all). As a first step, we push this change to make it safe and in sync with GCC. The second step is to discuss a new sub-normal's flag on both communities and come up with a common solution. The third step is to improve the FastMath flags in LLVM to encode sub-normals and use those flags to restrict NEON FP. Fixes PR16275. llvm-svn: 266363
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll335
1 files changed, 335 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
new file mode 100644
index 00000000000..e224c82c909
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,335 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V7
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V8
+; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; regarding IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
+; because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, and all optimizations
+; are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+; C equivalent:
+;   void sumi(const int *A, const int *B, int *C, unsigned N)
+;   { for (unsigned i = 0; i < N; i++) C[i] = A[i] * B[i]; }
+; Contains no floating-point operations, so the IEEE-754/NEON safety
+; check never applies and the loop vectorizes on every target.
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader ; skip loop when N == 0
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0 ; integer multiply — no FP safety concern
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX-V7: Checking a loop in "sumf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "sumf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+; C equivalent:
+;   void sumf(const float *A, const float *B, float *C, unsigned N)
+;   { for (unsigned i = 0; i < N; i++) C[i] = A[i] * B[i]; }
+; The fmul below carries NO fast-math flags, so on ARMv7 (whose NEON unit
+; is not IEEE-754 compliant) the vectorizer must reject the loop; ARMv8 and
+; Darwin are expected to accept it, per the prefixes checked above.
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader ; skip loop when N == 0
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1 ; strict FP op: no fast-math flags here
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+; C equivalent (dot-product style reduction):
+;   int redi(const int *a, const int *b, unsigned N)
+;   { int r /* uninit */; for (unsigned i = 0; i < N; i++) r += a[i] * b[i]; return r; }
+; Integer-only reduction; the reduction phi starts at undef (matching an
+; uninitialized accumulator in the original C), which is irrelevant to the
+; FP-safety check being tested.
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06 ; integer reduction step
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX-V7: Checking a loop in "redf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "redf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+; C equivalent (FP reduction):
+;   float redf(const float *a, const float *b, unsigned N)
+;   { float r /* uninit */; for (unsigned i = 0; i < N; i++) r += a[i] * b[i]; return r; }
+; fmul/fadd carry no fast-math flags: vectorizing this reduction reorders
+; FP additions, so it must be rejected on ARMv7/NEON and allowed only where
+; the target (ARMv8, Darwin) is considered safe — see prefixes above.
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1 ; strict FP multiply
+ %add = fadd float %Red.06, %mul ; strict FP reduction step
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX-V7: Checking a loop in "fabs"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "fabs"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+; C equivalent:
+;   void fabs_loop(const float *A, const float *B, float *C, unsigned N)
+;   { for (unsigned i = 0; i < N; i++) C[i] = A[i] * fabsf(B[i]); }
+; The fabsf call uses the strict-FP attribute set #1 ("unsafe-fp-math"="false");
+; per the comment above, such calls turn into builtins and must still be
+; treated as FP ops by the safety check (rejected on ARMv7).
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body ; no separate preheader in this one
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call float @fabsf(float %1) #1
+ %conv3 = fmul float %0, %fabsf ; strict FP multiply
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+; Identical body to @sumi above; kept as the "_fast" counterpart so the
+; integer cases are exercised in both halves of the test. Integer math has
+; no fast-math notion, so this must vectorize everywhere regardless.
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops can be vectorizable with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+; Same loop as @sumf, but the fmul carries the 'fast' flag (-ffast-math),
+; which grants the precision allowance the safety check requires — so this
+; must vectorize on ARMv7, ARMv8 and Darwin alike (CHECK prefix).
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0 ; 'fast' flag makes this FP op safe to vectorize
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+; Identical body to @redi above; the "_fast" integer reduction counterpart.
+; Integer reductions are unaffected by fast-math, so this must always
+; vectorize.
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorizable with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+; Same FP reduction as @redf, but both fmul and fadd carry 'fast', so the
+; reassociation needed to vectorize the reduction is explicitly permitted —
+; expected to vectorize on all three triples (CHECK prefix).
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %add = fadd fast float %mul, %Red.06 ; 'fast' permits reduction reordering
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+; Same loop as @fabs, but the call and multiply are marked 'fast' and the
+; callee uses attribute set #2 ("unsafe-fp-math"="true", no NaNs/Infs), so
+; the builtin-call case must also vectorize on all targets (CHECK prefix).
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call fast float @fabsf(float %1) #2
+ %conv3 = fmul fast float %fabsf, %0
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; fabsf has no body here; per the comments above, such calls turn into
+; builtins, so the vectorizer still has to account for them as FP ops.
+declare float @fabsf(float)
+
+; Attribute set #1 (used by @fabs): strict FP — "unsafe-fp-math"="false",
+; NaNs and Infs honored. Set #2 (used by @fabs_fast): fast-math — 
+; "unsafe-fp-math"="true" with "no-infs-fp-math"/"no-nans-fp-math"="true".
+; The only differences between the two sets are those three FP flags.
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
OpenPOWER on IntegriCloud