-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfo.h              13
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h           2
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp                      4
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h                   4
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp               54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll  335
6 files changed, 406 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 735b022e309..1471890294a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -367,6 +367,15 @@ public:
/// \brief Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
+ /// \brief Indicate that it is potentially unsafe to automatically vectorize
+ /// floating-point operations because the semantics of vector and scalar
+ /// floating-point operations may differ. For example, ARM NEON v7 SIMD math
+ /// does not support IEEE-754 denormal numbers, while depending on the
+ /// platform, scalar floating-point math does.
+ /// This applies to floating-point math operations and calls, not memory
+ /// operations, shuffles, or casts.
+ bool isFPVectorizationPotentiallyUnsafe() const;
+
/// \brief Return hardware support for population count.
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
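To make the denormal hazard concrete: 2^-126 is the smallest normal single-precision value, so halving it yields a subnormal that IEEE-compliant scalar math preserves but that ARMv7 NEON, which runs single-precision math in flush-to-zero mode, turns into 0.0. A minimal IR sketch (illustrative only, not part of this patch):

define float @denormal_example(float %x) {
  ; For %x = 0x3810000000000000 (2^-126), the IEEE-compliant scalar result is
  ; the subnormal 2^-127; a v7 NEON vector multiply of the same values is 0.0.
  %r = fmul float %x, 5.000000e-01
  ret float %r
}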
@@ -621,6 +630,7 @@ public:
virtual bool shouldBuildLookupTables() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual bool enableInterleavedAccessVectorization() = 0;
+ virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
virtual bool haveFastSqrt(Type *Ty) = 0;
virtual int getFPOpCost(Type *Ty) = 0;
@@ -779,6 +789,9 @@ public:
bool enableInterleavedAccessVectorization() override {
return Impl.enableInterleavedAccessVectorization();
}
+ bool isFPVectorizationPotentiallyUnsafe() override {
+ return Impl.isFPVectorizationPotentiallyUnsafe();
+ }
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
return Impl.getPopcntSupport(IntTyWidthInBit);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 487de0892ff..899f3f57ef7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -240,6 +240,8 @@ public:
bool enableInterleavedAccessVectorization() { return false; }
+ bool isFPVectorizationPotentiallyUnsafe() { return false; }
+
TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
return TTI::PSK_Software;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 48e441bac69..9b23d7ca932 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -172,6 +172,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
return TTIImpl->enableInterleavedAccessVectorization();
}
+bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
+ return TTIImpl->isFPVectorizationPotentiallyUnsafe();
+}
+
TargetTransformInfo::PopcntSupportKind
TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
return TTIImpl->getPopcntSupport(IntTyWidthInBit);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 7808587c205..0fe964f36de 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,6 +54,10 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
+ bool isFPVectorizationPotentiallyUnsafe() {
+ return !ST->hasFPARMv8() && !ST->isTargetDarwin();
+ }
+
/// \name Scalar TTI Implementations
/// @{
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2c5fec64c18..8072d06c34b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -963,6 +963,9 @@ class LoopVectorizeHints {
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
+ /// True if the loop contains potentially unsafe FP math, i.e. floating-point
+ /// operations without fast-math flags.
+ bool PotentiallyUnsafe;
+
public:
enum ForceKind {
FK_Undefined = -1, ///< Not selected.
@@ -975,7 +978,7 @@ public:
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- TheLoop(L) {
+ PotentiallyUnsafe(false), TheLoop(L) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -1073,6 +1076,19 @@ public:
return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
}
+ bool isPotentiallyUnsafe() const {
+ // Avoid FP vectorization if the target is unsure about proper support.
+ // This may be related to the SIMD unit in the target not handling
+ // IEEE 754 FP ops properly, or bad single-to-double promotions.
+ // Otherwise, a sequence of vectorized loops, even without reduction,
+ // could lead to different end results on the destination vectors.
+ return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+ }
+
+ void setPotentiallyUnsafe() {
+ PotentiallyUnsafe = true;
+ }
+
private:
/// Find hints specified in the loop metadata and update local values.
void getHintsFromMetadata() {
@@ -1234,7 +1250,7 @@ public:
const TargetTransformInfo *TTI,
LoopAccessAnalysis *LAA,
LoopVectorizationRequirements *R,
- const LoopVectorizeHints *H)
+ LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
@@ -1460,7 +1476,7 @@ private:
LoopVectorizationRequirements *Requirements;
/// Used to emit an analysis of any legality issues.
- const LoopVectorizeHints *Hints;
+ LoopVectorizeHints *Hints;
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
@@ -1884,6 +1900,21 @@ struct LoopVectorize : public FunctionPass {
return false;
}
+ // Bail out if the loop contains potentially unsafe FP operations and the
+ // target reports that FP vectorization may be unsafe.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+ emitAnalysisDiag(
+ F, L, Hints,
+ VectorizationReport()
+ << "loop not vectorized due to unsafe FP support.");
+ emitMissedWarning(F, L, Hints);
+ return false;
+ }
+
// Select the optimal vectorization factor.
const LoopVectorizationCostModel::VectorizationFactor VF =
CM.selectVectorizationFactor(OptForSize);
@@ -4695,12 +4726,23 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
if (EnableMemAccessVersioning)
collectStridedAccess(ST);
- }
- if (EnableMemAccessVersioning)
- if (LoadInst *LI = dyn_cast<LoadInst>(it))
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(it)) {
+ if (EnableMemAccessVersioning)
collectStridedAccess(LI);
+ // FP instructions may carry unsafe-algebra (fast-math) flags, which make
+ // them vectorizable even by non-IEEE-754 compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as those don't change precision or
+ // semantics.
+ } else if (it->getType()->isFloatingPointTy() &&
+ (CI || it->isBinaryOp()) &&
+ !it->hasUnsafeAlgebra()) {
+ DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
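A minimal sketch (illustrative, not from the patch) of the two instruction forms the legality walk above distinguishes; the fast variant matches the fmul fast operations in the test below:

define void @fmf_forms(float %a, float %b, float* %p, float* %q) {
entry:
  ; No fast-math flags: hasUnsafeAlgebra() is false, so a loop containing this
  ; op is marked potentially unsafe for FP vectorization.
  %strict = fmul float %a, %b
  ; 'fast' sets the unsafe-algebra flag, so this op is exempt from the check.
  %relaxed = fmul fast float %a, %b
  store float %strict, float* %p, align 4
  store float %relaxed, float* %q, align 4
  ret void
}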
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
new file mode 100644
index 00000000000..e224c82c909
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,335 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V7
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V8
+; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; with regard to the IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when the -ffast-math flag is
+; set, because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals in scalar code either,
+; so all optimizations are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX-V7: Checking a loop in "sumf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "sumf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX-V7: Checking a loop in "redf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "redf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %add = fadd float %Red.06, %mul
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX-V7: Checking a loop in "fabs"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "fabs"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call float @fabsf(float %1) #1
+ %conv3 = fmul float %0, %fabsf
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops can be vectorized with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorized with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %add = fadd fast float %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call fast float @fabsf(float %1) #2
+ %conv3 = fmul fast float %fabsf, %0
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
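Because isPotentiallyUnsafe() returns false whenever the loop hint forces vectorization (getForce() == FK_Enabled), a strict-FP loop carrying explicit llvm.loop.vectorize.enable metadata should bypass the bail-out even on armv7-linux-gnueabihf. A sketch of such a case (not part of the committed test; behavior inferred from the hint logic above):

define void @sumf_forced(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  %cmp = icmp eq i32 %N, 0
  br i1 %cmp, label %for.end, label %for.body

for.body:
  %i = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i
  %1 = load float, float* %arrayidx1, align 4
  %mul = fmul float %0, %1              ; strict FP op, no fast-math flags
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret void
}

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}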