| author | Sanjay Patel <spatel@rotateright.com> | 2018-02-19 16:11:44 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2018-02-19 16:11:44 +0000 |
| commit | 3e8a76abfda50c5416558875fe799ffa44c169af (patch) | |
| tree | 11c4bc67924f5267423175aed5801228a2f84af9 /llvm/test/Transforms/LoopVectorize | |
| parent | c7e51805ff52e84594b0514d5bdf31579434b80c (diff) | |
[TTI CostModel] change default cost of FP ops to 1 (PR36280)
This change was mentioned at least as far back as:
https://bugs.llvm.org/show_bug.cgi?id=26837#c26
...and I found a real program that is harmed by this:
Himeno running on AMD Jaguar gets 6% slower with SLP vectorization:
https://bugs.llvm.org/show_bug.cgi?id=36280
...but the change here appears to solve that bug only accidentally.
The div/rem costs for x86 look very wrong in some cases, but that was already true before this patch, so those can be fixed in follow-up patches. There's also evidence that more cost model changes are needed to solve SLP problems, as shown in D42981, but that's an independent problem (though its solution may need adjusting after this change lands).
Differential Revision: https://reviews.llvm.org/D43079
llvm-svn: 325515
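For context, the per-instruction costs that drive decisions like the vectorizer's can be printed directly. Below is a minimal sketch, not part of the patch: it uses opt's cost-model analysis interface from this era (`-cost-model -analyze`), and `fadd.ll` is a hypothetical scratch file.

```sh
# Minimal sketch: ask the cost model what a lone fadd costs.
# fadd.ll is a hypothetical scratch file, not part of this patch.
cat > fadd.ll <<'EOF'
define double @f(double %a, double %b) {
  %s = fadd double %a, %b
  ret double %s
}
EOF
opt -cost-model -analyze -mtriple=x86_64-apple-darwin fadd.ll
# Expect output along the lines of:
#   Cost Model: Found an estimated cost of 1 for instruction: %s = fadd double %a, %b
# With this patch the default TTI cost of FP ops such as fadd becomes 1,
# which shifts profitability decisions like the vectorizer's below.
```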
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll | 45 |
1 file changed, 9 insertions(+), 36 deletions(-)
```diff
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 25e164fc1c9..07eb4cbcbc0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s
 
+; FIXME: The intent is that we should be able to vectorize this on x86
+; because that would be profitable, but the cost model says it is not.
+
 ; Two mostly identical functions. The only difference is the presence of
 ; fast-math flags on the second. The loop is a pretty simple reduction:
 
 ; for (int i = 0; i < 32; ++i)
-;   if (arr[i] != 42)
+;   if (arr[i] != 42.0)
 ;     tot += arr[i];
 
 define double @sumIfScalar(double* nocapture readonly %arr) {
@@ -66,41 +69,11 @@ done:
 define double @sumIfVector(double* nocapture readonly %arr) {
 ; CHECK-LABEL: @sumIfVector(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[PREDPHI]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
-; CHECK-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
 ; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
 ; CHECK-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
 ; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
@@ -113,9 +86,9 @@ define double @sumIfVector(double* nocapture readonly %arr) {
 ; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
 ; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
 ; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
 ; CHECK:       done:
-; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
 ; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
 ;
 entry:
```
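The deleted vector.body/middle.block/scalar.ph check lines reflect that, under the new FP-op costs, the cost model no longer considers this loop profitable to vectorize (hence the FIXME added at the top of the test). Since the test's NOTE line says its assertions are autogenerated, an update like this is normally produced by rerunning the script rather than hand-editing. A sketch follows; the `build/bin/opt` path is an assumption about the local tree.

```sh
# Sketch: regenerate the autogenerated CHECK lines after a cost-model change,
# using the script named in the test's NOTE line. The path to the freshly
# built opt binary (build/bin/opt) is an assumption about the local tree.
llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
  llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
```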