author    Sanjay Patel <spatel@rotateright.com>  2018-02-19 16:11:44 +0000
committer Sanjay Patel <spatel@rotateright.com>  2018-02-19 16:11:44 +0000
commit    3e8a76abfda50c5416558875fe799ffa44c169af (patch)
tree      11c4bc67924f5267423175aed5801228a2f84af9 /llvm/test/Transforms/LoopVectorize
parent    c7e51805ff52e84594b0514d5bdf31579434b80c (diff)
[TTI CostModel] change default cost of FP ops to 1 (PR36280)
This change was mentioned at least as far back as:
https://bugs.llvm.org/show_bug.cgi?id=26837#c26
...and I found a real program that is harmed by this: Himeno running on AMD Jaguar
gets 6% slower with SLP vectorization:
https://bugs.llvm.org/show_bug.cgi?id=36280
...but the change here appears to solve that bug only accidentally.

The div/rem costs for x86 look very wrong in some cases, but that's already true,
so we can fix those in follow-up patches.

There's also evidence that more cost model changes are needed to solve SLP problems
as shown in D42981, but that's an independent problem (though the solution may be
adjusted after this change is made).

Differential Revision: https://reviews.llvm.org/D43079

llvm-svn: 325515
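The effect on the test below: with the default scalar FP cost lowered to 1, the cost model rates the scalar reduction cheaply enough that vectorization no longer appears profitable on this target, so the vector CHECK lines are replaced with a plain scalar loop (see the FIXME added to the test). For reference, the value being changed is the per-instruction cost reported by the cost-model analysis printer. The snippet below is a minimal illustrative sketch in the style of the existing CostModel tests; the function name and RUN line are not part of this commit, and the exact printed cost depends on the target:

; Illustrative only: ask the cost model what a scalar fadd costs.
; After this patch the generic default cost for FP arithmetic is 1
; (it was higher before), subject to per-target overrides.
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-darwin
define double @fadd_cost(double %a, double %b) {
  %r = fadd double %a, %b
  ret double %r
}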
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll | 45
1 file changed, 9 insertions(+), 36 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 25e164fc1c9..07eb4cbcbc0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -1,11 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s
+; FIXME: The intent is that we should be able to vectorize this on x86
+; because that would be profitable, but the cost model says it is not.
+
; Two mostly identical functions. The only difference is the presence of
; fast-math flags on the second. The loop is a pretty simple reduction:
; for (int i = 0; i < 32; ++i)
-; if (arr[i] != 42)
+; if (arr[i] != 42.0)
; tot += arr[i];
define double @sumIfScalar(double* nocapture readonly %arr) {
@@ -66,41 +69,11 @@ done:
define double @sumIfVector(double* nocapture readonly %arr) {
; CHECK-LABEL: @sumIfVector(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[PREDPHI]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI]], [[RDX_SHUF]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
-; CHECK-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
; CHECK-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
@@ -113,9 +86,9 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; CHECK-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
; CHECK: done:
-; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
; CHECK-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
entry: