author     Sanjay Patel <spatel@rotateright.com>    2018-02-19 16:11:44 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2018-02-19 16:11:44 +0000
commit     3e8a76abfda50c5416558875fe799ffa44c169af (patch)
tree       11c4bc67924f5267423175aed5801228a2f84af9 /llvm/test
parent     c7e51805ff52e84594b0514d5bdf31579434b80c (diff)
[TTI CostModel] change default cost of FP ops to 1 (PR36280)
This change was mentioned at least as far back as:
https://bugs.llvm.org/show_bug.cgi?id=26837#c26
...and I found a real program that is harmed by this: Himeno running on AMD Jaguar gets 6% slower with SLP vectorization:
https://bugs.llvm.org/show_bug.cgi?id=36280

...but the change here appears to solve that bug only accidentally.

The div/rem costs for x86 look very wrong in some cases, but that's already true, so we can fix those in follow-up patches.

There's also evidence that more cost model changes are needed to solve SLP problems as shown in D42981, but that's an independent problem (though the solution may be adjusted after this change is made).

Differential Revision: https://reviews.llvm.org/D43079

llvm-svn: 325515
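To see where the numbers in the tests below come from, the cost model analysis can be printed directly from opt. This is a minimal sketch, not part of the patch: the file name is made up, and the exact flags are assumed to match the RUN lines used by the CostModel tests of this era.

; fadd-cost.ll (hypothetical reproducer)
define float @fadd_cost(float %a, float %b) {
  %r = fadd float %a, %b   ; with this patch: "Found an estimated cost of 1 for instruction: %r = fadd ..."
  ret float %r
}

$ opt < fadd-cost.ll -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 -cost-model -analyze

Rerunning the same invocation with different -mattr settings (sse2, sse4.2, avx, avx512f) is what produces the per-prefix costs checked in arith-fp.ll below.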
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/CostModel/X86/arith-fp.ll                         324
-rw-r--r--  llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll                     4
-rw-r--r--  llvm/test/Analysis/CostModel/X86/reduction.ll                          8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll      45
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll                  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll                     19
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cse.ll                         21
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll                 148
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll                 59
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll                    12
10 files changed, 291 insertions, 351 deletions
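The new check values in arith-fp.ll follow mechanically from the lower default once type legalization and scalarization are folded in. A rough sketch of that arithmetic, stated here as an assumption about the usual BasicTTI-style accounting rather than quoted from the patch:

  fadd <8 x float>,  SSE2:   2 legalized <4 x float> ops x cost 1         = 2   (was 2 x 2 = 4)
  fadd <16 x float>, AVX512: 1 legal 512-bit op x cost 1                  = 1   (was 2)
  frem <4 x float>,  any:    4 scalar ops x cost 1 + extract/insert of 6  = 10  (was 4 x 2 + 6 = 14)

frem has no vector instruction, so its vector cost stays dominated by the per-element scalarization overhead even after this change.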
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index e5043010c11..7e7057f372e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -10,54 +10,54 @@ target triple = "x86_64-apple-macosx10.8.0"
; CHECK-LABEL: 'fadd'
define i32 @fadd(i32 %arg) {
- ; SSE2: cost of 2 {{.*}} %F32 = fadd
- ; SSE42: cost of 2 {{.*}} %F32 = fadd
- ; AVX: cost of 2 {{.*}} %F32 = fadd
- ; AVX2: cost of 2 {{.*}} %F32 = fadd
- ; AVX512: cost of 2 {{.*}} %F32 = fadd
+ ; SSE2: cost of 1 {{.*}} %F32 = fadd
+ ; SSE42: cost of 1 {{.*}} %F32 = fadd
+ ; AVX: cost of 1 {{.*}} %F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %F32 = fadd
%F32 = fadd float undef, undef
- ; SSE2: cost of 2 {{.*}} %V4F32 = fadd
- ; SSE42: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX2: cost of 2 {{.*}} %V4F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V4F32 = fadd
+ ; SSE2: cost of 1 {{.*}} %V4F32 = fadd
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fadd
%V4F32 = fadd <4 x float> undef, undef
- ; SSE2: cost of 4 {{.*}} %V8F32 = fadd
- ; SSE42: cost of 4 {{.*}} %V8F32 = fadd
- ; AVX: cost of 2 {{.*}} %V8F32 = fadd
- ; AVX2: cost of 2 {{.*}} %V8F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V8F32 = fadd
+ ; SSE2: cost of 2 {{.*}} %V8F32 = fadd
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fadd
+ ; AVX: cost of 1 {{.*}} %V8F32 = fadd
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fadd
%V8F32 = fadd <8 x float> undef, undef
- ; SSE2: cost of 8 {{.*}} %V16F32 = fadd
- ; SSE42: cost of 8 {{.*}} %V16F32 = fadd
- ; AVX: cost of 4 {{.*}} %V16F32 = fadd
- ; AVX2: cost of 4 {{.*}} %V16F32 = fadd
- ; AVX512: cost of 2 {{.*}} %V16F32 = fadd
+ ; SSE2: cost of 4 {{.*}} %V16F32 = fadd
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fadd
+ ; AVX: cost of 2 {{.*}} %V16F32 = fadd
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fadd
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fadd
%V16F32 = fadd <16 x float> undef, undef
- ; SSE2: cost of 2 {{.*}} %F64 = fadd
- ; SSE42: cost of 2 {{.*}} %F64 = fadd
- ; AVX: cost of 2 {{.*}} %F64 = fadd
- ; AVX2: cost of 2 {{.*}} %F64 = fadd
- ; AVX512: cost of 2 {{.*}} %F64 = fadd
+ ; SSE2: cost of 1 {{.*}} %F64 = fadd
+ ; SSE42: cost of 1 {{.*}} %F64 = fadd
+ ; AVX: cost of 1 {{.*}} %F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %F64 = fadd
%F64 = fadd double undef, undef
- ; SSE2: cost of 2 {{.*}} %V2F64 = fadd
- ; SSE42: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX2: cost of 2 {{.*}} %V2F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V2F64 = fadd
+ ; SSE2: cost of 1 {{.*}} %V2F64 = fadd
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fadd
%V2F64 = fadd <2 x double> undef, undef
- ; SSE2: cost of 4 {{.*}} %V4F64 = fadd
- ; SSE42: cost of 4 {{.*}} %V4F64 = fadd
- ; AVX: cost of 2 {{.*}} %V4F64 = fadd
- ; AVX2: cost of 2 {{.*}} %V4F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V4F64 = fadd
+ ; SSE2: cost of 2 {{.*}} %V4F64 = fadd
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fadd
+ ; AVX: cost of 1 {{.*}} %V4F64 = fadd
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fadd
%V4F64 = fadd <4 x double> undef, undef
- ; SSE2: cost of 8 {{.*}} %V8F64 = fadd
- ; SSE42: cost of 8 {{.*}} %V8F64 = fadd
- ; AVX: cost of 4 {{.*}} %V8F64 = fadd
- ; AVX2: cost of 4 {{.*}} %V8F64 = fadd
- ; AVX512: cost of 2 {{.*}} %V8F64 = fadd
+ ; SSE2: cost of 4 {{.*}} %V8F64 = fadd
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fadd
+ ; AVX: cost of 2 {{.*}} %V8F64 = fadd
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fadd
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fadd
%V8F64 = fadd <8 x double> undef, undef
ret i32 undef
@@ -65,54 +65,54 @@ define i32 @fadd(i32 %arg) {
; CHECK-LABEL: 'fsub'
define i32 @fsub(i32 %arg) {
- ; SSE2: cost of 2 {{.*}} %F32 = fsub
- ; SSE42: cost of 2 {{.*}} %F32 = fsub
- ; AVX: cost of 2 {{.*}} %F32 = fsub
- ; AVX2: cost of 2 {{.*}} %F32 = fsub
- ; AVX512: cost of 2 {{.*}} %F32 = fsub
+ ; SSE2: cost of 1 {{.*}} %F32 = fsub
+ ; SSE42: cost of 1 {{.*}} %F32 = fsub
+ ; AVX: cost of 1 {{.*}} %F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %F32 = fsub
%F32 = fsub float undef, undef
- ; SSE2: cost of 2 {{.*}} %V4F32 = fsub
- ; SSE42: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX2: cost of 2 {{.*}} %V4F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V4F32 = fsub
+ ; SSE2: cost of 1 {{.*}} %V4F32 = fsub
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fsub
%V4F32 = fsub <4 x float> undef, undef
- ; SSE2: cost of 4 {{.*}} %V8F32 = fsub
- ; SSE42: cost of 4 {{.*}} %V8F32 = fsub
- ; AVX: cost of 2 {{.*}} %V8F32 = fsub
- ; AVX2: cost of 2 {{.*}} %V8F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V8F32 = fsub
+ ; SSE2: cost of 2 {{.*}} %V8F32 = fsub
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fsub
+ ; AVX: cost of 1 {{.*}} %V8F32 = fsub
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fsub
%V8F32 = fsub <8 x float> undef, undef
- ; SSE2: cost of 8 {{.*}} %V16F32 = fsub
- ; SSE42: cost of 8 {{.*}} %V16F32 = fsub
- ; AVX: cost of 4 {{.*}} %V16F32 = fsub
- ; AVX2: cost of 4 {{.*}} %V16F32 = fsub
- ; AVX512: cost of 2 {{.*}} %V16F32 = fsub
+ ; SSE2: cost of 4 {{.*}} %V16F32 = fsub
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fsub
+ ; AVX: cost of 2 {{.*}} %V16F32 = fsub
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fsub
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fsub
%V16F32 = fsub <16 x float> undef, undef
- ; SSE2: cost of 2 {{.*}} %F64 = fsub
- ; SSE42: cost of 2 {{.*}} %F64 = fsub
- ; AVX: cost of 2 {{.*}} %F64 = fsub
- ; AVX2: cost of 2 {{.*}} %F64 = fsub
- ; AVX512: cost of 2 {{.*}} %F64 = fsub
+ ; SSE2: cost of 1 {{.*}} %F64 = fsub
+ ; SSE42: cost of 1 {{.*}} %F64 = fsub
+ ; AVX: cost of 1 {{.*}} %F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %F64 = fsub
%F64 = fsub double undef, undef
- ; SSE2: cost of 2 {{.*}} %V2F64 = fsub
- ; SSE42: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX2: cost of 2 {{.*}} %V2F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V2F64 = fsub
+ ; SSE2: cost of 1 {{.*}} %V2F64 = fsub
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fsub
%V2F64 = fsub <2 x double> undef, undef
- ; SSE2: cost of 4 {{.*}} %V4F64 = fsub
- ; SSE42: cost of 4 {{.*}} %V4F64 = fsub
- ; AVX: cost of 2 {{.*}} %V4F64 = fsub
- ; AVX2: cost of 2 {{.*}} %V4F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V4F64 = fsub
+ ; SSE2: cost of 2 {{.*}} %V4F64 = fsub
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fsub
+ ; AVX: cost of 1 {{.*}} %V4F64 = fsub
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fsub
%V4F64 = fsub <4 x double> undef, undef
- ; SSE2: cost of 8 {{.*}} %V8F64 = fsub
- ; SSE42: cost of 8 {{.*}} %V8F64 = fsub
- ; AVX: cost of 4 {{.*}} %V8F64 = fsub
- ; AVX2: cost of 4 {{.*}} %V8F64 = fsub
- ; AVX512: cost of 2 {{.*}} %V8F64 = fsub
+ ; SSE2: cost of 4 {{.*}} %V8F64 = fsub
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fsub
+ ; AVX: cost of 2 {{.*}} %V8F64 = fsub
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fsub
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fsub
%V8F64 = fsub <8 x double> undef, undef
ret i32 undef
@@ -120,54 +120,54 @@ define i32 @fsub(i32 %arg) {
; CHECK-LABEL: 'fmul'
define i32 @fmul(i32 %arg) {
- ; SSE2: cost of 2 {{.*}} %F32 = fmul
- ; SSE42: cost of 2 {{.*}} %F32 = fmul
- ; AVX: cost of 2 {{.*}} %F32 = fmul
- ; AVX2: cost of 2 {{.*}} %F32 = fmul
- ; AVX512: cost of 2 {{.*}} %F32 = fmul
+ ; SSE2: cost of 1 {{.*}} %F32 = fmul
+ ; SSE42: cost of 1 {{.*}} %F32 = fmul
+ ; AVX: cost of 1 {{.*}} %F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %F32 = fmul
%F32 = fmul float undef, undef
- ; SSE2: cost of 2 {{.*}} %V4F32 = fmul
- ; SSE42: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX2: cost of 2 {{.*}} %V4F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V4F32 = fmul
+ ; SSE2: cost of 1 {{.*}} %V4F32 = fmul
+ ; SSE42: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %V4F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V4F32 = fmul
%V4F32 = fmul <4 x float> undef, undef
- ; SSE2: cost of 4 {{.*}} %V8F32 = fmul
- ; SSE42: cost of 4 {{.*}} %V8F32 = fmul
- ; AVX: cost of 2 {{.*}} %V8F32 = fmul
- ; AVX2: cost of 2 {{.*}} %V8F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V8F32 = fmul
+ ; SSE2: cost of 2 {{.*}} %V8F32 = fmul
+ ; SSE42: cost of 2 {{.*}} %V8F32 = fmul
+ ; AVX: cost of 1 {{.*}} %V8F32 = fmul
+ ; AVX2: cost of 1 {{.*}} %V8F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V8F32 = fmul
%V8F32 = fmul <8 x float> undef, undef
- ; SSE2: cost of 8 {{.*}} %V16F32 = fmul
- ; SSE42: cost of 8 {{.*}} %V16F32 = fmul
- ; AVX: cost of 4 {{.*}} %V16F32 = fmul
- ; AVX2: cost of 4 {{.*}} %V16F32 = fmul
- ; AVX512: cost of 2 {{.*}} %V16F32 = fmul
+ ; SSE2: cost of 4 {{.*}} %V16F32 = fmul
+ ; SSE42: cost of 4 {{.*}} %V16F32 = fmul
+ ; AVX: cost of 2 {{.*}} %V16F32 = fmul
+ ; AVX2: cost of 2 {{.*}} %V16F32 = fmul
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fmul
%V16F32 = fmul <16 x float> undef, undef
- ; SSE2: cost of 2 {{.*}} %F64 = fmul
- ; SSE42: cost of 2 {{.*}} %F64 = fmul
- ; AVX: cost of 2 {{.*}} %F64 = fmul
- ; AVX2: cost of 2 {{.*}} %F64 = fmul
- ; AVX512: cost of 2 {{.*}} %F64 = fmul
+ ; SSE2: cost of 1 {{.*}} %F64 = fmul
+ ; SSE42: cost of 1 {{.*}} %F64 = fmul
+ ; AVX: cost of 1 {{.*}} %F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %F64 = fmul
%F64 = fmul double undef, undef
- ; SSE2: cost of 2 {{.*}} %V2F64 = fmul
- ; SSE42: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX2: cost of 2 {{.*}} %V2F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V2F64 = fmul
+ ; SSE2: cost of 1 {{.*}} %V2F64 = fmul
+ ; SSE42: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %V2F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V2F64 = fmul
%V2F64 = fmul <2 x double> undef, undef
- ; SSE2: cost of 4 {{.*}} %V4F64 = fmul
- ; SSE42: cost of 4 {{.*}} %V4F64 = fmul
- ; AVX: cost of 2 {{.*}} %V4F64 = fmul
- ; AVX2: cost of 2 {{.*}} %V4F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V4F64 = fmul
+ ; SSE2: cost of 2 {{.*}} %V4F64 = fmul
+ ; SSE42: cost of 2 {{.*}} %V4F64 = fmul
+ ; AVX: cost of 1 {{.*}} %V4F64 = fmul
+ ; AVX2: cost of 1 {{.*}} %V4F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V4F64 = fmul
%V4F64 = fmul <4 x double> undef, undef
- ; SSE2: cost of 8 {{.*}} %V8F64 = fmul
- ; SSE42: cost of 8 {{.*}} %V8F64 = fmul
- ; AVX: cost of 4 {{.*}} %V8F64 = fmul
- ; AVX2: cost of 4 {{.*}} %V8F64 = fmul
- ; AVX512: cost of 2 {{.*}} %V8F64 = fmul
+ ; SSE2: cost of 4 {{.*}} %V8F64 = fmul
+ ; SSE42: cost of 4 {{.*}} %V8F64 = fmul
+ ; AVX: cost of 2 {{.*}} %V8F64 = fmul
+ ; AVX2: cost of 2 {{.*}} %V8F64 = fmul
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fmul
%V8F64 = fmul <8 x double> undef, undef
ret i32 undef
@@ -197,7 +197,7 @@ define i32 @fdiv(i32 %arg) {
; SSE42: cost of 56 {{.*}} %V16F32 = fdiv
; AVX: cost of 56 {{.*}} %V16F32 = fdiv
; AVX2: cost of 28 {{.*}} %V16F32 = fdiv
- ; AVX512: cost of 2 {{.*}} %V16F32 = fdiv
+ ; AVX512: cost of 1 {{.*}} %V16F32 = fdiv
%V16F32 = fdiv <16 x float> undef, undef
; SSE2: cost of 38 {{.*}} %F64 = fdiv
@@ -222,7 +222,7 @@ define i32 @fdiv(i32 %arg) {
; SSE42: cost of 88 {{.*}} %V8F64 = fdiv
; AVX: cost of 88 {{.*}} %V8F64 = fdiv
; AVX2: cost of 56 {{.*}} %V8F64 = fdiv
- ; AVX512: cost of 2 {{.*}} %V8F64 = fdiv
+ ; AVX512: cost of 1 {{.*}} %V8F64 = fdiv
%V8F64 = fdiv <8 x double> undef, undef
ret i32 undef
@@ -230,54 +230,54 @@ define i32 @fdiv(i32 %arg) {
; CHECK-LABEL: 'frem'
define i32 @frem(i32 %arg) {
- ; SSE2: cost of 2 {{.*}} %F32 = frem
- ; SSE42: cost of 2 {{.*}} %F32 = frem
- ; AVX: cost of 2 {{.*}} %F32 = frem
- ; AVX2: cost of 2 {{.*}} %F32 = frem
- ; AVX512: cost of 2 {{.*}} %F32 = frem
+ ; SSE2: cost of 1 {{.*}} %F32 = frem
+ ; SSE42: cost of 1 {{.*}} %F32 = frem
+ ; AVX: cost of 1 {{.*}} %F32 = frem
+ ; AVX2: cost of 1 {{.*}} %F32 = frem
+ ; AVX512: cost of 1 {{.*}} %F32 = frem
%F32 = frem float undef, undef
- ; SSE2: cost of 14 {{.*}} %V4F32 = frem
- ; SSE42: cost of 14 {{.*}} %V4F32 = frem
- ; AVX: cost of 14 {{.*}} %V4F32 = frem
- ; AVX2: cost of 14 {{.*}} %V4F32 = frem
- ; AVX512: cost of 14 {{.*}} %V4F32 = frem
+ ; SSE2: cost of 10 {{.*}} %V4F32 = frem
+ ; SSE42: cost of 10 {{.*}} %V4F32 = frem
+ ; AVX: cost of 10 {{.*}} %V4F32 = frem
+ ; AVX2: cost of 10 {{.*}} %V4F32 = frem
+ ; AVX512: cost of 10 {{.*}} %V4F32 = frem
%V4F32 = frem <4 x float> undef, undef
- ; SSE2: cost of 28 {{.*}} %V8F32 = frem
- ; SSE42: cost of 28 {{.*}} %V8F32 = frem
- ; AVX: cost of 30 {{.*}} %V8F32 = frem
- ; AVX2: cost of 30 {{.*}} %V8F32 = frem
- ; AVX512: cost of 30 {{.*}} %V8F32 = frem
+ ; SSE2: cost of 20 {{.*}} %V8F32 = frem
+ ; SSE42: cost of 20 {{.*}} %V8F32 = frem
+ ; AVX: cost of 22 {{.*}} %V8F32 = frem
+ ; AVX2: cost of 22 {{.*}} %V8F32 = frem
+ ; AVX512: cost of 22 {{.*}} %V8F32 = frem
%V8F32 = frem <8 x float> undef, undef
- ; SSE2: cost of 56 {{.*}} %V16F32 = frem
- ; SSE42: cost of 56 {{.*}} %V16F32 = frem
- ; AVX: cost of 60 {{.*}} %V16F32 = frem
- ; AVX2: cost of 60 {{.*}} %V16F32 = frem
- ; AVX512: cost of 62 {{.*}} %V16F32 = frem
+ ; SSE2: cost of 40 {{.*}} %V16F32 = frem
+ ; SSE42: cost of 40 {{.*}} %V16F32 = frem
+ ; AVX: cost of 44 {{.*}} %V16F32 = frem
+ ; AVX2: cost of 44 {{.*}} %V16F32 = frem
+ ; AVX512: cost of 46 {{.*}} %V16F32 = frem
%V16F32 = frem <16 x float> undef, undef
- ; SSE2: cost of 2 {{.*}} %F64 = frem
- ; SSE42: cost of 2 {{.*}} %F64 = frem
- ; AVX: cost of 2 {{.*}} %F64 = frem
- ; AVX2: cost of 2 {{.*}} %F64 = frem
- ; AVX512: cost of 2 {{.*}} %F64 = frem
+ ; SSE2: cost of 1 {{.*}} %F64 = frem
+ ; SSE42: cost of 1 {{.*}} %F64 = frem
+ ; AVX: cost of 1 {{.*}} %F64 = frem
+ ; AVX2: cost of 1 {{.*}} %F64 = frem
+ ; AVX512: cost of 1 {{.*}} %F64 = frem
%F64 = frem double undef, undef
- ; SSE2: cost of 6 {{.*}} %V2F64 = frem
- ; SSE42: cost of 6 {{.*}} %V2F64 = frem
- ; AVX: cost of 6 {{.*}} %V2F64 = frem
- ; AVX2: cost of 6 {{.*}} %V2F64 = frem
- ; AVX512: cost of 6 {{.*}} %V2F64 = frem
+ ; SSE2: cost of 4 {{.*}} %V2F64 = frem
+ ; SSE42: cost of 4 {{.*}} %V2F64 = frem
+ ; AVX: cost of 4 {{.*}} %V2F64 = frem
+ ; AVX2: cost of 4 {{.*}} %V2F64 = frem
+ ; AVX512: cost of 4 {{.*}} %V2F64 = frem
%V2F64 = frem <2 x double> undef, undef
- ; SSE2: cost of 12 {{.*}} %V4F64 = frem
- ; SSE42: cost of 12 {{.*}} %V4F64 = frem
- ; AVX: cost of 14 {{.*}} %V4F64 = frem
- ; AVX2: cost of 14 {{.*}} %V4F64 = frem
- ; AVX512: cost of 14 {{.*}} %V4F64 = frem
+ ; SSE2: cost of 8 {{.*}} %V4F64 = frem
+ ; SSE42: cost of 8 {{.*}} %V4F64 = frem
+ ; AVX: cost of 10 {{.*}} %V4F64 = frem
+ ; AVX2: cost of 10 {{.*}} %V4F64 = frem
+ ; AVX512: cost of 10 {{.*}} %V4F64 = frem
%V4F64 = frem <4 x double> undef, undef
- ; SSE2: cost of 24 {{.*}} %V8F64 = frem
- ; SSE42: cost of 24 {{.*}} %V8F64 = frem
- ; AVX: cost of 28 {{.*}} %V8F64 = frem
- ; AVX2: cost of 28 {{.*}} %V8F64 = frem
- ; AVX512: cost of 30 {{.*}} %V8F64 = frem
+ ; SSE2: cost of 16 {{.*}} %V8F64 = frem
+ ; SSE42: cost of 16 {{.*}} %V8F64 = frem
+ ; AVX: cost of 20 {{.*}} %V8F64 = frem
+ ; AVX2: cost of 20 {{.*}} %V8F64 = frem
+ ; AVX512: cost of 22 {{.*}} %V8F64 = frem
%V8F64 = frem <8 x double> undef, undef
ret i32 undef
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
index efc1263373e..2c4b7139f90 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll
@@ -78,10 +78,10 @@ for.end: ; preds = %vector.body
ret void
; CORE2: Printing analysis 'Cost Model Analysis' for function 'test3':
-; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
+; CORE2: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
; COREI7: Printing analysis 'Cost Model Analysis' for function 'test3':
-; COREI7: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
+; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
}
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index 45e2215cd36..296d14bc797 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -11,8 +11,8 @@ define fastcc float @reduction_cost_float(<4 x float> %rdx) {
; Check that we recognize the tree starting at the extractelement as a
; reduction.
-; CHECK-LABEL: reduction_cost
-; CHECK: cost of 9 {{.*}} extractelement
+; CHECK-LABEL: reduction_cost_float
+; CHECK: cost of 7 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx8, i32 0
ret float %r
@@ -54,7 +54,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
%bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; CHECK-LABEL: pairwise_hadd
-; CHECK: cost of 11 {{.*}} extractelement
+; CHECK: cost of 9 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx.1, i32 0
%r2 = fadd float %r, %f1
@@ -74,7 +74,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
%bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; CHECK-LABEL: pairwise_hadd_assoc
-; CHECK: cost of 11 {{.*}} extractelement
+; CHECK: cost of 9 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx.1, i32 0
%r2 = fadd float %r, %f1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index 25e164fc1c9..07eb4cbcbc0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -1,11 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s
+; FIXME: The intent is that we should be able to vectorize this on x86
+; because that would be profitable, but the cost model says it is not.
+
; Two mostly identical functions. The only difference is the presence of
; fast-math flags on the second. The loop is a pretty simple reduction:
; for (int i = 0; i < 32; ++i)
-; if (arr[i] != 42)
+; if (arr[i] != 42.0)
; tot += arr[i];
define double @sumIfScalar(double* nocapture readonly %arr) {
@@ -66,41 +69,11 @@ done:
define double @sumIfVector(double* nocapture readonly %arr) {
; CHECK-LABEL: @sumIfVector(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[PREDPHI]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI]], [[RDX_SHUF]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
-; CHECK-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
; CHECK-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
@@ -113,9 +86,9 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; CHECK-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
; CHECK: done:
-; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
; CHECK-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
index e8c37512594..ec4b81b64d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -9,7 +9,7 @@ define void @f(double* %r, double* %w) {
%add1 = fadd double %f1, %f1
%w0 = getelementptr inbounds double, double* %w, i64 0
%w1 = getelementptr inbounds double, double* %w, i64 1
-; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -3 and with tree size 3
store double %add0, double* %w0, !dbg !9
store double %add1, double* %w1
ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll
index 37b11d8ddc9..d40246d6267 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll
@@ -1,19 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+; It is not profitable to vectorize this with <2 x float> ops.
+; This is a reduction from the Himeno benchmark.
+; https://bugs.llvm.org/show_bug.cgi?id=36280
+
define float @jacobi(float* %p, float %x, float %y, float %z) {
; CHECK-LABEL: @jacobi(
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[X:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[Y:%.*]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[TMP6]], [[Z:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
-; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP7]], [[ADD1]]
+; CHECK-NEXT: [[P1:%.*]] = load float, float* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]]
; CHECK-NEXT: ret float [[ADD2]]
;
%gep1 = getelementptr float, float* %p, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
index 751cd9dff01..5860a24906b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -19,20 +19,19 @@ define i32 @test(double* nocapture %G) {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> <double 4.000000e+00, double 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP4]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[MUL11]], i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>, [[TMP9]]
+; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
+; CHECK-NEXT: [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[G]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP10]], <4 x double>* [[TMP11]], align 8
+; CHECK-NEXT: store double [[ADD12]], double* [[ARRAYIDX13]], align 8
; CHECK-NEXT: ret i32 undef
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 90fe10a6a8b..986da9fa52b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -730,28 +730,26 @@ define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; CHECK: for.body16.lr.ph:
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
; CHECK: for.cond.cleanup15:
-; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -760,36 +758,26 @@ define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture
; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; CHECK: for.body16:
+; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; CHECK-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef
-; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; CHECK-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; CHECK-NEXT: [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT: [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
; STORE-LABEL: @foo(
@@ -802,28 +790,26 @@ define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture
; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
-; STORE-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1
-; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]]
-; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2
-; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
-; STORE-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3
-; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]]
-; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
-; STORE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; STORE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0
-; STORE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1
-; STORE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2
-; STORE-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3
+; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; STORE: for.body16.lr.ph:
; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
-; STORE-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4
+; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; STORE-NEXT: br label [[FOR_BODY16:%.*]]
; STORE: for.cond.cleanup15:
-; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[TMP21:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP25:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
@@ -832,36 +818,26 @@ define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture
; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; STORE: for.body16:
+; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP26:%.*]], [[FOR_BODY16]] ]
-; STORE-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0
-; STORE-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; STORE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP12]], i32 0
-; STORE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1
-; STORE-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> <float 0x3FF19999A0000000, float 0xBFF3333340000000>, [[TMP15]]
-; STORE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; STORE-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]]
-; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]]
+; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
-; STORE-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> <float 0xC0019999A0000000, float 0x4002666660000000, float 0x4008CCCCC0000000, float 0xC0099999A0000000>, [[TMP11]]
-; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float undef, undef
-; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], undef
-; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], undef
-; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP19]], [[RDX_SHUF]]
-; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; STORE-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; STORE-NEXT: [[TMP21]] = fadd fast float [[TMP20]], [[MUL20]]
-; STORE-NEXT: [[SUB28:%.*]] = fadd fast float [[SUB2694]], [[MUL20]]
+; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; STORE-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
-; STORE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> undef, float [[SUB19]], i32 0
-; STORE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP12]], i32 1
-; STORE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP21]], i32 2
-; STORE-NEXT: [[TMP25]] = extractelement <4 x float> [[TMP11]], i32 2
-; STORE-NEXT: [[TMP26]] = insertelement <4 x float> [[TMP24]], float [[TMP25]], i32 3
; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
index 0dda7c39d6a..afee26f0d40 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -9,40 +9,33 @@ define void @foo (%struct.complex* %A, %struct.complex* %B, %struct.complex* %R
; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> undef, float [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1
-; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]]
-; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
+; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
-; CHECK-NEXT: store float [[TMP28]], float* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
-; CHECK-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
index 33ca0029c48..4ddc9cbe6fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -64,17 +64,15 @@ define void @test_volatile_load(double* %a, double* %b, double* %c) {
; CHECK-LABEL: @test_volatile_load(
; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[I0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[I1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
; CHECK-NEXT: ret void
;
%i0 = load volatile double, double* %a, align 8