author    Gadi Haber <gadi.haber@intel.com>  2017-06-27 15:05:13 +0000
committer Gadi Haber <gadi.haber@intel.com>  2017-06-27 15:05:13 +0000
commit    13759a7ed62a362bc3d7455da8b96279e545cdc6 (patch)
tree      2090b37ec2372cd62bbda9fca89ec5c157b44be7 /llvm/test
parent    a179d25b99fec680d2430a07b6a35254c548e298 (diff)
Updated and extended the information about each instruction in HSW and SNB to include the following data:

• static latency
• number of uOps of which the instruction consists
• all ports used by the instruction

Reviewers: RKSimon, zvi, aymanmus, m_zuckerman

Differential Revision: https://reviews.llvm.org/D33897

llvm-svn: 306414
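For readers of the hunks below: these schedule tests encode each instruction's predicted timing as a trailing assembly comment of the form sched: [Latency:ReciprocalThroughput]. A minimal sketch of such a test, assuming the -print-schedule llc flag these files used at the time (the function name, triple, and RUN line here are illustrative, not taken from the commit):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=HASWELL

define <4 x double> @example_addpd(<4 x double> %a0, <4 x double> %a1) {
; HASWELL-LABEL: example_addpd:
; HASWELL:       vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
;                 3 = static latency in cycles, 1.00 = reciprocal throughput
  %1 = fadd <4 x double> %a0, %a1
  ret <4 x double> %1
}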
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/X86/avx-schedule.ll                                |  808
-rw-r--r--  llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll                         |    6
-rw-r--r--  llvm/test/CodeGen/X86/avx2-schedule.ll                               |   58
-rw-r--r--  llvm/test/CodeGen/X86/avx2-vector-shifts.ll                          |   44
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cmp.ll                                  |    4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cvt.ll                                  |   54
-rw-r--r--  llvm/test/CodeGen/X86/avx512-insert-extract.ll                       |  166
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll                   |    4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mask-op.ll                              |  646
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vec-cmp.ll                              |   97
-rw-r--r--  llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll                 | 1480
-rw-r--r--  llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll               |  128
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll                       |    2
-rw-r--r--  llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll  |   48
-rw-r--r--  llvm/test/CodeGen/X86/fp128-i128.ll                                  |    2
-rw-r--r--  llvm/test/CodeGen/X86/gather-addresses.ll                            |   17
-rw-r--r--  llvm/test/CodeGen/X86/half.ll                                        | 1045
-rw-r--r--  llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll                  |   34
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i32.ll                            |  206
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i64.ll                            |  132
-rw-r--r--  llvm/test/CodeGen/X86/pr32329.ll                                     |    2
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath.ll                              |  224
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath2.ll                             |  442
-rw-r--r--  llvm/test/CodeGen/X86/sse-schedule.ll                                |  466
-rw-r--r--  llvm/test/CodeGen/X86/sse2-schedule.ll                               | 1108
-rw-r--r--  llvm/test/CodeGen/X86/sse3-schedule.ll                               |   88
-rw-r--r--  llvm/test/CodeGen/X86/sse41-schedule.ll                              |  432
-rw-r--r--  llvm/test/CodeGen/X86/sse42-schedule.ll                              |   70
-rw-r--r--  llvm/test/CodeGen/X86/ssse3-schedule.ll                              |  134
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-ashr-512.ll                       |   12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll                      |   12
31 files changed, 4339 insertions, 3632 deletions
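Reading the sched comments in the diff below: [A:B] pairs a static latency of A cycles with a reciprocal throughput of B cycles per instruction, while entries such as [?:5.000000e-01] mark cases where the model supplies a throughput but no resolved latency. A worked decoding of one updated SNB annotation (the latency split is inferred from this model's own register-form and memory-form numbers, not stated in the commit):

vaddpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
  latency               = 9 cycles (3-cycle FP add + 6-cycle ymm load in this model)
  reciprocal throughput = 1.00 (at most one such op issued per cycle)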
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll
index 47e95fe31bd..6fe5f4c4da2 100644
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -10,14 +10,14 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
@@ -40,14 +40,14 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
@@ -70,14 +70,14 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
@@ -101,14 +101,14 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
@@ -131,17 +131,17 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotpd:
; BTVER2: # BB#0:
@@ -172,17 +172,17 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotps:
; BTVER2: # BB#0:
@@ -216,14 +216,14 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andpd:
; BTVER2: # BB#0:
@@ -255,14 +255,14 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andps:
; BTVER2: # BB#0:
@@ -291,17 +291,17 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
@@ -326,15 +326,15 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendps:
; BTVER2: # BB#0:
@@ -356,15 +356,15 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvpd:
; BTVER2: # BB#0:
@@ -387,15 +387,15 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvps:
; BTVER2: # BB#0:
@@ -418,13 +418,13 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
; SANDY-LABEL: test_broadcastf128:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastf128:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastf128:
; BTVER2: # BB#0:
@@ -443,13 +443,13 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
define <4 x double> @test_broadcastsd_ymm(double *%a0) {
; SANDY-LABEL: test_broadcastsd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastsd_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastsd_ymm:
; BTVER2: # BB#0:
@@ -469,13 +469,13 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
define <4 x float> @test_broadcastss(float *%a0) {
; SANDY-LABEL: test_broadcastss:
; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastss:
; BTVER2: # BB#0:
@@ -496,12 +496,12 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
; SANDY-LABEL: test_broadcastss_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastss_ymm:
; BTVER2: # BB#0:
@@ -521,17 +521,17 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vorpd %ymm2, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmppd:
; BTVER2: # BB#0:
@@ -559,17 +559,17 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vorps %ymm2, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpps:
; BTVER2: # BB#0:
@@ -598,16 +598,16 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
; BTVER2: # BB#0:
@@ -632,19 +632,19 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [3:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
; BTVER2: # BB#0:
@@ -669,17 +669,17 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
; BTVER2: # BB#0:
@@ -704,17 +704,17 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
; BTVER2: # BB#0:
@@ -741,15 +741,15 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
; BTVER2: # BB#0:
@@ -774,15 +774,15 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:3.00]
+; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
-; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
@@ -804,15 +804,15 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
@@ -834,15 +834,15 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dpps:
; BTVER2: # BB#0:
@@ -866,16 +866,16 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
; SANDY-LABEL: test_extractf128:
; SANDY: # BB#0:
; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_extractf128:
; HASWELL: # BB#0:
; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_extractf128:
; BTVER2: # BB#0:
@@ -900,13 +900,13 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY: # BB#0:
; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # BB#0:
@@ -929,15 +929,15 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # BB#0:
@@ -960,15 +960,15 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # BB#0:
@@ -991,15 +991,15 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # BB#0:
@@ -1023,16 +1023,16 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; SANDY-LABEL: test_insertf128:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertf128:
; HASWELL: # BB#0:
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_insertf128:
; BTVER2: # BB#0:
@@ -1059,13 +1059,13 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
define <32 x i8> @test_lddqu(i8* %a0) {
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lddqu:
; BTVER2: # BB#0:
@@ -1084,17 +1084,17 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
; SANDY-LABEL: test_maskmovpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovpd:
; BTVER2: # BB#0:
@@ -1119,29 +1119,29 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
; SANDY-LABEL: test_maskmovpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [5:1.00]
+; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:1.00]
+; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovpd_ymm:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
; BTVER2-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovpd_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
@@ -1154,17 +1154,17 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
; SANDY-LABEL: test_maskmovps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovps:
; BTVER2: # BB#0:
@@ -1189,29 +1189,29 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
; SANDY-LABEL: test_maskmovps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
+; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50]
+; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovps_ymm:
; BTVER2: # BB#0:
; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; BTVER2-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovps_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
@@ -1225,14 +1225,14 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxpd:
; BTVER2: # BB#0:
@@ -1256,14 +1256,14 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxps:
; BTVER2: # BB#0:
@@ -1288,13 +1288,13 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minpd:
; BTVER2: # BB#0:
@@ -1319,13 +1319,13 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY: # BB#0:
; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minps:
; BTVER2: # BB#0:
@@ -1348,17 +1348,17 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
@@ -1382,17 +1382,17 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
@@ -1417,16 +1417,16 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movddup:
; BTVER2: # BB#0:
@@ -1451,15 +1451,15 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
define i32 @test_movmskpd(<4 x double> %a0) {
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskpd:
; BTVER2: # BB#0:
@@ -1479,15 +1479,15 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_movmskps(<8 x float> %a0) {
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [5:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskps:
; BTVER2: # BB#0:
@@ -1508,14 +1508,14 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
@@ -1537,14 +1537,14 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
@@ -1566,16 +1566,16 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movshdup:
; BTVER2: # BB#0:
@@ -1601,16 +1601,16 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsldup:
; BTVER2: # BB#0:
@@ -1635,19 +1635,19 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
@@ -1671,19 +1671,19 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
@@ -1708,14 +1708,14 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
@@ -1738,14 +1738,14 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
@@ -1767,17 +1767,17 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: orpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: orpd:
; BTVER2: # BB#0:
@@ -1806,17 +1806,17 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orps:
; BTVER2: # BB#0:
@@ -1846,16 +1846,16 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_permilpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilpd:
; BTVER2: # BB#0:
@@ -1880,17 +1880,17 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_permilpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [8:1.00]
; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilpd_ymm:
; BTVER2: # BB#0:
@@ -1916,16 +1916,16 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_permilps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilps:
; BTVER2: # BB#0:
@@ -1950,17 +1950,17 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_permilps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [8:1.00]
; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilps_ymm:
; BTVER2: # BB#0:
@@ -1986,14 +1986,14 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
; SANDY-LABEL: test_permilvarpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarpd:
; BTVER2: # BB#0:
@@ -2018,13 +2018,13 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarpd_ymm:
; BTVER2: # BB#0:
@@ -2048,14 +2048,14 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
; SANDY-LABEL: test_permilvarps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarps:
; BTVER2: # BB#0:
@@ -2080,13 +2080,13 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarps_ymm:
; BTVER2: # BB#0:
@@ -2112,14 +2112,14 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
@@ -2148,14 +2148,14 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundpd:
; BTVER2: # BB#0:
@@ -2184,14 +2184,14 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundps:
; BTVER2: # BB#0:
@@ -2217,17 +2217,17 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:3.00]
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
@@ -2254,16 +2254,16 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufpd:
; BTVER2: # BB#0:
@@ -2289,14 +2289,14 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufps:
; BTVER2: # BB#0:
@@ -2318,17 +2318,17 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:3.00]
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:3.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
-; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [35:2.00]
+; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [35:2.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
@@ -2354,17 +2354,17 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:3.00]
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
-; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [21:2.00]
+; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
@@ -2391,14 +2391,14 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
@@ -2421,14 +2421,14 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
@@ -2451,20 +2451,20 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_testpd:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: setb %al # sched: [1:1.00]
+; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testpd:
; BTVER2: # BB#0:
@@ -2495,22 +2495,22 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
; SANDY-LABEL: test_testpd_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: setb %al # sched: [1:1.00]
+; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testpd_ymm:
; BTVER2: # BB#0:
@@ -2542,20 +2542,20 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_testps:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: setb %al # sched: [1:1.00]
+; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testps:
; BTVER2: # BB#0:
@@ -2586,22 +2586,22 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
; SANDY-LABEL: test_testps_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: setb %al # sched: [1:1.00]
+; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: adcl $0, %eax # sched: [1:0.25]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testps_ymm:
; BTVER2: # BB#0:
@@ -2635,14 +2635,14 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
; BTVER2: # BB#0:
@@ -2669,13 +2669,13 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhps:
; BTVER2: # BB#0:
@@ -2698,16 +2698,16 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
; BTVER2: # BB#0:
@@ -2733,14 +2733,14 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklps:
; BTVER2: # BB#0:
@@ -2765,14 +2765,14 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorpd:
; BTVER2: # BB#0:
@@ -2804,14 +2804,14 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorps:
; BTVER2: # BB#0:
@@ -2841,12 +2841,12 @@ define void @test_zeroall() {
; SANDY-LABEL: test_zeroall:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroall:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroall # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroall # sched: [16:16.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_zeroall:
; BTVER2: # BB#0:
@@ -2866,12 +2866,12 @@ define void @test_zeroupper() {
; SANDY-LABEL: test_zeroupper:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroupper:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_zeroupper:
; BTVER2: # BB#0:
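For readers skimming the hunks above: each `sched: [N:M.MM]` comment asserts the scheduling model's static latency (N cycles) and reciprocal throughput (M.MM) for that instruction on the named subtarget, so this file's changes only retune those two numbers for the SNB and HSW models. Below is a minimal sketch of how one of these schedule tests is structured, assuming the llc -print-schedule mode these files are driven by; the RUN line and function name are illustrative, not taken from the patch:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=HASWELL
define <4 x double> @sched_sketch(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; HASWELL-LABEL: sched_sketch:
; HASWELL: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
  ; Hypothetical test body: one register-register add plus one folded-load add,
  ; mirroring the reg-reg + memory-operand pattern used throughout this file.
  %1 = fadd <4 x double> %a0, %a1
  %2 = load <4 x double>, <4 x double> *%a2, align 32
  %3 = fadd <4 x double> %1, %2
  ret <4 x double> %3
}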
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 52e37dbf269..a82c51747a2 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1619,10 +1619,10 @@ define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx
;
; AVX512VL-LABEL: test_gather_mask:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; AVX512VL-NEXT: vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88]
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
+; AVX512VL-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
; AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%a_i8 = bitcast float* %a to i8*
diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll
index 042bc217b97..ca5a132f4f4 100644
--- a/llvm/test/CodeGen/X86/avx2-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx2-schedule.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
; HASWELL-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [5:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsb:
; ZNVER1: # BB#0:
@@ -29,9 +29,9 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsd:
; ZNVER1: # BB#0:
@@ -51,9 +51,9 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsw:
; ZNVER1: # BB#0:
@@ -74,7 +74,7 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddb:
; ZNVER1: # BB#0:
@@ -92,7 +92,7 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddd:
; ZNVER1: # BB#0:
@@ -109,8 +109,8 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddq:
; ZNVER1: # BB#0:
@@ -128,7 +128,7 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddw:
; ZNVER1: # BB#0:
@@ -145,9 +145,9 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pand:
; ZNVER1: # BB#0:
@@ -166,9 +166,9 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pandn:
; ZNVER1: # BB#0:
@@ -190,7 +190,7 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
; HASWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pmulld:
; ZNVER1: # BB#0:
@@ -207,8 +207,8 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2)
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pmullw:
; ZNVER1: # BB#0:
@@ -225,9 +225,9 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_por:
; ZNVER1: # BB#0:
@@ -246,8 +246,8 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubb:
; ZNVER1: # BB#0:
@@ -264,8 +264,8 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubd:
; ZNVER1: # BB#0:
@@ -282,8 +282,8 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubq:
; ZNVER1: # BB#0:
@@ -300,8 +300,8 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubw:
; ZNVER1: # BB#0:
@@ -318,9 +318,9 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pxor:
; ZNVER1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 45a1cd97503..958c7746e29 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -381,6 +381,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: srl_trunc_and_v4i64:
; X64: ## BB#0:
@@ -391,6 +392,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
%trunc = trunc <4 x i64> %and to <4 x i32>
%sra = lshr <4 x i32> %x, %trunc
@@ -412,6 +414,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT: vzeroupper
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: shl_8i16:
; X64: ## BB#0:
@@ -423,6 +426,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%shl = shl <8 x i16> %r, %a
ret <8 x i16> %shl
}
@@ -434,13 +438,14 @@ define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
-; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpsrld $16, %ymm3, %ymm1
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
-; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: shl_16i16:
; X64: ## BB#0:
@@ -448,13 +453,14 @@ define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
-; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpsrld $16, %ymm3, %ymm1
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
-; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%shl = shl <16 x i16> %r, %a
ret <16 x i16> %shl
}
@@ -474,6 +480,7 @@ define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: shl_32i8:
; X64: ## BB#0:
@@ -489,6 +496,7 @@ define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%shl = shl <32 x i8> %r, %a
ret <32 x i8> %shl
}
@@ -504,6 +512,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT: vzeroupper
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: ashr_8i16:
; X64: ## BB#0:
@@ -515,6 +524,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%ashr = ashr <8 x i16> %r, %a
ret <8 x i16> %ashr
}
@@ -526,13 +536,14 @@ define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsravd %ymm3, %ymm4, %ymm3
-; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpsrld $16, %ymm3, %ymm1
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
-; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: ashr_16i16:
; X64: ## BB#0:
@@ -540,13 +551,14 @@ define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3
-; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpsrld $16, %ymm3, %ymm1
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
-; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%ashr = ashr <16 x i16> %r, %a
ret <16 x i16> %ashr
}
@@ -579,6 +591,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: ashr_32i8:
; X64: ## BB#0:
@@ -607,6 +620,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%ashr = ashr <32 x i8> %r, %a
ret <32 x i8> %ashr
}
@@ -622,6 +636,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT: vzeroupper
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: lshr_8i16:
; X64: ## BB#0:
@@ -633,6 +648,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X64-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%lshr = lshr <8 x i16> %r, %a
ret <8 x i16> %lshr
}
@@ -644,13 +660,14 @@ define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
-; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpsrld $16, %ymm3, %ymm1
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
-; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: lshr_16i16:
; X64: ## BB#0:
@@ -658,13 +675,14 @@ define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
-; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpsrld $16, %ymm3, %ymm1
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
-; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%lshr = lshr <16 x i16> %r, %a
ret <16 x i16> %lshr
}
@@ -685,6 +703,7 @@ define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
+; X32-NEXT: ## -- End function
;
; X64-LABEL: lshr_32i8:
; X64: ## BB#0:
@@ -701,6 +720,7 @@ define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%lshr = lshr <32 x i8> %r, %a
ret <32 x i8> %lshr
}
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index eae7b94f513..d2e95f692e4 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -14,6 +14,7 @@ define double @test1(double %a, double %b) nounwind {
; ALL-NEXT: LBB0_2: ## %l2
; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -36,6 +37,7 @@ define float @test2(float %a, float %b) nounwind {
; ALL-NEXT: LBB1_2: ## %l2
; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
@@ -124,11 +126,11 @@ entry:
define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
; ALL-LABEL: test8:
; ALL: ## BB#0:
-; ALL-NEXT: notl %edi
; ALL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000
; ALL-NEXT: testl %edx, %edx
; ALL-NEXT: movl $1, %eax
; ALL-NEXT: cmovel %eax, %edx
+; ALL-NEXT: notl %edi
; ALL-NEXT: orl %edi, %esi
; ALL-NEXT: cmovnel %edx, %eax
; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 140299f5495..295f98ce61b 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -1545,19 +1545,19 @@ define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
}
define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_float:
-; NOVL: # BB#0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpextrb $8, %xmm0, %eax
-; NOVL-NEXT: andl $1, %eax
-; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; NOVL-NEXT: vpextrb $0, %xmm0, %eax
-; NOVL-NEXT: andl $1, %eax
-; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; NOVL-NEXT: retq
+; KNL-LABEL: uitofp_2i1_float:
+; KNL: # BB#0:
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vpextrb $8, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; KNL-NEXT: retq
;
; VL-LABEL: uitofp_2i1_float:
; VL: # BB#0:
@@ -1567,6 +1567,34 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: uitofp_2i1_float:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: uitofp_2i1_float:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512BW-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
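Note on the uitofp_2i1_float hunk above: the shared NOVL block is split into per-subtarget KNL, AVX512DQ and AVX512BW blocks because KNL's scheduling now extracts both lanes up front and reorders the conversions, while the DQ/BW subtargets keep the old interleaved sequence. The multi-prefix RUN scheme such a file relies on looks roughly like this; the triples and attribute strings are illustrative assumptions, not copied from the test:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=NOVL,KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=NOVL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=NOVL,AVX512BW
; While every subtarget produced identical code, one NOVL block could cover all
; three RUN lines; once KNL diverges, that block has to be replaced by one block
; per concrete prefix, which is what the hunk does.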
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 29a5325a0ae..459bd3dabfc 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -12,6 +12,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test1:
; SKX: ## BB#0:
@@ -21,6 +22,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SKX-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -36,6 +38,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
@@ -45,6 +48,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -58,6 +62,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test3:
; SKX: ## BB#0:
@@ -65,6 +70,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -78,6 +84,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
@@ -86,6 +93,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@@ -96,11 +104,13 @@ define i32 @test5(<4 x float> %x) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, %eax
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test5:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, %eax
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
@@ -111,11 +121,13 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test6:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
@@ -135,6 +147,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test7:
; SKX: ## BB#0:
@@ -150,6 +163,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
@@ -168,6 +182,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
@@ -183,6 +198,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
@@ -201,6 +217,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
@@ -216,6 +233,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
@@ -234,6 +252,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
@@ -249,6 +268,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
@@ -1114,137 +1134,137 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: xorl %eax, %eax
-; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: vmovd %edx, %xmm1
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: vmovd %edx, %xmm0
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; KNL-NEXT: setb %al
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -1299,8 +1319,8 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpextrb $4, %xmm0, %ecx
; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
@@ -2124,8 +2144,8 @@ define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v16i8:
; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $15, %edi
; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; KNL-NEXT: movb (%rdi,%rax), %al
@@ -2156,8 +2176,8 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
@@ -2204,9 +2224,9 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
@@ -2295,12 +2315,12 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
@@ -2325,12 +2345,12 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $3, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 86902ac926a..3c2e6afd225 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2880,7 +2880,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextractf32x4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlw $12, %k0, %k1
; CHECK-NEXT: kshiftrw $15, %k1, %k1
@@ -2898,6 +2897,7 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8
; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -2941,7 +2941,6 @@ declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlw $12, %k0, %k1
; CHECK-NEXT: kshiftrw $15, %k1, %k1
@@ -2959,6 +2958,7 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index e1a92c60d18..0ee86d5fb45 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1837,73 +1837,8 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: vmovups (%rdi), %zmm2
-; KNL-NEXT: vmovups 64(%rdi), %zmm3
-; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftrw $15, %k1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: vmovups 64(%rdi), %zmm2
+; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2
; KNL-NEXT: kshiftlw $14, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1967,138 +1902,203 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: kshiftrw $15, %k2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
-; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT: vmovups (%rdi), %zmm3
+; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftrw $15, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $14, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %ecx
; KNL-NEXT: vmovd %ecx, %xmm4
; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $13, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $12, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $11, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $10, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $9, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $8, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $7, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $6, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $5, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $4, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $3, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $2, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $1, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3
+; KNL-NEXT: vmovups 4(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm4, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: vmovd %ecx, %xmm4
+; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
@@ -2943,36 +2943,6 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; KNL-LABEL: store_64i1:
; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi9:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: Lcfi10:
-; KNL-NEXT: .cfi_def_cfa_offset 24
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: Lcfi11:
-; KNL-NEXT: .cfi_def_cfa_offset 32
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: Lcfi12:
-; KNL-NEXT: .cfi_def_cfa_offset 40
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: Lcfi13:
-; KNL-NEXT: .cfi_def_cfa_offset 48
-; KNL-NEXT: pushq %rbx
-; KNL-NEXT: Lcfi14:
-; KNL-NEXT: .cfi_def_cfa_offset 56
-; KNL-NEXT: Lcfi15:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Lcfi16:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Lcfi17:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Lcfi18:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Lcfi19:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: Lcfi20:
-; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -2984,281 +2954,275 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: vpinsrb $4, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm3
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT: kmovw %k0, 6(%rdi)
-; KNL-NEXT: kshiftlw $14, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $15, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $13, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $12, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $11, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: kshiftlw $10, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $9, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $8, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $7, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $6, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $5, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $4, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm2
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $2, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $1, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm2
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, 4(%rdi)
; KNL-NEXT: kshiftlw $14, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $15, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: vmovd %eax, %xmm3
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm1
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: kmovw %k1, 2(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm2
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm1
+; KNL-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, 6(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: kmovw %k1, 4(%rdi)
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm0
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm0
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftrw $15, %k1, %k0
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
-; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: store_64i1:
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 2b04b9229b3..02ee3adeb7a 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -8,6 +8,7 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -19,6 +20,7 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -30,6 +32,7 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -42,6 +45,7 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -53,6 +57,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -64,6 +69,7 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -117,12 +123,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -137,12 +145,14 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -154,6 +164,7 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -168,6 +179,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
; SKX: ## BB#0:
@@ -178,6 +190,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -330,6 +343,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
@@ -339,6 +353,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -562,72 +577,72 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm0
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -642,6 +657,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
@@ -651,6 +667,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -704,6 +721,7 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -715,6 +733,7 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -727,6 +746,7 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -739,6 +759,7 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -752,6 +773,7 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -766,6 +788,7 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -780,6 +803,7 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -795,6 +819,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -809,6 +834,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -823,6 +849,7 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -838,6 +865,7 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -855,6 +883,7 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -920,12 +949,14 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -938,12 +969,14 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -957,12 +990,14 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -976,6 +1011,7 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -988,12 +1024,14 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1010,12 +1048,14 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -1029,6 +1069,7 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1041,6 +1082,7 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1058,12 +1100,14 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
@@ -1081,12 +1125,14 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1104,6 +1150,7 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
@@ -1124,12 +1171,14 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
@@ -1147,12 +1196,14 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
@@ -1172,6 +1223,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test43:
; SKX: ## BB#0:
@@ -1180,6 +1232,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 2b89373ceb0..d56c4675b73 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1685,8 +1685,6 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi9:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -1707,39 +1705,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -1748,8 +1746,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -1758,8 +1756,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -1767,8 +1765,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -1777,8 +1775,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -1789,8 +1787,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -1798,8 +1796,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1809,8 +1807,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1820,8 +1818,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1831,8 +1829,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1842,8 +1840,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1852,8 +1850,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1864,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -1877,8 +1875,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -1887,8 +1885,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1898,8 +1896,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1910,8 +1908,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1921,8 +1919,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -1932,8 +1930,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1942,8 +1940,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1952,444 +1950,444 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -2397,12 +2395,12 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2571,8 +2569,6 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi15:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -2593,39 +2589,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -2634,8 +2630,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -2644,8 +2640,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -2653,8 +2649,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -2663,8 +2659,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -2675,8 +2671,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -2684,8 +2680,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2695,8 +2691,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2706,8 +2702,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2717,8 +2713,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2728,8 +2724,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2738,8 +2734,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2750,8 +2746,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -2763,8 +2759,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -2773,8 +2769,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2784,8 +2780,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2796,8 +2792,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2807,8 +2803,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -2818,8 +2814,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2828,8 +2824,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2838,444 +2834,444 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -3283,12 +3279,12 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
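The long AVX512F-32 sequence above exists because i386 has no single 64-bit GPR-to-mask move: the test rebuilds the 64-bit k-register one bit at a time through vpmovm2b/vpblendvb/vpmovb2m round trips before the six masked compares run. As a hedged sketch of the IR that drives this test (the .512 intrinsic signature is inferred by analogy with the 256-bit declarations that appear later in this diff, not quoted from the file):

; Sketch only: signature inferred from the 256-bit variants below.
declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64)

define i64 @sample_ucmp_b_512_eq(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
  ; predicate 0 selects "eq"; %mask gates which byte lanes participate
  %r = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
  ret i64 %r
}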
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index f4504ed07fc..9f344c82f6b 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -2695,32 +2695,32 @@ declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32
define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
-; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xc0,0x02]
-; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
+; CHECK-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02]
; CHECK-NEXT: kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; CHECK-NEXT: vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; CHECK-NEXT: vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
-; CHECK-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; CHECK-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
-; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; CHECK-NEXT: ## xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; CHECK-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; CHECK-NEXT: vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1]
+; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -2750,23 +2750,23 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2793,32 +2793,32 @@ declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) noun
define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
-; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
-; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
-; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
+; CHECK-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02]
; CHECK-NEXT: kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; CHECK-NEXT: vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9]
-; CHECK-NEXT: vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0]
-; CHECK-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; CHECK-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
-; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; CHECK-NEXT: ## xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; CHECK-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8]
+; CHECK-NEXT: vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1]
+; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -2848,23 +2848,23 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
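The reshuffled kmovd/vpinsrd sequences above all serve one test shape: each of the six compare predicates, plus the constant all-ones and all-zeros lanes produced by kxnord/kxord, lands in one lane of an <8 x i32> result. A reduced sketch of that pattern, abbreviated to two predicates and assuming the same intrinsic the surrounding tests call:

declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32)

define <8 x i32> @sample_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
  ; predicate 0 = eq
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
  ; predicate 1 = unsigned lt
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
  ret <8 x i32> %vec1
}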
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index a6d6ca15530..21b06915182 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -453,10 +453,10 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-SSSE3-NEXT: andb $1, %al
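The pand move above is a pure scheduling change; the v32i8 test it belongs to compares two pairs of vectors and ANDs the predicate vectors before collapsing them to a bitmask. A hedged reconstruction of that test body (only the asm is visible in this hunk):

define i32 @sample_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
  %x = icmp sgt <32 x i8> %a, %b   ; lowers to the pcmpgtb pairs above
  %y = icmp sgt <32 x i8> %c, %d
  %z = and <32 x i1> %x, %y        ; the pand whose position moved
  %r = bitcast <32 x i1> %z to i32
  ret i32 %r
}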
diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 5d5cbc76f92..2ebd30adcdc 100644
--- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
@@ -6,31 +7,32 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
; into loads, off the stack or a previous store.
; Be very explicit about the ordering/stack offsets.
-; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK: # BB#0
-; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movl 16(%esp), %eax
-; CHECK-NEXT: movl 24(%esp), %ecx
-; CHECK-NEXT: movl 20(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: popl %ebx
-; CHECK-NEXT: retl
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
+; CHECK-LABEL: _test_extractelement_legalization_storereuse: ## @test_extractelement_legalization_storereuse
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: paddd (%ecx), %xmm0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movdqa %xmm0, (%ecx)
+; CHECK-NEXT: movl (%ecx), %esi
+; CHECK-NEXT: movl 4(%ecx), %edi
+; CHECK-NEXT: shll $4, %edx
+; CHECK-NEXT: movl 8(%ecx), %ebx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %esi, 12(%eax,%edx)
+; CHECK-NEXT: movl %edi, (%eax,%edx)
+; CHECK-NEXT: movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT: movl %ecx, 4(%eax,%edx)
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = bitcast i32* %y to <4 x i32>*
%1 = load <4 x i32>, <4 x i32>* %0, align 16
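Only the first two IR lines of this function survive in the hunk. A reduced sketch (assumed shape, not the test's exact body) of the pattern the file comment describes, where an extractelement after a vector store is legalized into a scalar load from the just-written memory:

define void @sample(<4 x i32> %a, i32* %x, i32* %y) {
entry:
  %p = bitcast i32* %y to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %p, align 16
  %sum = add <4 x i32> %v, %a
  store <4 x i32> %sum, <4 x i32>* %p, align 16   ; the movdqa in the checks
  %e0 = extractelement <4 x i32> %sum, i32 0      ; re-read as movl (%ecx), %esi
  store i32 %e0, i32* %x
  ret void
}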
diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
index 6c6bc8bdc1d..98082ec611d 100644
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index c3109673468..3383b6e2f4c 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -16,11 +16,10 @@
; LIN: sarq $32, %r[[REG2]]
; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
; LIN: sarq $32, %r[[REG4]]
-; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
-
+; LIN: movsd (%rdi,%rsi,8), %xmm1
+; LIN: movhpd (%rdi,%rax,8), %xmm1
+; LIN: movdqa (%rsi), %xmm0
+; LIN: movq %rdi, %xmm1
; WIN: movdqa (%rdx), %xmm0
; WIN: pand (%r8), %xmm0
; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]
@@ -29,10 +28,10 @@
; WIN: sarq $32, %r[[REG2]]
; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
; WIN: sarq $32, %r[[REG4]]
-; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movsd (%rcx,%r9,8), %xmm1
+; WIN: movhpd (%rcx,%rax,8), %xmm1
+; WIN: movdqa (%rdx), %xmm0
+; WIN: movq %rdx, %xmm1
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
%a = load <4 x i32>, <4 x i32>* %i
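The LIN/WIN checks above pin down the scalarized gather this function lowers to on targets without a hardware gather. Only the first load of the body is visible in the hunk; a hedged sketch of the rest, showing one of the four lanes:

define <4 x double> @sample(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
  %a = load <4 x i32>, <4 x i32>* %i
  %b = load <4 x i32>, <4 x i32>* %h
  %m = and <4 x i32> %a, %b          ; the pand in the WIN checks
  %i0 = extractelement <4 x i32> %m, i32 0
  %p0 = getelementptr double, double* %p, i32 %i0
  %v0 = load double, double* %p0     ; one movsd/movhpd lane
  %r  = insertelement <4 x double> undef, double %v0, i32 0
  ; lanes 1-3 repeat the extract/gep/load/insert sequence
  ret <4 x double> %r
}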
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 4c8003f0c51..f72dfa1eef5 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1,266 +1,834 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,NOF16-BWINSTS
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF,NOF16-NOBWINSTS
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,BWON,CHECK-F16C
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -fixup-byte-word-insts=0 \
; RUN: | FileCheck %s -check-prefix=CHECK-I686
-define void @test_load_store(half* %in, half* %out) {
-; CHECK-LABEL: test_load_store:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
-; CHECK: movw %ax, (%rsi)
+define void @test_load_store(half* %in, half* %out) #0 {
+; BWON-LABEL: test_load_store:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: movw %ax, (%rsi)
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_load_store:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: movw %ax, (%rsi)
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_load_store:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw (%ecx), %cx
+; CHECK-I686-NEXT: movw %cx, (%eax)
+; CHECK-I686-NEXT: retl
%val = load half, half* %in
store half %val, half* %out
ret void
}
-define i16 @test_bitcast_from_half(half* %addr) {
-; CHECK-LABEL: test_bitcast_from_half:
-; BWON: movzwl (%rdi), %eax
-; BWOFF: movw (%rdi), %ax
+define i16 @test_bitcast_from_half(half* %addr) #0 {
+; BWON-LABEL: test_bitcast_from_half:
+; BWON: # BB#0:
+; BWON-NEXT: movzwl (%rdi), %eax
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: test_bitcast_from_half:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: movw (%rdi), %ax
+; BWOFF-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_from_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movw (%eax), %ax
+; CHECK-I686-NEXT: retl
%val = load half, half* %addr
%val_int = bitcast half %val to i16
ret i16 %val_int
}
-define void @test_bitcast_to_half(half* %addr, i16 %in) {
+define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
; CHECK-LABEL: test_bitcast_to_half:
-; CHECK: movw %si, (%rdi)
+; CHECK: # BB#0:
+; CHECK-NEXT: movw %si, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_bitcast_to_half:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT: movw %ax, (%ecx)
+; CHECK-I686-NEXT: retl
%val_fp = bitcast i16 %in to half
store half %val_fp, half* %addr
ret void
}
-define float @test_extend32(half* %addr) {
-; CHECK-LABEL: test_extend32:
-
-; CHECK-LIBCALL: jmp __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
+define float @test_extend32(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL
+;
+; CHECK-F16C-LABEL: test_extend32:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to float
ret float %val32
}
-define double @test_extend64(half* %addr) {
-; CHECK-LABEL: test_extend64:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtss2sd
+define double @test_extend64(half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_extend64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_extend64:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%val16 = load half, half* %addr
%val32 = fpext half %val16 to double
ret double %val32
}
-define void @test_trunc32(float %in, half* %addr) {
-; CHECK-LABEL: test_trunc32:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
+define void @test_trunc32(float %in, half* %addr) #0 {
+; CHECK-LIBCALL-LABEL: test_trunc32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_trunc32:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vmovd %xmm0, %eax
+; CHECK-F16C-NEXT: movw %ax, (%rdi)
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc float %in to half
store half %val16, half* %addr
ret void
}
-define void @test_trunc64(double %in, half* %addr) {
+define void @test_trunc64(double %in, half* %addr) #0 {
; CHECK-LABEL: test_trunc64:
-
-; CHECK-LIBCALL: callq __truncdfhf2
-; CHECK-F16C: callq __truncdfhf2
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: callq __truncdfhf2
+; CHECK-NEXT: movw %ax, (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $8, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movsd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $8, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%val16 = fptrunc double %in to half
store half %val16, half* %addr
ret void
}
define i64 @test_fptosi_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptosi_i64:
-
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptosi_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_fptosi_i64:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvttss2si %xmm0, %rax
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptosi_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixsfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptosi half %a to i64
ret i64 %r
}
define void @test_sitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_sitofp_i64:
-
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK_LIBCALL-NEXT: popq [[ADDR]]
-; CHECK_LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_sitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_sitofp_i64:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vmovd %xmm0, %eax
+; CHECK-F16C-NEXT: movw %ax, (%rsi)
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = sitofp i64 %a to half
store half %r, half* %p
ret void
}
define i64 @test_fptoui_i64(half* %p) #0 {
-; CHECK-LABEL: test_fptoui_i64:
-
-; FP_TO_UINT is expanded using FP_TO_SINT
-; CHECK-LIBCALL-NEXT: pushq %rax
-; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss {{.[A-Z_0-9]+}}(%rip), [[REG1:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movaps %xmm0, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: subss [[REG1]], [[REG2]]
-; CHECK-LIBCALL-NEXT: cvttss2si [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, [[REG4:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: xorq [[REG3]], [[REG4]]
-; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, [[REG5:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: ucomiss [[REG1]], %xmm0
-; CHECK-LIBCALL-NEXT: cmovaeq [[REG4]], [[REG5]]
-; CHECK-LIBCALL-NEXT: popq %rcx
-; CHECK-LIBCALL-NEXT: retq
-
-; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovss {{.[A-Z_0-9]+}}(%rip), [[REG3:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vsubss [[REG3]], [[REG2]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG4]], [[REG5:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: movabsq $-9223372036854775808, [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: xorq [[REG5]], [[REG6:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvttss2si [[REG2]], [[REG7:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vucomiss [[REG3]], [[REG2]]
-; CHECK-F16C-NEXT: cmovaeq [[REG6]], %rax
-; CHECK-F16C-NEXT: retq
+; CHECK-LIBCALL-LABEL: test_fptoui_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2
+; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rcx
+; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax
+; CHECK-LIBCALL-NEXT: xorq %rcx, %rdx
+; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT: cmovaeq %rdx, %rax
+; CHECK-LIBCALL-NEXT: popq %rcx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_fptoui_i64:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2
+; CHECK-F16C-NEXT: vcvttss2si %xmm2, %rcx
+; CHECK-F16C-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-F16C-NEXT: vcvttss2si %xmm0, %rax
+; CHECK-F16C-NEXT: xorq %rcx, %rdx
+; CHECK-F16C-NEXT: vucomiss %xmm1, %xmm0
+; CHECK-F16C-NEXT: cmovaeq %rdx, %rax
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_fptoui_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __fixunssfdi
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%a = load half, half* %p, align 2
%r = fptoui half %a to i64
ret i64 %r
}
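The comment removed above ("FP_TO_UINT is expanded using FP_TO_SINT") names the lowering these new checks pin down: x86 before AVX-512 has no unsigned 64-bit convert, so the legalizer compares the input against 2^63, converts a shifted copy, and patches the sign bit back in. A minimal LLVM IR sketch of that expansion, under the assumption of round-toward-zero conversion semantics and with a hypothetical function name:

define i64 @fptoui_via_fptosi(float %x) {
  %big = fcmp oge float %x, 0x43E0000000000000  ; 2^63, exactly representable as f32
  %off = fsub float %x, 0x43E0000000000000      ; shift large inputs into signed range
  %lo  = fptosi float %x to i64                 ; correct when %x <  2^63
  %hi0 = fptosi float %off to i64               ; correct when %x >= 2^63
  %hi  = xor i64 %hi0, -9223372036854775808     ; re-add 2^63 through the sign bit
  %r   = select i1 %big, i64 %hi, i64 %lo
  ret i64 %r
}

This is the same shape as the ucomiss/cmovaeq sequence in the CHECK-LIBCALL and CHECK-F16C bodies: the subtract-and-xor result is selected only when the input is at least 2^63.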
define void @test_uitofp_i64(i64 %a, half* %p) #0 {
-; CHECK-LABEL: test_uitofp_i64:
-; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]]
-; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]]
-
-; simple conversion to float if non-negative
-; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]]
-; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]]
-
-; convert using shift+or if negative
-; CHECK-NEXT: [[LABEL1]]:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]]
-; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]]
-; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]]
-; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]]
-
-; convert float to half
-; CHECK-NEXT: [[LABEL2]]:
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
-; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
-; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
-; CHECK-F16C-NEXT: movw %ax, (%rsi)
-; CHECK-NEXT: retq
-
+; CHECK-LIBCALL-LABEL: test_uitofp_i64:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
+; CHECK-LIBCALL-NEXT: testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT: js .LBB10_1
+; CHECK-LIBCALL-NEXT: # BB#2:
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: jmp .LBB10_3
+; CHECK-LIBCALL-NEXT: .LBB10_1:
+; CHECK-LIBCALL-NEXT: movq %rdi, %rax
+; CHECK-LIBCALL-NEXT: shrq %rax
+; CHECK-LIBCALL-NEXT: andl $1, %edi
+; CHECK-LIBCALL-NEXT: orq %rax, %rdi
+; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
+; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: .LBB10_3:
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movw %ax, (%rbx)
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_uitofp_i64:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: testq %rdi, %rdi
+; CHECK-F16C-NEXT: js .LBB10_1
+; CHECK-F16C-NEXT: # BB#2:
+; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT: jmp .LBB10_3
+; CHECK-F16C-NEXT: .LBB10_1:
+; CHECK-F16C-NEXT: movq %rdi, %rax
+; CHECK-F16C-NEXT: shrq %rax
+; CHECK-F16C-NEXT: andl $1, %edi
+; CHECK-F16C-NEXT: orq %rax, %rdi
+; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT: .LBB10_3:
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vmovd %xmm0, %eax
+; CHECK-F16C-NEXT: movw %ax, (%rsi)
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_uitofp_i64:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $24, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: xorl %eax, %eax
+; CHECK-I686-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: setns %al
+; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fadds {{\.LCPI.*}}(,%eax,4)
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%esi)
+; CHECK-I686-NEXT: addl $24, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%r = uitofp i64 %a to half
store half %r, half* %p
ret void
}
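The deleted comments ("simple conversion to float if non-negative", "convert using shift+or if negative") summarize the classic trick the expanded checks now spell out instruction by instruction: when the top bit is set, halve the value but OR the shifted-out bit back in as a sticky bit, convert signed, then double. A sketch in LLVM IR, with hypothetical names, assuming the default round-to-nearest-even conversion:

define float @uitofp_i64_sketch(i64 %a) {
  %neg = icmp slt i64 %a, 0
  br i1 %neg, label %large, label %small
small:                                 ; fits the signed range: convert directly
  %s = sitofp i64 %a to float
  br label %done
large:                                 ; halve; keep bit 0 sticky so rounding stays correct
  %hi  = lshr i64 %a, 1
  %lo  = and i64 %a, 1
  %odd = or i64 %hi, %lo
  %h   = sitofp i64 %odd to float
  %d   = fadd float %h, %h             ; doubling is exact, so no second rounding
  br label %done
done:
  %r = phi float [ %s, %small ], [ %d, %large ]
  ret float %r
}

The shrq/andl/orq/cvtsi2ssq/addss run in the .LBB10_1 blocks above is exactly this path after instruction selection.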
define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C: vcvtph2ps
+; CHECK-LIBCALL-LABEL: test_extend32_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $48, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-LIBCALL-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-LIBCALL-NEXT: insertps $32, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; CHECK-LIBCALL-NEXT: insertps $48, {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0,1,2],mem[0]
+; CHECK-LIBCALL-NEXT: movaps %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT: addq $48, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_extend32_vec4:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl 6(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: movswl 4(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm2
+; CHECK-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; CHECK-F16C-NEXT: movswl 2(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm3
+; CHECK-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; CHECK-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; CHECK-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; CHECK-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $56, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-I686-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT: addl $56, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x float>
ret <4 x float> %b
}
define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
-; CHECK-LABEL: test_extend64_vec4
-
-; CHECK-LIBCALL: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL-DAG: cvtss2sd
-; CHECK-LIBCALL: cvtss2sd
-; CHECK-F16C: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtph2ps
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C-DAG: vcvtss2sd
-; CHECK-F16C: vcvtss2sd
+; CHECK-LIBCALL-LABEL: test_extend64_vec4:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
+; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2
+; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1
+; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_extend64_vec4:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: movswl 2(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT: movswl 4(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm2
+; CHECK-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
+; CHECK-F16C-NEXT: movswl 6(%rdi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm3
+; CHECK-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
+; CHECK-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; CHECK-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; CHECK-F16C-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_extend64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $88, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT: movzwl 6(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 4(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl 2(%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT: movzwl (%esi), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-I686-NEXT: addl $88, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: retl
%a = load <4 x half>, <4 x half>* %p, align 8
%b = fpext <4 x half> %a to <4 x double>
ret <4 x double> %b
}
-define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc32_vec4:
-
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-LIBCALL: callq __gnu_f2h_ieee
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK-F16C: vcvtps2ph
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
+; NOF16-BWINSTS-LABEL: test_trunc32_vec4:
+; NOF16-BWINSTS: # BB#0:
+; NOF16-BWINSTS-NEXT: pushq %rbp
+; NOF16-BWINSTS-NEXT: pushq %r15
+; NOF16-BWINSTS-NEXT: pushq %r14
+; NOF16-BWINSTS-NEXT: pushq %rbx
+; NOF16-BWINSTS-NEXT: subq $24, %rsp
+; NOF16-BWINSTS-NEXT: movq %rdi, %rbx
+; NOF16-BWINSTS-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; NOF16-BWINSTS-NEXT: callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT: movl %eax, %r14d
+; NOF16-BWINSTS-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT: callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT: movl %eax, %r15d
+; NOF16-BWINSTS-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; NOF16-BWINSTS-NEXT: callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT: movl %eax, %ebp
+; NOF16-BWINSTS-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: callq __gnu_f2h_ieee
+; NOF16-BWINSTS-NEXT: movw %ax, (%rbx)
+; NOF16-BWINSTS-NEXT: movw %bp, 6(%rbx)
+; NOF16-BWINSTS-NEXT: movw %r15w, 4(%rbx)
+; NOF16-BWINSTS-NEXT: movw %r14w, 2(%rbx)
+; NOF16-BWINSTS-NEXT: addq $24, %rsp
+; NOF16-BWINSTS-NEXT: popq %rbx
+; NOF16-BWINSTS-NEXT: popq %r14
+; NOF16-BWINSTS-NEXT: popq %r15
+; NOF16-BWINSTS-NEXT: popq %rbp
+; NOF16-BWINSTS-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc32_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $24, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __gnu_f2h_ieee
+; BWOFF-NEXT: movw %ax, (%rbx)
+; BWOFF-NEXT: movw %bp, 6(%rbx)
+; BWOFF-NEXT: movw %r15w, 4(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $24, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_trunc32_vec4:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vmovd %xmm1, %eax
+; CHECK-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vmovd %xmm1, %ecx
+; CHECK-F16C-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vmovd %xmm1, %edx
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vmovd %xmm0, %esi
+; CHECK-F16C-NEXT: movw %si, (%rdi)
+; CHECK-F16C-NEXT: movw %dx, 6(%rdi)
+; CHECK-F16C-NEXT: movw %cx, 4(%rdi)
+; CHECK-F16C-NEXT: movw %ax, 2(%rdi)
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc32_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $44, %esp
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movaps %xmm0, %xmm1
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-I686-NEXT: movss %xmm1, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movw %ax, (%ebp)
+; CHECK-I686-NEXT: movw %bx, 6(%ebp)
+; CHECK-I686-NEXT: movw %di, 4(%ebp)
+; CHECK-I686-NEXT: movw %si, 2(%ebp)
+; CHECK-I686-NEXT: addl $44, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x float> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
}
-define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
-; CHECK-LABEL: test_trunc64_vec4:
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: callq __truncdfhf2
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
-; CHECK: movw
+define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
+; NOF16-BWINSTS-LABEL: test_trunc64_vec4:
+; NOF16-BWINSTS: # BB#0:
+; NOF16-BWINSTS-NEXT: pushq %rbp
+; NOF16-BWINSTS-NEXT: pushq %r15
+; NOF16-BWINSTS-NEXT: pushq %r14
+; NOF16-BWINSTS-NEXT: pushq %rbx
+; NOF16-BWINSTS-NEXT: subq $40, %rsp
+; NOF16-BWINSTS-NEXT: movq %rdi, %rbx
+; NOF16-BWINSTS-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; NOF16-BWINSTS-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT: callq __truncdfhf2
+; NOF16-BWINSTS-NEXT: movl %eax, %r14d
+; NOF16-BWINSTS-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; NOF16-BWINSTS-NEXT: callq __truncdfhf2
+; NOF16-BWINSTS-NEXT: movl %eax, %r15d
+; NOF16-BWINSTS-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: callq __truncdfhf2
+; NOF16-BWINSTS-NEXT: movl %eax, %ebp
+; NOF16-BWINSTS-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; NOF16-BWINSTS-NEXT: callq __truncdfhf2
+; NOF16-BWINSTS-NEXT: movw %ax, 4(%rbx)
+; NOF16-BWINSTS-NEXT: movw %bp, (%rbx)
+; NOF16-BWINSTS-NEXT: movw %r15w, 6(%rbx)
+; NOF16-BWINSTS-NEXT: movw %r14w, 2(%rbx)
+; NOF16-BWINSTS-NEXT: addq $40, %rsp
+; NOF16-BWINSTS-NEXT: popq %rbx
+; NOF16-BWINSTS-NEXT: popq %r14
+; NOF16-BWINSTS-NEXT: popq %r15
+; NOF16-BWINSTS-NEXT: popq %rbp
+; NOF16-BWINSTS-NEXT: retq
+;
+; BWOFF-LABEL: test_trunc64_vec4:
+; BWOFF: # BB#0:
+; BWOFF-NEXT: pushq %rbp
+; BWOFF-NEXT: pushq %r15
+; BWOFF-NEXT: pushq %r14
+; BWOFF-NEXT: pushq %rbx
+; BWOFF-NEXT: subq $40, %rsp
+; BWOFF-NEXT: movq %rdi, %rbx
+; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; BWOFF-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r14w
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %r15w
+; BWOFF-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, %bp
+; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT: callq __truncdfhf2
+; BWOFF-NEXT: movw %ax, 4(%rbx)
+; BWOFF-NEXT: movw %bp, (%rbx)
+; BWOFF-NEXT: movw %r15w, 6(%rbx)
+; BWOFF-NEXT: movw %r14w, 2(%rbx)
+; BWOFF-NEXT: addq $40, %rsp
+; BWOFF-NEXT: popq %rbx
+; BWOFF-NEXT: popq %r14
+; BWOFF-NEXT: popq %r15
+; BWOFF-NEXT: popq %rbp
+; BWOFF-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_trunc64_vec4:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: pushq %rbp
+; CHECK-F16C-NEXT: pushq %r15
+; CHECK-F16C-NEXT: pushq %r14
+; CHECK-F16C-NEXT: pushq %rbx
+; CHECK-F16C-NEXT: subq $88, %rsp
+; CHECK-F16C-NEXT: movq %rdi, %rbx
+; CHECK-F16C-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-F16C-NEXT: vzeroupper
+; CHECK-F16C-NEXT: callq __truncdfhf2
+; CHECK-F16C-NEXT: movl %eax, %r14d
+; CHECK-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-F16C-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-F16C-NEXT: vzeroupper
+; CHECK-F16C-NEXT: callq __truncdfhf2
+; CHECK-F16C-NEXT: movl %eax, %r15d
+; CHECK-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-F16C-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-F16C-NEXT: vzeroupper
+; CHECK-F16C-NEXT: callq __truncdfhf2
+; CHECK-F16C-NEXT: movl %eax, %ebp
+; CHECK-F16C-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-F16C-NEXT: callq __truncdfhf2
+; CHECK-F16C-NEXT: movw %ax, 4(%rbx)
+; CHECK-F16C-NEXT: movw %bp, (%rbx)
+; CHECK-F16C-NEXT: movw %r15w, 6(%rbx)
+; CHECK-F16C-NEXT: movw %r14w, 2(%rbx)
+; CHECK-F16C-NEXT: addq $88, %rsp
+; CHECK-F16C-NEXT: popq %rbx
+; CHECK-F16C-NEXT: popq %r14
+; CHECK-F16C-NEXT: popq %r15
+; CHECK-F16C-NEXT: popq %rbp
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_trunc64_vec4:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: pushl %ebp
+; CHECK-I686-NEXT: pushl %ebx
+; CHECK-I686-NEXT: pushl %edi
+; CHECK-I686-NEXT: pushl %esi
+; CHECK-I686-NEXT: subl $60, %esp
+; CHECK-I686-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %si
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %di
+; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movlps %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, %bx
+; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT: movhpd %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __truncdfhf2
+; CHECK-I686-NEXT: movw %ax, 6(%ebp)
+; CHECK-I686-NEXT: movw %bx, 4(%ebp)
+; CHECK-I686-NEXT: movw %di, 2(%ebp)
+; CHECK-I686-NEXT: movw %si, (%ebp)
+; CHECK-I686-NEXT: addl $60, %esp
+; CHECK-I686-NEXT: popl %esi
+; CHECK-I686-NEXT: popl %edi
+; CHECK-I686-NEXT: popl %ebx
+; CHECK-I686-NEXT: popl %ebp
+; CHECK-I686-NEXT: retl
%v = fptrunc <4 x double> %a to <4 x half>
store <4 x half> %v, <4 x half>* %p
ret void
@@ -268,44 +836,99 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
declare float @test_floatret();
-; On i686, if SSE2 is available, the return value from test_floatret is loaded
-; to f80 and then rounded to f32. The DAG combiner should not combine this
-; fp_round and the subsequent fptrunc from float to half.
define half @test_f80trunc_nodagcombine() #0 {
-; CHECK-LABEL: test_f80trunc_nodagcombine:
-; CHECK-I686-NOT: calll __truncxfhf2
+; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rax
+; CHECK-LIBCALL-NEXT: callq test_floatret
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: popq %rax
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_f80trunc_nodagcombine:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: pushq %rax
+; CHECK-F16C-NEXT: callq test_floatret
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: popq %rax
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $12, %esp
+; CHECK-I686-NEXT: calll test_floatret
+; CHECK-I686-NEXT: fstps (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: addl $12, %esp
+; CHECK-I686-NEXT: retl
%1 = call float @test_floatret()
%2 = fptrunc float %1 to half
ret half %2
}
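For context on what this test still guards: on i686 with SSE2, test_floatret's result comes back on the x87 stack as an 80-bit value, so the DAG carries two separate roundings that must not be merged. An informal sketch of the node shapes (an assumption about how the backend of this era models it, not verbatim SelectionDAG output):

;  t1: f80 = <call result in %st(0)>
;  t2: f32 = fp_round t1              ; the fstps in CHECK-I686
;  t3: f16 = fp_round t2              ; lowered to calll __gnu_f2h_ieee

Folding t3(t2(t1)) into a single f80-to-f16 rounding would emit calll __truncxfhf2 and round only once, which can disagree with the two-step result near half-precision ties; the old CHECK-I686-NOT line asserted exactly that this combine does not fire.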
-; CHECK-LABEL: test_sitofp_fadd_i32:
-; CHECK-LIBCALL-NEXT: pushq %rbx
-; CHECK-LIBCALL-NEXT: subq $16, %rsp
-; CHECK-LIBCALL-NEXT: movl %edi, %ebx
-; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
-; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
-; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
-; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
-; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
-; CHECK-LIBCALL-NEXT: addq $16, %rsp
-; CHECK-LIBCALL-NEXT: popq %rbx
-; CHECK-LIBCALL-NEXT: retq
-; CHECK-F16C-NEXT: movswl (%rsi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
-; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-F16C-NEXT: retq
define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
+; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movl %edi, %ebx
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+;
+; CHECK-F16C-LABEL: test_sitofp_fadd_i32:
+; CHECK-F16C: # BB#0:
+; CHECK-F16C-NEXT: movswl (%rsi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-F16C-NEXT: retq
+;
+; CHECK-I686-LABEL: test_sitofp_fadd_i32:
+; CHECK-I686: # BB#0:
+; CHECK-I686-NEXT: subl $28, %esp
+; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT: movzwl (%eax), %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-I686-NEXT: xorps %xmm0, %xmm0
+; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, (%esp)
+; CHECK-I686-NEXT: calll __gnu_f2h_ieee
+; CHECK-I686-NEXT: movzwl %ax, %eax
+; CHECK-I686-NEXT: movl %eax, (%esp)
+; CHECK-I686-NEXT: calll __gnu_h2f_ieee
+; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload
+; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0
+; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT: addl $28, %esp
+; CHECK-I686-NEXT: retl
%tmp0 = load half, half* %b
%tmp1 = sitofp i32 %a to half
%tmp2 = fadd half %tmp0, %tmp1
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index ceb46571190..54214764c9a 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -112,23 +112,23 @@ define void @i56_and_or(i56* %a) {
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
; CHECK-LABEL: i56_insert_bit:
; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: movl (%rdi), %esi
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: orq %rdx, %rsi
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movw %cx, 4(%rdi)
+; CHECK-NEXT: movzwl 4(%rdi), %eax
+; CHECK-NEXT: movzbl 6(%rdi), %ecx
+; CHECK-NEXT: movl (%rdi), %edx
+; CHECK-NEXT: movb %cl, 6(%rdi)
+; CHECK-NEXT: movzbl %sil, %esi
+; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; CHECK-NEXT: shll $16, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: shlq $13, %rsi
+; CHECK-NEXT: movabsq $72057594037919743, %rax # imm = 0xFFFFFFFFFFDFFF
+; CHECK-NEXT: andq %rdx, %rax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: movl %eax, (%rdi)
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movw %ax, 4(%rdi)
; CHECK-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index d545b477e10..a8b594904b6 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -17,7 +17,7 @@ define i32 @test_mul_by_1(i32 %x) {
; X64-HSW-LABEL: test_mul_by_1:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
; X64-JAG: # BB#0:
@@ -32,7 +32,7 @@ define i32 @test_mul_by_1(i32 %x) {
; HSW-NOOPT-LABEL: test_mul_by_1:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
; JAG-NOOPT: # BB#0:
@@ -63,7 +63,7 @@ define i32 @test_mul_by_2(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
; X64-JAG: # BB#0:
@@ -81,7 +81,7 @@ define i32 @test_mul_by_2(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
; JAG-NOOPT: # BB#0:
@@ -114,7 +114,7 @@ define i32 @test_mul_by_3(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
; X64-JAG: # BB#0:
@@ -131,7 +131,7 @@ define i32 @test_mul_by_3(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
; JAG-NOOPT: # BB#0:
@@ -165,7 +165,7 @@ define i32 @test_mul_by_4(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
; X64-JAG: # BB#0:
@@ -183,7 +183,7 @@ define i32 @test_mul_by_4(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
; JAG-NOOPT: # BB#0:
@@ -216,7 +216,7 @@ define i32 @test_mul_by_5(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
; X64-JAG: # BB#0:
@@ -233,7 +233,7 @@ define i32 @test_mul_by_5(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
; JAG-NOOPT: # BB#0:
@@ -269,7 +269,7 @@ define i32 @test_mul_by_6(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
; X64-JAG: # BB#0:
@@ -285,8 +285,8 @@ define i32 @test_mul_by_6(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_6:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
; JAG-NOOPT: # BB#0:
@@ -321,7 +321,7 @@ define i32 @test_mul_by_7(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
; X64-JAG: # BB#0:
@@ -337,8 +337,8 @@ define i32 @test_mul_by_7(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_7:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
; JAG-NOOPT: # BB#0:
@@ -371,7 +371,7 @@ define i32 @test_mul_by_8(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
; X64-JAG: # BB#0:
@@ -389,7 +389,7 @@ define i32 @test_mul_by_8(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
; JAG-NOOPT: # BB#0:
@@ -422,7 +422,7 @@ define i32 @test_mul_by_9(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
; X64-JAG: # BB#0:
@@ -439,7 +439,7 @@ define i32 @test_mul_by_9(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
; JAG-NOOPT: # BB#0:
@@ -475,7 +475,7 @@ define i32 @test_mul_by_10(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
; X64-JAG: # BB#0:
@@ -491,8 +491,8 @@ define i32 @test_mul_by_10(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_10:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
; JAG-NOOPT: # BB#0:
@@ -527,7 +527,7 @@ define i32 @test_mul_by_11(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
; X64-JAG: # BB#0:
@@ -543,8 +543,8 @@ define i32 @test_mul_by_11(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_11:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
; JAG-NOOPT: # BB#0:
@@ -575,9 +575,9 @@ define i32 @test_mul_by_12(i32 %x) {
; X64-HSW-LABEL: test_mul_by_12:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
; X64-JAG: # BB#0:
@@ -593,8 +593,8 @@ define i32 @test_mul_by_12(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_12:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
; JAG-NOOPT: # BB#0:
@@ -629,7 +629,7 @@ define i32 @test_mul_by_13(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
; X64-JAG: # BB#0:
@@ -645,8 +645,8 @@ define i32 @test_mul_by_13(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_13:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
; JAG-NOOPT: # BB#0:
@@ -681,7 +681,7 @@ define i32 @test_mul_by_14(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
; X64-JAG: # BB#0:
@@ -698,8 +698,8 @@ define i32 @test_mul_by_14(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_14:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
; JAG-NOOPT: # BB#0:
@@ -732,7 +732,7 @@ define i32 @test_mul_by_15(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
; X64-JAG: # BB#0:
@@ -748,8 +748,8 @@ define i32 @test_mul_by_15(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_15:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
; JAG-NOOPT: # BB#0:
@@ -780,9 +780,9 @@ define i32 @test_mul_by_16(i32 %x) {
;
; X64-HSW-LABEL: test_mul_by_16:
; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: shll $4, %edi # sched: [1:1.00]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
; X64-JAG: # BB#0:
@@ -798,9 +798,9 @@ define i32 @test_mul_by_16(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_16:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
; JAG-NOOPT: # BB#0:
@@ -836,9 +836,9 @@ define i32 @test_mul_by_17(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
+; X64-HSW-NEXT: shll $4, %eax # sched: [1:1.00]
; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
; X64-JAG: # BB#0:
@@ -855,8 +855,8 @@ define i32 @test_mul_by_17(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_17:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
; JAG-NOOPT: # BB#0:
@@ -892,7 +892,7 @@ define i32 @test_mul_by_18(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
; X64-JAG: # BB#0:
@@ -908,8 +908,8 @@ define i32 @test_mul_by_18(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_18:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
; JAG-NOOPT: # BB#0:
@@ -944,9 +944,9 @@ define i32 @test_mul_by_19(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50]
+; X64-HSW-NEXT: shll $2, %eax # sched: [1:1.00]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
; X64-JAG: # BB#0:
@@ -963,8 +963,8 @@ define i32 @test_mul_by_19(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_19:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
; JAG-NOOPT: # BB#0:
@@ -995,9 +995,9 @@ define i32 @test_mul_by_20(i32 %x) {
; X64-HSW-LABEL: test_mul_by_20:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
; X64-JAG: # BB#0:
@@ -1013,8 +1013,8 @@ define i32 @test_mul_by_20(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_20:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
; JAG-NOOPT: # BB#0:
@@ -1049,7 +1049,7 @@ define i32 @test_mul_by_21(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
; X64-JAG: # BB#0:
@@ -1065,8 +1065,8 @@ define i32 @test_mul_by_21(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_21:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
; JAG-NOOPT: # BB#0:
@@ -1101,7 +1101,7 @@ define i32 @test_mul_by_22(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
; X64-JAG: # BB#0:
@@ -1118,8 +1118,8 @@ define i32 @test_mul_by_22(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_22:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
; JAG-NOOPT: # BB#0:
@@ -1152,9 +1152,9 @@ define i32 @test_mul_by_23(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50]
+; X64-HSW-NEXT: shll $3, %eax # sched: [1:1.00]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
; X64-JAG: # BB#0:
@@ -1171,8 +1171,8 @@ define i32 @test_mul_by_23(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_23:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
; JAG-NOOPT: # BB#0:
@@ -1203,9 +1203,9 @@ define i32 @test_mul_by_24(i32 %x) {
; X64-HSW-LABEL: test_mul_by_24:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: shll $3, %edi # sched: [1:1.00]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
; X64-JAG: # BB#0:
@@ -1221,8 +1221,8 @@ define i32 @test_mul_by_24(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_24:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
; JAG-NOOPT: # BB#0:
@@ -1257,7 +1257,7 @@ define i32 @test_mul_by_25(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
; X64-JAG: # BB#0:
@@ -1273,8 +1273,8 @@ define i32 @test_mul_by_25(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_25:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
; JAG-NOOPT: # BB#0:
@@ -1311,7 +1311,7 @@ define i32 @test_mul_by_26(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
; X64-JAG: # BB#0:
@@ -1328,8 +1328,8 @@ define i32 @test_mul_by_26(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_26:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
; JAG-NOOPT: # BB#0:
@@ -1362,7 +1362,7 @@ define i32 @test_mul_by_27(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
; X64-JAG: # BB#0:
@@ -1378,8 +1378,8 @@ define i32 @test_mul_by_27(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_27:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
; JAG-NOOPT: # BB#0:
@@ -1416,7 +1416,7 @@ define i32 @test_mul_by_28(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
; X64-JAG: # BB#0:
@@ -1433,8 +1433,8 @@ define i32 @test_mul_by_28(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_28:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
; JAG-NOOPT: # BB#0:
@@ -1471,7 +1471,7 @@ define i32 @test_mul_by_29(i32 %x) {
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
; X64-JAG: # BB#0:
@@ -1489,8 +1489,8 @@ define i32 @test_mul_by_29(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_29:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
; JAG-NOOPT: # BB#0:
@@ -1523,10 +1523,10 @@ define i32 @test_mul_by_30(i32 %x) {
; X64-HSW-LABEL: test_mul_by_30:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT: shll $5, %eax # sched: [1:1.00]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
; X64-JAG: # BB#0:
@@ -1543,8 +1543,8 @@ define i32 @test_mul_by_30(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_30:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
; JAG-NOOPT: # BB#0:
@@ -1576,9 +1576,9 @@ define i32 @test_mul_by_31(i32 %x) {
; X64-HSW-LABEL: test_mul_by_31:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT: shll $5, %eax # sched: [1:1.00]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
; X64-JAG: # BB#0:
@@ -1594,8 +1594,8 @@ define i32 @test_mul_by_31(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_31:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
; JAG-NOOPT: # BB#0:
@@ -1626,9 +1626,9 @@ define i32 @test_mul_by_32(i32 %x) {
;
; X64-HSW-LABEL: test_mul_by_32:
; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50]
+; X64-HSW-NEXT: shll $5, %edi # sched: [1:1.00]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
; X64-JAG: # BB#0:
@@ -1644,9 +1644,9 @@ define i32 @test_mul_by_32(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_32:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
; JAG-NOOPT: # BB#0:
@@ -1686,8 +1686,8 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
; X64-JAG: # BB#0:
@@ -1712,8 +1712,8 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25]
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
; JAG-NOOPT: # BB#0:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index ea841c761c7..bf732f2f536 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -18,7 +18,7 @@ define i64 @test_mul_by_1(i64 %x) nounwind {
; X64-HSW-LABEL: test_mul_by_1:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
; X64-JAG: # BB#0:
@@ -34,7 +34,7 @@ define i64 @test_mul_by_1(i64 %x) nounwind {
; HSW-NOOPT-LABEL: test_mul_by_1:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
; JAG-NOOPT: # BB#0:
@@ -66,7 +66,7 @@ define i64 @test_mul_by_2(i64 %x) {
; X64-HSW-LABEL: test_mul_by_2:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
; X64-JAG: # BB#0:
@@ -84,7 +84,7 @@ define i64 @test_mul_by_2(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_2:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
; JAG-NOOPT: # BB#0:
@@ -116,7 +116,7 @@ define i64 @test_mul_by_3(i64 %x) {
; X64-HSW-LABEL: test_mul_by_3:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
; X64-JAG: # BB#0:
@@ -134,7 +134,7 @@ define i64 @test_mul_by_3(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_3:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
; JAG-NOOPT: # BB#0:
@@ -166,7 +166,7 @@ define i64 @test_mul_by_4(i64 %x) {
; X64-HSW-LABEL: test_mul_by_4:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
; X64-JAG: # BB#0:
@@ -184,7 +184,7 @@ define i64 @test_mul_by_4(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_4:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
; JAG-NOOPT: # BB#0:
@@ -216,7 +216,7 @@ define i64 @test_mul_by_5(i64 %x) {
; X64-HSW-LABEL: test_mul_by_5:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
; X64-JAG: # BB#0:
@@ -234,7 +234,7 @@ define i64 @test_mul_by_5(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_5:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
; JAG-NOOPT: # BB#0:
@@ -268,7 +268,7 @@ define i64 @test_mul_by_6(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
; X64-JAG: # BB#0:
@@ -287,7 +287,7 @@ define i64 @test_mul_by_6(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_6:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
; JAG-NOOPT: # BB#0:
@@ -323,7 +323,7 @@ define i64 @test_mul_by_7(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
; X64-JAG: # BB#0:
@@ -342,7 +342,7 @@ define i64 @test_mul_by_7(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_7:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
; JAG-NOOPT: # BB#0:
@@ -375,7 +375,7 @@ define i64 @test_mul_by_8(i64 %x) {
; X64-HSW-LABEL: test_mul_by_8:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
; X64-JAG: # BB#0:
@@ -393,7 +393,7 @@ define i64 @test_mul_by_8(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_8:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
; JAG-NOOPT: # BB#0:
@@ -425,7 +425,7 @@ define i64 @test_mul_by_9(i64 %x) {
; X64-HSW-LABEL: test_mul_by_9:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
; X64-JAG: # BB#0:
@@ -443,7 +443,7 @@ define i64 @test_mul_by_9(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_9:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
; JAG-NOOPT: # BB#0:
@@ -477,7 +477,7 @@ define i64 @test_mul_by_10(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
; X64-JAG: # BB#0:
@@ -496,7 +496,7 @@ define i64 @test_mul_by_10(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_10:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
; JAG-NOOPT: # BB#0:
@@ -532,7 +532,7 @@ define i64 @test_mul_by_11(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
; X64-JAG: # BB#0:
@@ -551,7 +551,7 @@ define i64 @test_mul_by_11(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_11:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
; JAG-NOOPT: # BB#0:
@@ -585,7 +585,7 @@ define i64 @test_mul_by_12(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
; X64-JAG: # BB#0:
@@ -604,7 +604,7 @@ define i64 @test_mul_by_12(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_12:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
; JAG-NOOPT: # BB#0:
@@ -640,7 +640,7 @@ define i64 @test_mul_by_13(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
; X64-JAG: # BB#0:
@@ -659,7 +659,7 @@ define i64 @test_mul_by_13(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_13:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
; JAG-NOOPT: # BB#0:
@@ -696,7 +696,7 @@ define i64 @test_mul_by_14(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
; X64-JAG: # BB#0:
@@ -716,7 +716,7 @@ define i64 @test_mul_by_14(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_14:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
; JAG-NOOPT: # BB#0:
@@ -751,7 +751,7 @@ define i64 @test_mul_by_15(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
; X64-JAG: # BB#0:
@@ -770,7 +770,7 @@ define i64 @test_mul_by_15(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_15:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
; JAG-NOOPT: # BB#0:
@@ -804,7 +804,7 @@ define i64 @test_mul_by_16(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
; X64-JAG: # BB#0:
@@ -824,7 +824,7 @@ define i64 @test_mul_by_16(i64 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
; JAG-NOOPT: # BB#0:
@@ -864,7 +864,7 @@ define i64 @test_mul_by_17(i64 %x) {
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
; X64-JAG: # BB#0:
@@ -884,7 +884,7 @@ define i64 @test_mul_by_17(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_17:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
; JAG-NOOPT: # BB#0:
@@ -920,7 +920,7 @@ define i64 @test_mul_by_18(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
; X64-JAG: # BB#0:
@@ -939,7 +939,7 @@ define i64 @test_mul_by_18(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_18:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
; JAG-NOOPT: # BB#0:
@@ -977,7 +977,7 @@ define i64 @test_mul_by_19(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
; X64-JAG: # BB#0:
@@ -997,7 +997,7 @@ define i64 @test_mul_by_19(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_19:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
; JAG-NOOPT: # BB#0:
@@ -1031,7 +1031,7 @@ define i64 @test_mul_by_20(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
; X64-JAG: # BB#0:
@@ -1050,7 +1050,7 @@ define i64 @test_mul_by_20(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_20:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
; JAG-NOOPT: # BB#0:
@@ -1086,7 +1086,7 @@ define i64 @test_mul_by_21(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
; X64-JAG: # BB#0:
@@ -1105,7 +1105,7 @@ define i64 @test_mul_by_21(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_21:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
; JAG-NOOPT: # BB#0:
@@ -1142,7 +1142,7 @@ define i64 @test_mul_by_22(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
; X64-JAG: # BB#0:
@@ -1162,7 +1162,7 @@ define i64 @test_mul_by_22(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_22:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
; JAG-NOOPT: # BB#0:
@@ -1199,7 +1199,7 @@ define i64 @test_mul_by_23(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
; X64-JAG: # BB#0:
@@ -1219,7 +1219,7 @@ define i64 @test_mul_by_23(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_23:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
; JAG-NOOPT: # BB#0:
@@ -1253,7 +1253,7 @@ define i64 @test_mul_by_24(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
; X64-JAG: # BB#0:
@@ -1272,7 +1272,7 @@ define i64 @test_mul_by_24(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_24:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
; JAG-NOOPT: # BB#0:
@@ -1308,7 +1308,7 @@ define i64 @test_mul_by_25(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
; X64-JAG: # BB#0:
@@ -1327,7 +1327,7 @@ define i64 @test_mul_by_25(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_25:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
; JAG-NOOPT: # BB#0:
@@ -1365,7 +1365,7 @@ define i64 @test_mul_by_26(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
; X64-JAG: # BB#0:
@@ -1385,7 +1385,7 @@ define i64 @test_mul_by_26(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_26:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
; JAG-NOOPT: # BB#0:
@@ -1420,7 +1420,7 @@ define i64 @test_mul_by_27(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
; X64-JAG: # BB#0:
@@ -1439,7 +1439,7 @@ define i64 @test_mul_by_27(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_27:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
; JAG-NOOPT: # BB#0:
@@ -1477,7 +1477,7 @@ define i64 @test_mul_by_28(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
; X64-JAG: # BB#0:
@@ -1497,7 +1497,7 @@ define i64 @test_mul_by_28(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_28:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
; JAG-NOOPT: # BB#0:
@@ -1536,7 +1536,7 @@ define i64 @test_mul_by_29(i64 %x) {
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
; X64-JAG: # BB#0:
@@ -1557,7 +1557,7 @@ define i64 @test_mul_by_29(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_29:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
; JAG-NOOPT: # BB#0:
@@ -1596,7 +1596,7 @@ define i64 @test_mul_by_30(i64 %x) {
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
; X64-JAG: # BB#0:
@@ -1617,7 +1617,7 @@ define i64 @test_mul_by_30(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_30:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
; JAG-NOOPT: # BB#0:
@@ -1654,7 +1654,7 @@ define i64 @test_mul_by_31(i64 %x) {
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
; X64-JAG: # BB#0:
@@ -1674,7 +1674,7 @@ define i64 @test_mul_by_31(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_31:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
; JAG-NOOPT: # BB#0:
@@ -1709,7 +1709,7 @@ define i64 @test_mul_by_32(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
; X64-JAG: # BB#0:
@@ -1729,7 +1729,7 @@ define i64 @test_mul_by_32(i64 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
; JAG-NOOPT: # BB#0:
@@ -1793,7 +1793,7 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
; X64-JAG: # BB#0:
@@ -1841,7 +1841,7 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
; JAG-NOOPT: # BB#0:
diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index f2b79b67877..7cb38863e89 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -59,8 +59,8 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: cmovnel %ecx, %esi
; X86-NEXT: cmpl %edx, %edi
; X86-NEXT: movl %ebp, var_50+4
-; X86-NEXT: movl %esi, var_50
; X86-NEXT: setge var_205
+; X86-NEXT: movl %esi, var_50
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: movb %bl, var_218
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 16e261bf3c5..e9e1ee5e742 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -45,15 +45,15 @@ define float @f32_no_estimate(float %x) #0 {
;
; SANDY-LABEL: f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
+; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -63,9 +63,9 @@ define float @f32_no_estimate(float %x) #0 {
;
; AVX512-LABEL: f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
+; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -113,18 +113,18 @@ define float @f32_one_step(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -139,9 +139,9 @@ define float @f32_one_step(float %x) #1 {
; AVX512-LABEL: f32_one_step:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -207,7 +207,7 @@ define float @f32_two_step(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -215,18 +215,18 @@ define float @f32_two_step(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -245,13 +245,13 @@ define float @f32_two_step(float %x) #2 {
; AVX512-LABEL: f32_two_step:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -284,15 +284,15 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -302,9 +302,9 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [?:5.000000e-01]
+; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -350,21 +350,21 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -379,17 +379,17 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -453,9 +453,9 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -463,18 +463,18 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -493,24 +493,24 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -546,15 +546,15 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL: # BB#0:
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
-; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -565,8 +565,8 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # BB#0:
; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -621,19 +621,19 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -647,18 +647,18 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; KNL-LABEL: v8f32_one_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -737,7 +737,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -745,18 +745,18 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -774,25 +774,25 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; KNL-LABEL: v8f32_two_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 440a6f0bef1..a3ac4596c07 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -39,26 +39,26 @@ define float @f32_no_step_2(float %x) #3 {
; SANDY-LABEL: f32_no_step_2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_no_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1234.0, %x
ret float %div
}
@@ -110,39 +110,39 @@ define float @f32_one_step_2(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_one_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 3456.0, %x
ret float %div
}
@@ -198,43 +198,43 @@ define float @f32_one_step_2_divs(float %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_one_step_2_divs:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 3456.0, %x
%div2 = fdiv fast float %div, %x
ret float %div2
@@ -305,7 +305,7 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -313,26 +313,26 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -340,20 +340,20 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_two_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 6789.0, %x
ret float %div
}
@@ -403,51 +403,51 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
@@ -501,56 +501,56 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
%div2 = fdiv fast <4 x float> %div, %x
ret <4 x float> %div2
@@ -619,9 +619,9 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -629,26 +629,26 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [?:5.000000e-01]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -656,32 +656,32 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [?:5.000000e-01]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
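To make the Haswell check lines above concrete, here is a minimal self-contained test in the same style. This is a sketch only: the RUN invocation, in particular the -print-schedule flag and the -mtriple/-mcpu choice, is assumed from the RUN headers of these files (not visible in this excerpt), and the expected [5:0.50] is copied from the updated vmulps checks above.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s
define <4 x float> @example_mulps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: example_mulps:
; CHECK: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
  %mul = fmul <4 x float> %a0, %a1
  ret <4 x float> %mul
}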
@@ -741,49 +741,49 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_one_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
@@ -848,54 +848,54 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:2.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_one_step_2_divs:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
%div2 = fdiv fast <8 x float> %div, %x
ret <8 x float> %div2
@@ -980,7 +980,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -988,59 +988,59 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_two_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
@@ -1070,27 +1070,27 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
; SANDY-LABEL: v8f32_no_step:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_no_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_no_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
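The v8f32_no_step hunk above isolates the single biggest Haswell retag in this file, vrcpps on ymm going from [7:2.00] to [11:2.00]. As a sketch under the same assumed RUN line, using the AVX intrinsic so that only this one instruction is scheduled:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s
define <8 x float> @example_rcpps_ymm(<8 x float> %a0) {
; CHECK: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
  %r = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %r
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>)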
@@ -1125,32 +1125,32 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SANDY-LABEL: v8f32_no_step2:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_no_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_no_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
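Before the sse-schedule.ll hunks below, one pattern worth spotting: the new Haswell numbers give a folded-load form the same latency as its register form (the load cycles are no longer added in), and retq is retagged from [1:1.00] to [2:1.00] throughout. A sketch with values taken from the test_addps hunk that follows, under the same assumed RUN line as above:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s
define <4 x float> @example_addps_fold(<4 x float> %a0, <4 x float> %a1, <4 x float>* %p) {
; CHECK: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; CHECK: vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
; CHECK: retq # sched: [2:1.00]
  %add = fadd <4 x float> %a0, %a1
  %ld = load <4 x float>, <4 x float>* %p, align 16
  %res = fadd <4 x float> %add, %ld
  ret <4 x float> %res
}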
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 52e6b61aedf..e94935972c1 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -31,14 +31,14 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_addps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
@@ -73,14 +73,14 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_addss:
; SANDY: # BB#0:
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addss:
; BTVER2: # BB#0:
@@ -122,15 +122,15 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_andps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andps:
; BTVER2: # BB#0:
@@ -176,15 +176,15 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_andnotps:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotps:
; BTVER2: # BB#0:
@@ -228,16 +228,16 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_cmpps:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpps:
; BTVER2: # BB#0:
@@ -277,13 +277,13 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpss:
; BTVER2: # BB#0:
@@ -347,30 +347,30 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_comiss:
; SANDY: # BB#0:
; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comiss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %cl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %cl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %dl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %dl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_comiss:
; BTVER2: # BB#0:
@@ -417,17 +417,17 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ss:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2ss:
; BTVER2: # BB#0:
@@ -466,17 +466,17 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
;
; SANDY-LABEL: test_cvtsi2ssq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ssq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2ssq:
; BTVER2: # BB#0:
@@ -515,17 +515,17 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2si:
; BTVER2: # BB#0:
@@ -567,17 +567,17 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2siq:
; BTVER2: # BB#0:
@@ -619,17 +619,17 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttss2si:
; BTVER2: # BB#0:
@@ -668,17 +668,17 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvttss2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttss2siq:
; BTVER2: # BB#0:
@@ -714,15 +714,15 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_divps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
@@ -756,15 +756,15 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
;
; SANDY-LABEL: test_divss:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divss:
; BTVER2: # BB#0:
@@ -799,14 +799,14 @@ define void @test_ldmxcsr(i32 %a0) {
; SANDY-LABEL: test_ldmxcsr:
; SANDY: # BB#0:
; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ldmxcsr:
; HASWELL: # BB#0:
; HASWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ldmxcsr:
; BTVER2: # BB#0:
@@ -843,14 +843,14 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxps:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxps:
; BTVER2: # BB#0:
@@ -886,14 +886,14 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_maxss:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxss:
; BTVER2: # BB#0:
@@ -929,14 +929,14 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minps:
; SANDY: # BB#0:
; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minps:
; BTVER2: # BB#0:
@@ -972,14 +972,14 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_minss:
; SANDY: # BB#0:
; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minss:
; BTVER2: # BB#0:
@@ -1017,17 +1017,17 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movaps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
@@ -1068,12 +1068,12 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; SANDY-LABEL: test_movhlps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhlps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhlps:
; BTVER2: # BB#0:
@@ -1111,17 +1111,17 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhps:
; BTVER2: # BB#0:
@@ -1164,13 +1164,13 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlhps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlhps:
; BTVER2: # BB#0:
@@ -1206,17 +1206,17 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlps:
; BTVER2: # BB#0:
@@ -1254,13 +1254,13 @@ define i32 @test_movmskps(<4 x float> %a0) {
;
; SANDY-LABEL: test_movmskps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskps:
; BTVER2: # BB#0:
@@ -1295,13 +1295,13 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movntps:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
@@ -1335,17 +1335,17 @@ define void @test_movss_mem(float* %a0, float* %a1) {
;
; SANDY-LABEL: test_movss_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movss_mem:
; BTVER2: # BB#0:
@@ -1383,13 +1383,13 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
;
; SANDY-LABEL: test_movss_reg:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_reg:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movss_reg:
; BTVER2: # BB#0:
@@ -1423,17 +1423,17 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_movups:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
@@ -1469,14 +1469,14 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
@@ -1511,14 +1511,14 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_mulss:
; SANDY: # BB#0:
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulss:
; BTVER2: # BB#0:
@@ -1560,15 +1560,15 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orps:
; BTVER2: # BB#0:
@@ -1609,13 +1609,13 @@ define void @test_prefetchnta(i8* %a0) {
;
; SANDY-LABEL: test_prefetchnta:
; SANDY: # BB#0:
-; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_prefetchnta:
; HASWELL: # BB#0:
-; HASWELL-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: prefetchnta (%rdi) # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_prefetchnta:
; BTVER2: # BB#0:
@@ -1652,17 +1652,17 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_rcpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [7:3.00]
+; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
@@ -1708,18 +1708,18 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; SANDY-LABEL: test_rcpss:
; SANDY: # BB#0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpss:
; BTVER2: # BB#0:
@@ -1765,16 +1765,16 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
@@ -1819,19 +1819,19 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
;
; SANDY-LABEL: test_rsqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtss:
; BTVER2: # BB#0:
@@ -1875,12 +1875,12 @@ define void @test_sfence() {
; SANDY-LABEL: test_sfence:
; SANDY: # BB#0:
; SANDY-NEXT: sfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: sfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: sfence # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sfence:
; BTVER2: # BB#0:
@@ -1917,14 +1917,14 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufps:
; BTVER2: # BB#0:
@@ -1962,17 +1962,17 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [14:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
@@ -2017,19 +2017,19 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_sqrtss:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtss:
; BTVER2: # BB#0:
@@ -2067,15 +2067,15 @@ define i32 @test_stmxcsr() {
;
; SANDY-LABEL: test_stmxcsr:
; SANDY: # BB#0:
-; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_stmxcsr:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
-; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_stmxcsr:
; BTVER2: # BB#0:
@@ -2112,14 +2112,14 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
@@ -2154,14 +2154,14 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
; SANDY-LABEL: test_subss:
; SANDY: # BB#0:
; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subss:
; BTVER2: # BB#0:
@@ -2220,30 +2220,30 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_ucomiss:
; SANDY: # BB#0:
; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomiss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %cl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %cl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %dl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %dl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ucomiss:
; BTVER2: # BB#0:
@@ -2292,14 +2292,14 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpckhps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhps:
; BTVER2: # BB#0:
@@ -2338,14 +2338,14 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklps:
; BTVER2: # BB#0:
@@ -2387,15 +2387,15 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorps:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse2-schedule.ll b/llvm/test/CodeGen/X86/sse2-schedule.ll
index 14c155c8c6c..29782d8c01a 100644
--- a/llvm/test/CodeGen/X86/sse2-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse2-schedule.ll
@@ -31,14 +31,14 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
@@ -73,14 +73,14 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_addsd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsd:
; BTVER2: # BB#0:
@@ -117,17 +117,17 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_andpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andpd:
; BTVER2: # BB#0:
@@ -170,17 +170,17 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_andnotpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotpd:
; BTVER2: # BB#0:
@@ -226,16 +226,16 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_cmppd:
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmppd:
; BTVER2: # BB#0:
@@ -275,13 +275,13 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpsd:
; BTVER2: # BB#0:
@@ -345,30 +345,30 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_comisd:
; SANDY: # BB#0:
; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comisd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %cl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %cl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %dl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %dl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_comisd:
; BTVER2: # BB#0:
@@ -416,16 +416,16 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_cvtdq2pd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
; BTVER2: # BB#0:
@@ -467,17 +467,17 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
;
; SANDY-LABEL: test_cvtdq2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
; BTVER2: # BB#0:
@@ -517,17 +517,17 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
; BTVER2: # BB#0:
@@ -568,17 +568,17 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvtpd2ps:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
; BTVER2: # BB#0:
@@ -620,16 +620,16 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvtps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
; BTVER2: # BB#0:
@@ -670,17 +670,17 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
;
; SANDY-LABEL: test_cvtps2pd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2pd:
; BTVER2: # BB#0:
@@ -724,14 +724,14 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2si:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2si:
; BTVER2: # BB#0:
@@ -773,17 +773,17 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvtsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2siq:
; BTVER2: # BB#0:
@@ -830,18 +830,18 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; SANDY-LABEL: test_cvtsd2ss:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2ss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
-; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [?:5.000000e-01]
+; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2ss:
; BTVER2: # BB#0:
@@ -882,16 +882,16 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; SANDY-LABEL: test_cvtsi2sd:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2sd:
; BTVER2: # BB#0:
@@ -931,16 +931,16 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; SANDY-LABEL: test_cvtsi2sdq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2sdq:
; BTVER2: # BB#0:
@@ -985,19 +985,19 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
;
; SANDY-LABEL: test_cvtss2sd:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2sd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2sd:
; BTVER2: # BB#0:
@@ -1038,17 +1038,17 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_cvttpd2dq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttpd2dq:
; BTVER2: # BB#0:
@@ -1091,16 +1091,16 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_cvttps2dq:
; SANDY: # BB#0:
; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttps2dq:
; BTVER2: # BB#0:
@@ -1139,17 +1139,17 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2si:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttsd2si:
; BTVER2: # BB#0:
@@ -1188,17 +1188,17 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
;
; SANDY-LABEL: test_cvttsd2siq:
; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttsd2siq:
; BTVER2: # BB#0:
@@ -1234,15 +1234,15 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_divpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
@@ -1276,15 +1276,15 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
;
; SANDY-LABEL: test_divsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divsd:
; BTVER2: # BB#0:
@@ -1322,12 +1322,12 @@ define void @test_lfence() {
; SANDY-LABEL: test_lfence:
; SANDY: # BB#0:
; SANDY-NEXT: lfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: lfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: lfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lfence:
; BTVER2: # BB#0:
@@ -1363,12 +1363,12 @@ define void @test_mfence() {
; SANDY-LABEL: test_mfence:
; SANDY: # BB#0:
; SANDY-NEXT: mfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: mfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: mfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mfence:
; BTVER2: # BB#0:
@@ -1402,12 +1402,12 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
; SANDY-LABEL: test_maskmovdqu:
; SANDY: # BB#0:
; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovdqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovdqu:
; BTVER2: # BB#0:
@@ -1440,14 +1440,14 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxpd:
; BTVER2: # BB#0:
@@ -1483,14 +1483,14 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_maxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxsd:
; BTVER2: # BB#0:
@@ -1526,14 +1526,14 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minpd:
; SANDY: # BB#0:
; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minpd:
; BTVER2: # BB#0:
@@ -1569,14 +1569,14 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_minsd:
; SANDY: # BB#0:
; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minsd:
; BTVER2: # BB#0:
@@ -1614,17 +1614,17 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movapd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
@@ -1662,17 +1662,17 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqa:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movdqa:
; BTVER2: # BB#0:
@@ -1710,17 +1710,17 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; SANDY-LABEL: test_movdqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movdqu:
; BTVER2: # BB#0:
@@ -1768,22 +1768,22 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; SANDY-LABEL: test_movd:
; SANDY: # BB#0:
; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25]
+; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movd:
; BTVER2: # BB#0:
@@ -1838,23 +1838,23 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
;
; SANDY-LABEL: test_movd_64:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33]
-; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd_64:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movd_64:
; BTVER2: # BB#0:
@@ -1900,17 +1900,17 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movhpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhpd:
; BTVER2: # BB#0:
@@ -1951,17 +1951,17 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; SANDY-LABEL: test_movlpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlpd:
; BTVER2: # BB#0:
@@ -1998,13 +1998,13 @@ define i32 @test_movmskpd(<2 x double> %a0) {
;
; SANDY-LABEL: test_movmskpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskpd:
; BTVER2: # BB#0:
@@ -2039,14 +2039,14 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntdqa:
; BTVER2: # BB#0:
@@ -2080,14 +2080,14 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movntpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
@@ -2123,17 +2123,17 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
;
; SANDY-LABEL: test_movq_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movq_mem:
; BTVER2: # BB#0:
@@ -2174,13 +2174,13 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; SANDY: # BB#0:
; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_reg:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; HASWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movq_reg:
; BTVER2: # BB#0:
@@ -2216,17 +2216,17 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
;
; SANDY-LABEL: test_movsd_mem:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsd_mem:
; BTVER2: # BB#0:
@@ -2266,12 +2266,12 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; SANDY-LABEL: test_movsd_reg:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_reg:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsd_reg:
; BTVER2: # BB#0:
@@ -2305,17 +2305,17 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_movupd:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [?:1.000000e+00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
@@ -2351,14 +2351,14 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_mulpd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
@@ -2393,14 +2393,14 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_mulsd:
; SANDY: # BB#0:
; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulsd:
; BTVER2: # BB#0:
@@ -2437,17 +2437,17 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orpd:
; BTVER2: # BB#0:
@@ -2496,14 +2496,14 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packssdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packssdw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packssdw:
; BTVER2: # BB#0:
@@ -2548,14 +2548,14 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packsswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packsswb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packsswb:
; BTVER2: # BB#0:
@@ -2600,14 +2600,14 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_packuswb:
; SANDY: # BB#0:
; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packuswb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packuswb:
; BTVER2: # BB#0:
@@ -2648,14 +2648,14 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddb:
; BTVER2: # BB#0:
@@ -2694,14 +2694,14 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_paddd:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddd:
; BTVER2: # BB#0:
@@ -2736,14 +2736,14 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_paddq:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddq:
; BTVER2: # BB#0:
@@ -2782,14 +2782,14 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddsb:
; BTVER2: # BB#0:
@@ -2829,14 +2829,14 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_paddsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddsw:
; BTVER2: # BB#0:
@@ -2876,14 +2876,14 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_paddusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddusb:
; BTVER2: # BB#0:
@@ -2923,14 +2923,14 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_paddusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddusw:
; BTVER2: # BB#0:
@@ -2970,14 +2970,14 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_paddw:
; SANDY: # BB#0:
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddw:
; BTVER2: # BB#0:
@@ -3015,16 +3015,16 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pand:
; SANDY: # BB#0:
; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pand:
; BTVER2: # BB#0:
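; Reading the throughput field: a reciprocal throughput of 0.33, as on the
; reg-reg vpand above, is the usual signature of a uop that can issue on any
; of three ports; 0.50 corresponds to two ports and 1.00 to a single port.
; This is the standard interpretation; the concrete port assignments come
; from the per-CPU scheduling model being updated here.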
@@ -3070,16 +3070,16 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pandn:
; SANDY: # BB#0:
; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pandn:
; BTVER2: # BB#0:
@@ -3122,14 +3122,14 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pavgb:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pavgb:
; BTVER2: # BB#0:
@@ -3169,14 +3169,14 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pavgw:
; SANDY: # BB#0:
; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pavgw:
; BTVER2: # BB#0:
@@ -3217,16 +3217,16 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpeqb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqb:
; BTVER2: # BB#0:
@@ -3269,16 +3269,16 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpeqd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqd:
; BTVER2: # BB#0:
@@ -3321,16 +3321,16 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpeqw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqw:
; BTVER2: # BB#0:
@@ -3374,16 +3374,16 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pcmpgtb:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtb:
; BTVER2: # BB#0:
@@ -3427,16 +3427,16 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pcmpgtd:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtd:
; BTVER2: # BB#0:
@@ -3480,16 +3480,16 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pcmpgtw:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtw:
; BTVER2: # BB#0:
@@ -3526,15 +3526,15 @@ define i16 @test_pextrw(<8 x i16> %a0) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrw:
; BTVER2: # BB#0:
@@ -3570,15 +3570,15 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
;
; SANDY-LABEL: test_pinsrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrw:
; BTVER2: # BB#0:
@@ -3620,15 +3620,15 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmaddwd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaddwd:
; BTVER2: # BB#0:
@@ -3669,14 +3669,14 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsw:
; BTVER2: # BB#0:
@@ -3716,14 +3716,14 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxub:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxub:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxub:
; BTVER2: # BB#0:
@@ -3763,14 +3763,14 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsw:
; BTVER2: # BB#0:
@@ -3810,14 +3810,14 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminub:
; SANDY: # BB#0:
; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminub:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminub:
; BTVER2: # BB#0:
@@ -3851,13 +3851,13 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
;
; SANDY-LABEL: test_pmovmskb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovmskb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovmskb:
; BTVER2: # BB#0:
@@ -3891,13 +3891,13 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhuw:
; BTVER2: # BB#0:
@@ -3932,15 +3932,15 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhw:
; BTVER2: # BB#0:
@@ -3975,15 +3975,15 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmullw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmullw:
; BTVER2: # BB#0:
@@ -4027,13 +4027,13 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuludq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmuludq:
; BTVER2: # BB#0:
@@ -4073,16 +4073,16 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_por:
; SANDY: # BB#0:
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_por:
; BTVER2: # BB#0:
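; Each test also folds a load so the reg-mem encoding gets its own 'sched:'
; line; that second value is what most of these hunks adjust (e.g. the SNB
; vpor above moves from [5:0.50] to [7:0.50] once the load's latency is
; counted). A hedged sketch of that shape, with a hypothetical name:
define <2 x i64> @sched_example_mem(<2 x i64> %a0, <2 x i64> *%p) {
  %m = load <2 x i64>, <2 x i64> *%p, align 16 ; expected to fold into the add
  %1 = add <2 x i64> %a0, %m                   ; reg-mem form: vpaddq (%rdi), ...
  ret <2 x i64> %1
}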
@@ -4126,15 +4126,15 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_psadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psadbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psadbw:
; BTVER2: # BB#0:
@@ -4176,16 +4176,16 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pshufd:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
-; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50]
+; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufd:
; BTVER2: # BB#0:
@@ -4227,16 +4227,16 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pshufhw:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufhw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufhw:
; BTVER2: # BB#0:
@@ -4278,16 +4278,16 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pshuflw:
; SANDY: # BB#0:
; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50]
+; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshuflw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00]
+; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshuflw:
; BTVER2: # BB#0:
@@ -4328,15 +4328,15 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslld:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pslld:
; BTVER2: # BB#0:
@@ -4378,12 +4378,12 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; SANDY-LABEL: test_pslldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pslldq:
; BTVER2: # BB#0:
@@ -4419,15 +4419,15 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psllq:
; BTVER2: # BB#0:
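; The shift tests cover three encodings in one function: shift by an xmm
; count, shift by a folded-load count, and shift by an immediate, each with
; its own 'sched:' line above. A sketch under the same caveats (hypothetical
; name; reconstructed around the standard SSE2 shift intrinsic):
define <2 x i64> @sched_example_shift(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%p) {
  %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; vpsllq %xmm1, ...
  %2 = load <2 x i64>, <2 x i64> *%p, align 16
  %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2)   ; vpsllq (%rdi), ...
  %4 = shl <2 x i64> %3, <i64 2, i64 2>                                   ; vpsllq $2, ...
  ret <2 x i64> %4
}
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>)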
@@ -4470,15 +4470,15 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY: # BB#0:
; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psllw:
; BTVER2: # BB#0:
@@ -4519,17 +4519,17 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrad:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrad:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrad:
; BTVER2: # BB#0:
@@ -4570,17 +4570,17 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psraw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psraw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psraw:
; BTVER2: # BB#0:
@@ -4621,17 +4621,17 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_psrld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrld:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrld:
; BTVER2: # BB#0:
@@ -4673,12 +4673,12 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; SANDY-LABEL: test_psrldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrldq:
; BTVER2: # BB#0:
@@ -4712,17 +4712,17 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_psrlq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrlq:
; BTVER2: # BB#0:
@@ -4763,17 +4763,17 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_psrlw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrlw:
; BTVER2: # BB#0:
@@ -4816,14 +4816,14 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubb:
; BTVER2: # BB#0:
@@ -4862,14 +4862,14 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psubd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubd:
; BTVER2: # BB#0:
@@ -4904,14 +4904,14 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_psubq:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubq:
; BTVER2: # BB#0:
@@ -4950,14 +4950,14 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubsb:
; BTVER2: # BB#0:
@@ -4997,14 +4997,14 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubsw:
; BTVER2: # BB#0:
@@ -5044,14 +5044,14 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psubusb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubusb:
; BTVER2: # BB#0:
@@ -5091,14 +5091,14 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubusw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubusw:
; BTVER2: # BB#0:
@@ -5138,14 +5138,14 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psubw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubw:
; BTVER2: # BB#0:
@@ -5184,14 +5184,14 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_punpckhbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhbw:
; BTVER2: # BB#0:
@@ -5231,16 +5231,16 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckhdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhdq:
; BTVER2: # BB#0:
@@ -5280,16 +5280,16 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; SANDY-LABEL: test_punpckhqdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhqdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhqdq:
; BTVER2: # BB#0:
@@ -5330,14 +5330,14 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpckhwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhwd:
; BTVER2: # BB#0:
@@ -5376,14 +5376,14 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_punpcklbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklbw:
; BTVER2: # BB#0:
@@ -5423,16 +5423,16 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_punpckldq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckldq:
; BTVER2: # BB#0:
@@ -5472,16 +5472,16 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; SANDY-LABEL: test_punpcklqdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50]
+; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklqdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklqdq:
; BTVER2: # BB#0:
@@ -5522,14 +5522,14 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_punpcklwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklwd:
; BTVER2: # BB#0:
@@ -5567,16 +5567,16 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pxor:
; SANDY: # BB#0:
; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pxor:
; BTVER2: # BB#0:
@@ -5616,16 +5616,16 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufpd:
; BTVER2: # BB#0:
@@ -5665,17 +5665,17 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [28:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [21:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
@@ -5720,19 +5720,19 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
;
; SANDY-LABEL: test_sqrtsd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtsd:
; BTVER2: # BB#0:
@@ -5771,14 +5771,14 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
@@ -5813,14 +5813,14 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
; SANDY-LABEL: test_subsd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subsd:
; BTVER2: # BB#0:
@@ -5879,30 +5879,30 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; SANDY-LABEL: test_ucomisd:
; SANDY: # BB#0:
; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:1.00]
+; SANDY-NEXT: sete %dl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomisd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %cl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %cl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: setnp %al # sched: [1:0.50]
-; HASWELL-NEXT: sete %dl # sched: [1:0.50]
+; HASWELL-NEXT: setnp %al # sched: [1:1.00]
+; HASWELL-NEXT: sete %dl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ucomisd:
; BTVER2: # BB#0:
@@ -5950,16 +5950,16 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpckhpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
; BTVER2: # BB#0:
@@ -6005,16 +6005,16 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
; BTVER2: # BB#0:
@@ -6053,17 +6053,17 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorpd:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index 482b2fcab64..672ae838a04 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -31,14 +31,14 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SANDY-LABEL: test_addsubpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
@@ -74,14 +74,14 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SANDY-LABEL: test_addsubps:
; SANDY: # BB#0:
; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
@@ -116,15 +116,15 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_haddpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # BB#0:
@@ -159,15 +159,15 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_haddps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # BB#0:
@@ -202,15 +202,15 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
;
; SANDY-LABEL: test_hsubpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # BB#0:
@@ -245,15 +245,15 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
;
; SANDY-LABEL: test_hsubps:
; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # BB#0:
@@ -287,13 +287,13 @@ define <16 x i8> @test_lddqu(i8* %a0) {
;
; SANDY-LABEL: test_lddqu:
; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lddqu:
; BTVER2: # BB#0:
@@ -330,16 +330,16 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_movddup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movddup:
; BTVER2: # BB#0:
@@ -380,16 +380,16 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movshdup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movshdup:
; BTVER2: # BB#0:
@@ -430,16 +430,16 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_movsldup:
; SANDY: # BB#0:
; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [?:5.000000e-01]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsldup:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index 340b9abe887..b30d8164157 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -25,17 +25,17 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
;
; SANDY-LABEL: test_blendpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
@@ -65,15 +65,15 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
;
; SANDY-LABEL: test_blendps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendps:
; BTVER2: # BB#0:
@@ -107,15 +107,15 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
;
; SANDY-LABEL: test_blendvpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvpd:
; BTVER2: # BB#0:
@@ -150,15 +150,15 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
;
; SANDY-LABEL: test_blendvps:
; SANDY: # BB#0:
-; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvps:
; BTVER2: # BB#0:
@@ -187,15 +187,15 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; SANDY-LABEL: test_dppd:
; SANDY: # BB#0:
-; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dppd:
; BTVER2: # BB#0:
@@ -224,15 +224,15 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
;
; SANDY-LABEL: test_dpps:
; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dpps:
; BTVER2: # BB#0:
@@ -262,14 +262,14 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
; SANDY-LABEL: test_insertps:
; SANDY: # BB#0:
; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_insertps:
; BTVER2: # BB#0:
@@ -296,13 +296,13 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
;
; SANDY-LABEL: test_movntdqa:
; SANDY: # BB#0:
-; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [?:5.000000e-01]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntdqa:
; BTVER2: # BB#0:
@@ -328,15 +328,15 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_mpsadbw:
; SANDY: # BB#0:
-; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mpsadbw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mpsadbw:
; BTVER2: # BB#0:
@@ -367,14 +367,14 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_packusdw:
; SANDY: # BB#0:
; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packusdw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packusdw:
; BTVER2: # BB#0:
@@ -411,14 +411,14 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; SANDY-LABEL: test_pblendvb:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendvb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pblendvb:
; BTVER2: # BB#0:
@@ -448,14 +448,14 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pblendw:
; SANDY: # BB#0:
; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pblendw:
; BTVER2: # BB#0:
@@ -484,14 +484,14 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SANDY-LABEL: test_pcmpeqq:
; SANDY: # BB#0:
; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqq:
; BTVER2: # BB#0:
@@ -521,15 +521,15 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
;
; SANDY-LABEL: test_pextrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrb:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrb:
; BTVER2: # BB#0:
@@ -558,15 +558,15 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
;
; SANDY-LABEL: test_pextrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrd:
; BTVER2: # BB#0:
@@ -594,15 +594,15 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
;
; SANDY-LABEL: test_pextrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrq:
; BTVER2: # BB#0:
@@ -630,15 +630,15 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
;
; SANDY-LABEL: test_pextrw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrw:
; BTVER2: # BB#0:
@@ -667,15 +667,15 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
;
; SANDY-LABEL: test_phminposuw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phminposuw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phminposuw:
; BTVER2: # BB#0:
@@ -704,15 +704,15 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
;
; SANDY-LABEL: test_pinsrb:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrb:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrb:
; BTVER2: # BB#0:
@@ -740,15 +740,15 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
;
; SANDY-LABEL: test_pinsrd:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrd:
; BTVER2: # BB#0:
@@ -778,17 +778,17 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
;
; SANDY-LABEL: test_pinsrq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrq:
; BTVER2: # BB#0:
@@ -819,14 +819,14 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pmaxsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsb:
; BTVER2: # BB#0:
@@ -856,14 +856,14 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsd:
; BTVER2: # BB#0:
@@ -893,14 +893,14 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pmaxud:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxud:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxud:
; BTVER2: # BB#0:
@@ -930,14 +930,14 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pmaxuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxuw:
; BTVER2: # BB#0:
@@ -967,14 +967,14 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pminsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsb:
; BTVER2: # BB#0:
@@ -1004,14 +1004,14 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsd:
; BTVER2: # BB#0:
@@ -1041,14 +1041,14 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_pminud:
; SANDY: # BB#0:
; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminud:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminud:
; BTVER2: # BB#0:
@@ -1078,14 +1078,14 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_pminuw:
; SANDY: # BB#0:
; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminuw:
; BTVER2: # BB#0:
@@ -1118,16 +1118,16 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbw:
; BTVER2: # BB#0:
@@ -1162,16 +1162,16 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbd:
; BTVER2: # BB#0:
@@ -1206,16 +1206,16 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovsxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbq:
; BTVER2: # BB#0:
@@ -1250,16 +1250,16 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovsxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxdq:
; BTVER2: # BB#0:
@@ -1294,16 +1294,16 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxwd:
; BTVER2: # BB#0:
@@ -1338,16 +1338,16 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovsxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxwq:
; BTVER2: # BB#0:
@@ -1382,16 +1382,16 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbw:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbw:
; BTVER2: # BB#0:
@@ -1426,16 +1426,16 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbd:
; BTVER2: # BB#0:
@@ -1470,16 +1470,16 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SANDY-LABEL: test_pmovzxbq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbq:
; BTVER2: # BB#0:
@@ -1514,16 +1514,16 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SANDY-LABEL: test_pmovzxdq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxdq:
; BTVER2: # BB#0:
@@ -1558,16 +1558,16 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwd:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxwd:
; BTVER2: # BB#0:
@@ -1602,16 +1602,16 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SANDY-LABEL: test_pmovzxwq:
; SANDY: # BB#0:
; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxwq:
; BTVER2: # BB#0:
@@ -1642,15 +1642,15 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmuldq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmuldq:
; BTVER2: # BB#0:
@@ -1680,15 +1680,15 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_pmulld:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulld:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulld:
; BTVER2: # BB#0:
@@ -1724,23 +1724,23 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_ptest:
; SANDY: # BB#0:
-; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: setb %cl # sched: [1:1.00]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ptest:
; HASWELL: # BB#0:
; HASWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: setb %al # sched: [1:0.50]
+; HASWELL-NEXT: setb %al # sched: [1:1.00]
; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: setb %cl # sched: [1:0.50]
+; HASWELL-NEXT: setb %cl # sched: [1:1.00]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ptest:
; BTVER2: # BB#0:
@@ -1778,16 +1778,16 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_roundpd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundpd:
; BTVER2: # BB#0:
@@ -1822,16 +1822,16 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_roundps:
; SANDY: # BB#0:
; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundps:
; BTVER2: # BB#0:
@@ -1867,16 +1867,16 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; SANDY-LABEL: test_roundsd:
; SANDY: # BB#0:
; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [5:2.00]
+; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundsd:
; BTVER2: # BB#0:
@@ -1912,16 +1912,16 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; SANDY-LABEL: test_roundss:
; SANDY: # BB#0:
; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundss:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse42-schedule.ll b/llvm/test/CodeGen/X86/sse42-schedule.ll
index afc48bc57ee..c51c50a91bd 100644
--- a/llvm/test/CodeGen/X86/sse42-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse42-schedule.ll
@@ -26,16 +26,16 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; SANDY-LABEL: crc32_32_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_8:
; HASWELL: # BB#0:
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_8:
; BTVER2: # BB#0:
@@ -68,16 +68,16 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; SANDY-LABEL: crc32_32_16:
; SANDY: # BB#0:
; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_16:
; HASWELL: # BB#0:
; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_16:
; BTVER2: # BB#0:
@@ -112,14 +112,14 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_32:
; HASWELL: # BB#0:
; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_32:
; BTVER2: # BB#0:
@@ -152,16 +152,16 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; SANDY-LABEL: crc32_64_8:
; SANDY: # BB#0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_8:
; HASWELL: # BB#0:
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_64_8:
; BTVER2: # BB#0:
@@ -196,14 +196,14 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_64:
; HASWELL: # BB#0:
; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_64_64:
; BTVER2: # BB#0:
@@ -256,20 +256,20 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestri:
; HASWELL: # BB#0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:4.00]
; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpestri:
; BTVER2: # BB#0:
@@ -320,17 +320,17 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestrm:
; HASWELL: # BB#0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpestrm:
; BTVER2: # BB#0:
@@ -369,12 +369,12 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistri:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistri:
; HASWELL: # BB#0:
@@ -383,7 +383,7 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00]
; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpistri:
; BTVER2: # BB#0:
@@ -416,15 +416,15 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pcmpistrm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistrm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpistrm:
; BTVER2: # BB#0:
@@ -453,15 +453,15 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; SANDY-LABEL: test_pcmpgtq:
; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtq:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/ssse3-schedule.ll b/llvm/test/CodeGen/X86/ssse3-schedule.ll
index 8b7a0c0ec02..353784e6dc4 100644
--- a/llvm/test/CodeGen/X86/ssse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/ssse3-schedule.ll
@@ -35,16 +35,16 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; SANDY-LABEL: test_pabsb:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsb:
; BTVER2: # BB#0:
@@ -86,16 +86,16 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; SANDY-LABEL: test_pabsd:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsd:
; BTVER2: # BB#0:
@@ -136,12 +136,12 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; SANDY-LABEL: test_pabsw:
; SANDY: # BB#0:
; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsw:
; BTVER2: # BB#0:
@@ -182,14 +182,14 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_palignr:
; SANDY: # BB#0:
; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
-; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_palignr:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
-; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_palignr:
; BTVER2: # BB#0:
@@ -223,15 +223,15 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phaddd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddd:
; BTVER2: # BB#0:
@@ -274,15 +274,15 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddsw:
; BTVER2: # BB#0:
@@ -317,15 +317,15 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phaddw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddw:
; BTVER2: # BB#0:
@@ -360,15 +360,15 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
;
; SANDY-LABEL: test_phsubd:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubd:
; BTVER2: # BB#0:
@@ -411,15 +411,15 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubsw:
; BTVER2: # BB#0:
@@ -454,15 +454,15 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_phsubw:
; SANDY: # BB#0:
-; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubw:
; BTVER2: # BB#0:
@@ -497,15 +497,15 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; SANDY-LABEL: test_pmaddubsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaddubsw:
; BTVER2: # BB#0:
@@ -538,13 +538,13 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
;
; SANDY-LABEL: test_pmulhrsw:
; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhrsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhrsw:
; BTVER2: # BB#0:
@@ -579,14 +579,14 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_pshufb:
; SANDY: # BB#0:
; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufb:
; BTVER2: # BB#0:
@@ -630,14 +630,14 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-LABEL: test_psignb:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignb:
; BTVER2: # BB#0:
@@ -681,14 +681,14 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SANDY-LABEL: test_psignd:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignd:
; BTVER2: # BB#0:
@@ -732,14 +732,14 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SANDY-LABEL: test_psignw:
; SANDY: # BB#0:
; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignw:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 4d4b7f4e822..a07d2dd90be 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -201,14 +201,14 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8
; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9
; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
@@ -328,14 +328,14 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8
; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9
; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 7a5c992bb82..d6c4ff1255e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -68,13 +68,13 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5
-; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
-; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
-; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm5
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7],ymm0[8],ymm5[9],ymm0[10],ymm5[11],ymm0[12],ymm5[13],ymm0[14],ymm5[15]
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]