author     Sanjay Patel <spatel@rotateright.com>    2019-02-18 16:46:12 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2019-02-18 16:46:12 +0000
commit     fff628274d462c099c17cbb20fa09beb1b8105f4 (patch)
tree       defa7765ce3a2d12867ebe7b318ad6f7f5951d13
parent     9d800a135a74a085c8fb7647ba6ef8b9daac202a (diff)
download   bcm5719-llvm-fff628274d462c099c17cbb20fa09beb1b8105f4.tar.gz
           bcm5719-llvm-fff628274d462c099c17cbb20fa09beb1b8105f4.zip
[x86] split more v8f32/v8i32 shuffles in lowering
Similar to D57867 - this is a small patch with lots of test diffs. With half-vector-width narrowing potential, using an extract + 128-bit vshufps is a win because it replaces a 256-bit shuffle with a 128-bit shuffle.

This seems like it should be a win even for targets with 'fast-variable-shuffle', but we are intentionally deferring that to an independent change to make sure that is true.

Differential Revision: https://reviews.llvm.org/D58181

llvm-svn: 354279
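For context, here is a minimal IR sketch of the kind of shuffle this change now splits: a single-input <8 x i32> shuffle that keeps four elements, some of which live in the upper 128-bit half. This is illustrative only; the function name is made up and the example is not taken from the patch's tests (which exercise the same mask via trunc patterns).

define <4 x i32> @even_lanes(<8 x i32> %v) {
  ; Lanes 0 and 2 sit in the low 128-bit half and lanes 4 and 6 in the high
  ; half, so the result can be built with vextractf128 + one 128-bit vshufps
  ; instead of a 256-bit cross-lane vpermps/vpermpd sequence.
  %s = shufflevector <8 x i32> %v, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %s
}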
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp              |   9
-rw-r--r--  llvm/test/CodeGen/X86/avx2-conversions.ll            |  10
-rw-r--r--  llvm/test/CodeGen/X86/avx2-vector-shifts.ll          |  16
-rw-r--r--  llvm/test/CodeGen/X86/combine-shl.ll                 |   6
-rw-r--r--  llvm/test/CodeGen/X86/combine-sra.ll                 |  14
-rw-r--r--  llvm/test/CodeGen/X86/combine-srl.ll                 |  10
-rw-r--r--  llvm/test/CodeGen/X86/oddshuffles.ll                 |  20
-rw-r--r--  llvm/test/CodeGen/X86/reduce-trunc-shl.ll            |   5
-rw-r--r--  llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll  |  13
-rw-r--r--  llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll        |  35
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-math-widen.ll     | 485
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-math.ll           | 485
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll   |  27
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-packus.ll         |  27
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll     |  21
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-ssat.ll           |  21
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll     |  21
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-usat.ll           |  21
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc-widen.ll          |  46
-rw-r--r--  llvm/test/CodeGen/X86/vector-trunc.ll                |  62
20 files changed, 666 insertions, 688 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e033c211943..df0009b3fd7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14605,10 +14605,11 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
- // extract128 + vunpckhps, is better than vblend + vpermps.
- // TODO: Refine to account for unary shuffle, splat, and other masks?
- if (EltWidth == 32 && NumLowerHalves &&
- HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask))
+ // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll
index 1060d70d0d0..070959eb74c 100644
--- a/llvm/test/CodeGen/X86/avx2-conversions.ll
+++ b/llvm/test/CodeGen/X86/avx2-conversions.ll
@@ -7,9 +7,8 @@
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; X32-SLOW-LABEL: trunc4:
; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X32-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
@@ -23,9 +22,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
;
; X64-SLOW-LABEL: trunc4:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 3dbaab0f447..d95f8a01451 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -376,10 +376,10 @@ entry:
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X32-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X32-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
@@ -396,10 +396,10 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 64f9f10c4c6..f9b1b930224 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -138,9 +138,9 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 4a14991c5eb..368b47accf7 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -168,9 +168,9 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -209,8 +209,8 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -248,8 +248,8 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 960aa079e4c..1c8786337ac 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -205,8 +205,8 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -417,9 +417,9 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 0f0c8239051..e0c974857cb 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -621,16 +621,16 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
;
; AVX2-SLOW-LABEL: v12i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
-; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
+; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll
index c14acc5e35c..9637711e8af 100644
--- a/llvm/test/CodeGen/X86/reduce-trunc-shl.ll
+++ b/llvm/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -13,11 +13,10 @@ define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> add
;
; AVX2-LABEL: trunc_shl_7_v4i32_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps (%rsi), %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in
%shl = shl <4 x i64> %val, <i64 7, i64 7, i64 7, i64 7>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
index 629d37bf81f..1f2adabe5fa 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
@@ -225,10 +225,9 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
@@ -706,8 +705,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -771,8 +770,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index fc6a2cd6441..3dd705d55e8 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -225,10 +225,9 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
@@ -672,8 +671,8 @@ define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -736,8 +735,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -801,8 +800,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -865,8 +864,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -928,8 +927,8 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1070,11 +1069,10 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
@@ -1202,11 +1200,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
index 553562e5e82..7129a04fa3a 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math-widen.ll
@@ -32,9 +32,8 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -102,13 +101,13 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -255,25 +254,25 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -517,8 +516,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -578,11 +577,11 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
@@ -705,21 +704,21 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -901,9 +900,8 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -971,13 +969,13 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1124,25 +1122,25 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -1354,8 +1352,8 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1415,11 +1413,11 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -1542,21 +1540,21 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -1772,10 +1770,10 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1878,19 +1876,19 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
@@ -2125,15 +2123,15 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -2141,15 +2139,15 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -2453,8 +2451,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -2514,11 +2512,11 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
@@ -2700,11 +2698,11 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -2712,11 +2710,11 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -2940,9 +2938,8 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -3006,13 +3003,13 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -3149,25 +3146,25 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -3354,8 +3351,8 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -3415,11 +3412,11 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3542,21 +3539,21 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -3736,9 +3733,8 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -3802,13 +3798,13 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -3945,25 +3941,25 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4150,8 +4146,8 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -4211,11 +4207,11 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4338,21 +4334,21 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4532,9 +4528,8 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -4598,13 +4593,13 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -4741,25 +4736,25 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4946,8 +4941,8 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -5007,11 +5002,11 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -5134,21 +5129,21 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index fdc558aae93..d28c940bcb7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -32,9 +32,8 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -102,13 +101,13 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -255,25 +254,25 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -517,8 +516,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -578,11 +577,11 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
@@ -705,21 +704,21 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -901,9 +900,8 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -971,13 +969,13 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1124,25 +1122,25 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -1354,8 +1352,8 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1415,11 +1413,11 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -1542,21 +1540,21 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -1772,10 +1770,10 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -1878,19 +1876,19 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
@@ -2125,15 +2123,15 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -2141,15 +2139,15 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -2453,8 +2451,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -2514,11 +2512,11 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
@@ -2700,11 +2698,11 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -2712,11 +2710,11 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
@@ -2940,9 +2938,8 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -3006,13 +3003,13 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -3149,25 +3146,25 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -3354,8 +3351,8 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -3415,11 +3412,11 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3542,21 +3539,21 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -3736,9 +3733,8 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -3802,13 +3798,13 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -3945,25 +3941,25 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4150,8 +4146,8 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -4211,11 +4207,11 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4338,21 +4334,21 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4532,9 +4528,8 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -4598,13 +4593,13 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -4741,25 +4736,25 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
@@ -4946,8 +4941,8 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
@@ -5007,11 +5002,11 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -5134,21 +5129,21 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
index 232c15a4026..d3dd634be2f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus-widen.ll
@@ -206,9 +206,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -615,20 +614,20 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index f49ee8f8445..f2cd43087bd 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -206,9 +206,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -615,20 +614,20 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
index 7ba3dc89aa0..01dd4c4cf30 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat-widen.ll
@@ -218,9 +218,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -644,19 +643,19 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index d690c0f2ef0..6233b7739a9 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -218,9 +218,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -644,19 +643,19 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
index 6922fde61b6..b33c85bedf4 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat-widen.ll
@@ -135,9 +135,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -409,17 +408,17 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index c5a0bab332f..d861a794ce8 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -135,9 +135,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -409,17 +408,17 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-widen.ll b/llvm/test/CodeGen/X86/vector-trunc-widen.ll
index c3f5729b137..9b6fd5a2c1e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-widen.ll
@@ -29,10 +29,10 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -72,10 +72,10 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -117,13 +117,13 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
@@ -207,11 +207,11 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1383,10 +1383,10 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 98362b11762..dae979635c0 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -29,10 +29,10 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -72,10 +72,10 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -117,13 +117,13 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
@@ -207,11 +207,11 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -303,10 +303,10 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
;
; AVX2-SLOW-LABEL: trunc8i64_8i8:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -1393,10 +1393,10 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
@@ -1507,10 +1507,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0