summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll')
-rw-r--r--llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll422
1 files changed, 362 insertions, 60 deletions
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
index 40fabcf04d4..a4698a51ba1 100644
--- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -31,15 +31,25 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -75,15 +85,24 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -149,15 +168,39 @@ define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -207,15 +250,39 @@ define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -265,15 +332,39 @@ define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $24, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -327,15 +418,32 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -389,15 +497,32 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -451,15 +576,32 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $48, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
@@ -509,15 +651,42 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -567,15 +736,30 @@ define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -625,15 +809,42 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $24, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -683,15 +894,30 @@ define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -741,15 +967,49 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $40, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -799,15 +1059,30 @@ define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $48, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -857,15 +1132,42 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $56, (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
OpenPOWER on IntegriCloud