| field | value | date |
|---|---|---|
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-02-24 19:57:52 +0000 |
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-02-24 19:57:52 +0000 |
| commit | cfaf663a356d1c97b42a62f42c34f316bf0edb49 (patch) | |
| tree | 2ed1ca09a093203caa0a9802d81a3d63ede1718a /llvm | |
| parent | 3fe4bd464cc647da36a5c4f4c0015fb653f0e3b1 (diff) | |
[X86] Combine zext(packus(x),packus(y)) -> concat(x,y) (PR39637)
It's proving tricky to combine shuffles across multiple vector sizes, so for now I'm adding this more specific combine - the pattern is common enough to be worth it as a first step.
llvm-svn: 354757
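
For readers unfamiliar with the pattern, here is a minimal standalone sketch (plain C++, not LLVM code; the helper names `packus`, `zext16`, and `concat` are invented purely for illustration) of why the combine is sound: when the high byte of every 16-bit source lane is already known to be zero, which is what the `MaskedValueIsZero` guard in the patch below checks, PACKUS degenerates to a plain per-lane truncation, so zero-extending its result simply rebuilds the original lanes, i.e. `concat(x, y)`.

```cpp
// Scalar model of zext(packus(x, y)) == concat(x, y) when every 16-bit
// source lane already fits in 8 bits (high byte known zero).
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

using V8i16 = std::array<uint16_t, 8>;
using V16i8 = std::array<uint8_t, 16>;
using V16i16 = std::array<uint16_t, 16>;

// PACKUSWB-style narrowing: unsigned-saturate each 16-bit lane to 8 bits.
static V16i8 packus(const V8i16 &x, const V8i16 &y) {
  V16i8 r{};
  for (int i = 0; i < 8; ++i) {
    r[i] = static_cast<uint8_t>(std::min<uint16_t>(x[i], 255));
    r[i + 8] = static_cast<uint8_t>(std::min<uint16_t>(y[i], 255));
  }
  return r;
}

// VPMOVZXBW-style widening: zero-extend each 8-bit lane back to 16 bits.
static V16i16 zext16(const V16i8 &v) {
  V16i16 r{};
  for (int i = 0; i < 16; ++i)
    r[i] = v[i];
  return r;
}

// The combined form: no pack/unpack round trip, just concatenate the sources.
static V16i16 concat(const V8i16 &x, const V8i16 &y) {
  V16i16 r{};
  for (int i = 0; i < 8; ++i) {
    r[i] = x[i];
    r[i + 8] = y[i];
  }
  return r;
}

int main() {
  // Every lane is <= 255, mirroring the MaskedValueIsZero precondition.
  V8i16 x = {0, 1, 2, 3, 100, 200, 254, 255};
  V8i16 y = {10, 20, 30, 40, 50, 60, 70, 80};
  assert(zext16(packus(x, y)) == concat(x, y));
  return 0;
}
```

Under this precondition the pack/unpack round trip is a no-op, which is why the AVX2 test diffs below each drop a vextracti128/vpackuswb/vpmovzxbw sequence.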
Diffstat (limited to 'llvm')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 14 |
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-udiv.ll | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 9 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll | 32 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-mul.ll | 32 |

5 files changed, 40 insertions, 50 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ae7fd6c93b3..dc50eca4258 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40328,6 +40328,20 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
     return R;
 
+  // TODO: Combine with any target/faux shuffle.
+  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
+      VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+    unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
+    unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
+    APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
+    if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
+        (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
+      return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index bfd3095ab96..9da6a988e77 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -702,9 +702,6 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX2-NEXT:    vmovd %eax, %xmm2
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index ac6fe217e38..c34f46ba4d9 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -387,9 +387,6 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
@@ -910,9 +907,6 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
@@ -927,9 +921,6 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
 ; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2NOBW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
index 360b605da78..e30da07e635 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -1840,27 +1840,21 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm2
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 32da534d42d..f5ce68807ce 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1795,27 +1795,21 @@ define i8 @test_v16i8(<16 x i8> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm2
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm2
+; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax

