author     Simon Pilgrim <llvm-dev@redking.me.uk>    2016-12-05 11:25:13 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2016-12-05 11:25:13 +0000
commit     b08c98f125f44fc7ae1101bbfcedd5c48853b52e
tree       a670a3fbbf20f13d68bc0b70781c5e5f5d748841 /llvm
parent     2dd0e1bca33c2408893f88d33c9c6595cbdc9484
[X86][SSE] Add support for combining target shuffles to UNPCKL/UNPCKH.
llvm-svn: 288663
Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                       | 92
-rw-r--r--  llvm/test/CodeGen/X86/combine-srem.ll                         |  6
-rw-r--r--  llvm/test/CodeGen/X86/combine-urem.ll                         |  6
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll    | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll  | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll         |  8
6 files changed, 79 insertions, 57 deletions
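
The core of the change is that matchBinaryVectorShuffle now asks createUnpackShuffleMask for the expected UNPCKL/UNPCKH patterns (unary and binary, built per 128-bit lane) and compares the candidate mask against them with isTargetShuffleEquivalent, instead of hard-coding a handful of 128-bit masks. As a rough illustration of what those masks look like, here is a small standalone sketch; makeUnpackMask is a hypothetical helper written for this note, not the LLVM API, but it follows the same per-lane element numbering the unpack instructions use:

```cpp
#include <cstdio>
#include <vector>

// Build the shuffle mask an UNPCKL/UNPCKH applies, working 128 bits at a
// time the way the x86 instructions do. For a binary unpack the second
// operand's elements are numbered NumElts..2*NumElts-1; for a unary unpack
// (both inputs the same register) they collapse back onto 0..NumElts-1.
static std::vector<int> makeUnpackMask(unsigned NumElts, unsigned EltBits,
                                       bool Lo, bool Unary) {
  std::vector<int> Mask;
  unsigned EltsPerLane = 128 / EltBits;
  unsigned NumLanes = (NumElts * EltBits) / 128;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    unsigned Base = Lane * EltsPerLane + (Lo ? 0 : EltsPerLane / 2);
    for (unsigned I = 0; I != EltsPerLane / 2; ++I) {
      Mask.push_back(Base + I);                          // from operand 0
      Mask.push_back(Base + I + (Unary ? 0 : NumElts));  // from operand 1
    }
  }
  return Mask;
}

int main() {
  // v8i16 (one 128-bit lane), unary UNPCKL: 0 0 1 1 2 2 3 3 -- the same
  // pattern the old hand-written checks in matchBinaryVectorShuffle matched.
  for (int M : makeUnpackMask(8, 16, /*Lo=*/true, /*Unary=*/true))
    printf("%d ", M);
  printf("\n");

  // v8i32 (two 128-bit lanes), binary UNPCKH: 2 10 3 11 6 14 7 15.
  for (int M : makeUnpackMask(8, 32, /*Lo=*/false, /*Unary=*/false))
    printf("%d ", M);
  printf("\n");
  return 0;
}
```

For the unary case both element sources collapse onto the first operand, which is why the old hand-written {0, 0, 1, 1, ...} checks fall out as a special case of the generated masks.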
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ca3831ccb0..bd8c6a2302d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25846,8 +25846,10 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                      SDValue &V1, SDValue &V2,
                                      const X86Subtarget &Subtarget,
-                                     unsigned &Shuffle, MVT &ShuffleVT) {
+                                     unsigned &Shuffle, MVT &ShuffleVT,
+                                     bool IsUnary) {
   bool FloatDomain = MaskVT.isFloatingPoint();
+  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
     if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
@@ -25875,33 +25877,65 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKL;
-      ShuffleVT = MVT::v4f32;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKH;
-      ShuffleVT = MVT::v4f32;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
-        isTargetShuffleEquivalent(
-            Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKL;
-      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
-      return true;
-    }
-    if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
-        isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
-                                         13, 14, 14, 15, 15})) {
-      V2 = V1;
-      Shuffle = X86ISD::UNPCKH;
-      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
-      return true;
+  }
+
+  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+    MVT LegalVT = MaskVT;
+    if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
+      LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+
+    SmallVector<int, 64> Unpckl, Unpckh;
+    if (IsUnary) {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+    } else {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckl);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckh);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
     }
   }
 
@@ -26167,7 +26201,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   }
 
   if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
-                               ShuffleVT)) {
+                               ShuffleVT, UnaryShuffle)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index e4cf821e003..f400781c420 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -56,12 +56,11 @@ define <4 x i32> @combine_vec_srem_by_pos1(<4 x i32> %x) {
 ; SSE-NEXT:    andl $7, %eax
 ; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE-NEXT:    pextrd $1, %xmm0, %eax
 ; SSE-NEXT:    andl $3, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_srem_by_pos1:
@@ -74,12 +73,11 @@ define <4 x i32> @combine_vec_srem_by_pos1(<4 x i32> %x) {
 ; AVX-NEXT:    andl $7, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm2
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    andl $3, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %2 = srem <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index 0dc10164a5c..0c39bb280e8 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -54,12 +54,11 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
 ; SSE-NEXT:    andl $7, %eax
 ; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE-NEXT:    pextrd $1, %xmm0, %eax
 ; SSE-NEXT:    andl $3, %eax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_urem_by_pow2b:
@@ -71,12 +70,11 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
 ; AVX-NEXT:    andl $7, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm2
 ; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    andl $3, %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
   %1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
   ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 749c49f7859..dc9df248a76 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -907,14 +907,12 @@ define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32>
 define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [18,2,19,3,22,6,23,7,26,10,27,11,30,14,31,15]
-; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [18,2,19,3,22,6,23,7,26,10,27,11,30,14,31,15]
-; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X64-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
   ret <16 x float> %res0
@@ -923,14 +921,12 @@ define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float
 define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
 ; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29]
-; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X32-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,1,17,4,20,5,21,8,24,9,25,12,28,13,29]
-; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; X64-NEXT:    retq
   %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
   ret <16 x i32> %res0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 33629a3288d..c5b4c50b2ca 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -78,14 +78,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0,
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [20,4,21,5,22,6,23,7,28,12,29,13,30,14,31,15]
-; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X32-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [20,4,21,5,22,6,23,7,28,12,29,13,30,14,31,15]
-; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X64-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %a0, <16 x i16> <i16 20, i16 4, i16 21, i16 5, i16 22, i16 6, i16 23, i16 7, i16 28, i16 12, i16 29, i16 13, i16 30, i16 14, i16 31, i16 15>, <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
@@ -94,14 +92,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a
 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; X32-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
 ; X64:       # BB#0:
-; X64-NEXT:    vmovdqu16 {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27]
-; X64-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; X64-NEXT:    retq
   %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 0, i16 16, i16 1, i16 17, i16 2, i16 18, i16 3, i16 19, i16 8, i16 24, i16 9, i16 25, i16 10, i16 26, i16 11, i16 27>, <16 x i16> %a0, <16 x i16> %a1, i16 -1)
   ret <16 x i16> %res0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 3d864358fbf..707cafe9931 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -230,12 +230,12 @@ define <16 x i8> @combine_vpperm_as_unary_unpckhbw(<16 x i8> %a0, <16 x i8> %a1)
 define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
 ; X32-LABEL: combine_vpperm_as_unpckhbw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpperm_as_unpckhbw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X64-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
   ret <16 x i8> %res0
@@ -244,12 +244,12 @@ define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
 define <16 x i8> @combine_vpperm_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
 ; X32-LABEL: combine_vpperm_as_unpcklbw:
 ; X32:       # BB#0:
-; X32-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpperm_as_unpcklbw:
 ; X64:       # BB#0:
-; X64-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 16, i8 0, i8 17, i8 1, i8 18, i8 2, i8 19, i8 3, i8 20, i8 4, i8 21, i8 5, i8 22, i8 6, i8 23, i8 7>)
   ret <16 x i8> %res0