-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp         | 124
-rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll |  30
-rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll  |  15
-rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll  |  14
-rw-r--r-- | llvm/test/CodeGen/X86/widen_conversions.ll      |   2
5 files changed, 117 insertions, 68 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9fa6267b4d2..3bd25479a6d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7393,16 +7393,21 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
-/// \brief Try to lower a vector shuffle as a zero extension.
+/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
 ///
-/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
-/// shuffle throuh a zero extension. It doesn't check for the availability or
-/// profitability of this lowering though, it tries to aggressively match this
-/// pattern. It handles both blends with all-zero inputs to explicitly
-/// zero-extend and undef-lanes (sometimes undef due to masking out later).
-static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
-                                              SDValue V2, ArrayRef<int> Mask,
-                                              SelectionDAG &DAG) {
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering, it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
   int Bits = VT.getSizeInBits();
@@ -7413,6 +7418,7 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
   // if valid.
   auto LowerWithStride = [&](int Stride) -> SDValue {
     SDValue InputV;
+    bool AnyExt = true;
     for (int i = 0; i < NumElements; ++i) {
       if (Mask[i] == -1)
         continue; // Valid anywhere but doesn't tell us anything.
@@ -7420,8 +7426,10 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
         // Each of the extend elements needs to be zeroable.
         if (!Zeroable[i])
           return SDValue();
-        else
-          continue;
+
+        // We no lorger are in the anyext case.
+        AnyExt = false;
+        continue;
       }
 
       // Each of the base elements needs to be consecutive indices into the
@@ -7442,15 +7450,68 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
     if (!InputV)
       return SDValue();
 
-    // Found a valid lowering! Compute all the types and the operation. We force
-    // everything to integer types here as that's the only way zext makes sense.
-    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
-                                 NumElements / Stride);
-
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    // Found a valid zext mask! Try various lowering strategies based on the
+    // input type and available ISA extensions.
+    if (Subtarget->hasSSE41()) {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+                                   NumElements / Stride);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    }
+
+    // For any extends we can cheat for larger element sizes and use shuffle
+    // instructions that can fold with a load and/or copy.
+    if (AnyExt && EltBits == 32) {
+      int PSHUFDMask[4] = {0, -1, 1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+    }
+    if (AnyExt && EltBits == 16 && Stride > 2) {
+      int PSHUFDMask[4] = {0, -1, 0, -1};
+      InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                           DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                           getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+      int PSHUFHWMask[4] = {1, -1, -1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+    }
+
+    // If this would require more than 2 unpack instructions to expand, use
+    // pshufb when available. We can only use more than 2 unpack instructions
+    // when zero extending i8 elements which also makes it easier to use pshufb.
+    if (Stride > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+      assert(NumElements == 16 && "Unexpected byte vector width!");
+      SDValue PSHUFBMask[16];
+      for (int i = 0; i < 16; ++i)
+        PSHUFBMask[i] =
+            DAG.getConstant((i % Stride == 0) ? i / Stride : 0x80, MVT::i8);
+      InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                 MVT::v16i8, PSHUFBMask)));
+    }
+
+    // Otherwise emit a sequence of unpacks.
+    do {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                           : getZeroVector(InputVT, Subtarget, DAG, DL);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+      Stride /= 2;
+      EltBits *= 2;
+      NumElements /= 2;
+    } while (Stride > 1);
+    return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
   };
 
   // The widest stride possible for zero extending is to a 64-bit integer.
@@ -7469,7 +7530,7 @@ static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
       return V;
   }
 
-  // No viable zext lowering found.
+  // No viable ext lowering found.
   return SDValue();
 }
 
@@ -7843,10 +7904,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt =
-            lowerVectorShuffleAsZeroExtend(DL, MVT::v4i32, V1, V2, Mask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -8519,10 +8579,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   auto isV2 = [](int M) { return M >= 8; };
@@ -8688,10 +8747,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Rotate;
 
   // Try to use a zext lowering.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   int MaskStorage[16] = {
       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index e3684bfef00..36575463da3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -515,13 +515,13 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
 ; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSE2: # BB#0:
 ; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0,0,1,1]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},1,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
@@ -533,15 +533,17 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
 }
 
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
+; SSE2-NEXT: retq
+;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT: pshufb {{.*}} # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: por %xmm0, %[[X2]]
-; SSSE3-NEXT: punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
-; SSSE3-NEXT: movdqa %[[X2]], %xmm0
+; SSSE3-NEXT: pshufb {{.*}} # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
@@ -576,16 +578,16 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
 ; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSE2: # BB#0:
-; SSE2-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
 ; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSSE3-NEXT: punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 21d74928ebf..1dbc7f5e1da 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -778,23 +778,20 @@ define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
 ; SSE2-LABEL: @shuffle_v4i32_0z1z
 ; SSE2: # BB#0:
-; SSE2-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSE3: # BB#0:
-; SSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE3-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v4i32_0z1z
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 89231586d06..79eecf32281 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1007,23 +1007,15 @@ define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
 ; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSE2: # BB#0:
 ; SSE2-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSE2-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = [[X2]][0,3,2,1]
-; SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE2-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT: pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSSE3-NEXT: pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
-; SSSE3-NEXT: punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
-; SSSE3-NEXT: movdqa %[[X2]], %xmm0
+; SSSE3-NEXT: punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSSE3-NEXT: punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz
diff --git a/llvm/test/CodeGen/X86/widen_conversions.ll b/llvm/test/CodeGen/X86/widen_conversions.ll
index 522ab475c2a..8e5174fbe76 100644
--- a/llvm/test/CodeGen/X86/widen_conversions.ll
+++ b/llvm/test/CodeGen/X86/widen_conversions.ll
@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
 ; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT: ret
 
   %val = load <4 x i8>* %ptr
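On pre-SSE4.1 targets the practical effect of the new unpack path is easiest to see in the updated shuffle_v8i16_0zzz1zzz checks above: a zext-style shuffle now lowers to a single pxor feeding a short unpack chain instead of a long pshufd/pshufhw/pshuflw sequence. Below is a minimal sketch of such a case; the function name and the exact shuffle indices are illustrative rather than copied from the test file, and the register numbers are schematic (the real checks match them with a regex).

; Hypothetical reduced example, not part of the patch: a zero-extending
; v8i16 shuffle that the new code lowers with unpacks on plain SSE2.
define <8 x i16> @zext_style_shuffle(<8 x i16> %a) {
  %s = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer,
       <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
  ret <8 x i16> %s
}
; Expected SSE2 codegen, per the shuffle_v8i16_0zzz1zzz checks above:
;   pxor      %xmm1, %xmm1    ; zero vector
;   punpcklwd %xmm1, %xmm0    ; widen i16 lanes to i32, zero in the high halves
;   punpckldq %xmm1, %xmm0    ; widen i32 lanes to i64, zero in the high halves
;   retq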