author     Simon Pilgrim <llvm-dev@redking.me.uk>   2018-06-08 10:29:00 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>   2018-06-08 10:29:00 +0000
commit     ad45efc445e9dcabf0fc3d42dba89910630da4e3 (patch)
tree       36cfdfa45a7ecd8887dc80422e0cfaf4bfa984f3
parent     84e6ef00dc96cedda96959f3e03852d2336c7b8c (diff)
[X86][SSE] Consistently prefer lowering to PACKUS over PACKSS
We have some combines/lowerings that attempt to use PACKSS-then-PACKUS and others that use PACKUS-then-PACKSS.
PACKUS is much easier to combine with if we know the upper bits are zero, as ComputeKnownBits can easily see through BITCASTs etc., especially now that rL333995 and rL334007 have landed. It also effectively works at the byte level, which further simplifies shuffle combines.
The only (minor) annoyances are that ComputeKnownBits can sometimes take longer, as it doesn't fail as quickly as ComputeNumSignBits (though I'm not seeing any actual regressions in tests), and that PACKUSDW is only available from SSE4.1 onwards, so we get more codegen diffs between targets.
llvm-svn: 334276
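
To make the known-zero argument concrete, here is a minimal standalone sketch (not part of the commit; it uses the SSE intrinsics rather than the DAG nodes, and the lane values are arbitrary examples). When every 32-bit lane is already known to fit in 16 unsigned bits, PACKUSDW is a plain truncation, while PACKSSDW only agrees with it if the lanes also stay below 0x8000, which is the stronger sign-bit condition the old ordering checked first.

// Sketch only: compares signed vs unsigned 32->16 packing on inputs whose
// upper 16 bits are known to be zero. Requires SSE4.1 for _mm_packus_epi32.
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  // Example lanes (arbitrary values), all representable in 16 unsigned bits.
  __m128i a = _mm_setr_epi32(0x0000, 0x00FF, 0x7FFF, 0xFFFF);
  __m128i b = _mm_setr_epi32(0x1234, 0x0001, 0x8000, 0x4242);

  __m128i us = _mm_packus_epi32(a, b); // PACKUSDW: clamp to [0, 0xFFFF]
  __m128i ss = _mm_packs_epi32(a, b);  // PACKSSDW: clamp to [-0x8000, 0x7FFF]

  unsigned short u[8], s[8];
  _mm_storeu_si128((__m128i *)u, us);
  _mm_storeu_si128((__m128i *)s, ss);

  // PACKUSDW reproduces every lane exactly; PACKSSDW saturates the lanes in
  // [0x8000, 0xFFFF] down to 0x7FFF, so it is only usable when the input has
  // at least 17 sign bits rather than just 16 known-zero top bits.
  for (int i = 0; i < 8; ++i)
    printf("lane %d: packusdw=%04x packssdw=%04x\n", i, u[i], s[i]);
  return 0;
}

Build with -msse4.1 (or equivalent); on pre-SSE4.1 targets only the byte-level PACKUSWB form exists, which is why the lowering keeps an 8-bit fallback there.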
-rw-r--r--   llvm/lib/Target/X86/X86ISelLowering.cpp             |  64
-rw-r--r--   llvm/test/CodeGen/X86/avg.ll                         |  24
-rw-r--r--   llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll   |   2
-rw-r--r--   llvm/test/CodeGen/X86/vector-trunc-packus.ll         |  58
-rw-r--r--   llvm/test/CodeGen/X86/vector-trunc-usat.ll           |  28
-rw-r--r--   llvm/test/CodeGen/X86/vector-trunc.ll                |   8
-rw-r--r--   llvm/test/CodeGen/X86/widen_arith-2.ll               |   2
7 files changed, 93 insertions, 93 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f55be2958c9..de2ea88cd66 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9404,15 +9404,6 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
   auto MatchPACK = [&](SDValue N1, SDValue N2) {
     SDValue VV1 = DAG.getBitcast(PackVT, N1);
     SDValue VV2 = DAG.getBitcast(PackVT, N2);
-    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
-        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
-      V1 = VV1;
-      V2 = VV2;
-      SrcVT = PackVT;
-      PackOpcode = X86ISD::PACKSS;
-      return true;
-    }
-
     if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
       APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
       if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
@@ -9424,7 +9415,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
         return true;
       }
     }
-
+    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+      V1 = VV1;
+      V2 = VV2;
+      SrcVT = PackVT;
+      PackOpcode = X86ISD::PACKSS;
+      return true;
+    }
     return false;
   };
 
@@ -16897,8 +16895,8 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
   if (SrcVT.is128BitVector()) {
     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
-    SDValue Res = DAG.getNode(Opcode, DL, OutVT,
-                              DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
+    In = DAG.getBitcast(InVT, In);
+    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
     Res = extractSubVector(Res, 0, DAG, DL, 64);
     return DAG.getBitcast(DstVT, Res);
   }
@@ -17057,25 +17055,26 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  // Truncate with PACKSS if we are truncating a vector with sign-bits that
-  // extend all the way to the packed/truncated value.
-  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
-  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
-    if (SDValue V =
-            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
-      return V;
+  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
   // Truncate with PACKUS if we are truncating a vector with leading zero bits
   // that extend all the way to the packed/truncated value.
   // Pre-SSE41 we can only use PACKUSWB.
   KnownBits Known;
   DAG.computeKnownBits(In, Known);
-  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
-  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
     if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;
 
+  // Truncate with PACKSS if we are truncating a vector with sign-bits that
+  // extend all the way to the packed/truncated value.
+  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+    if (SDValue V =
+            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+      return V;
+
   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget.hasInt256()) {
@@ -35282,9 +35281,6 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
       (SVT == MVT::i8 || SVT == MVT::i16) &&
       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
-    if (auto SSatVal = detectSSatPattern(In, VT))
-      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
-                                    Subtarget);
     if (auto USatVal = detectSSatPattern(In, VT, true)) {
       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
       if (SVT == MVT::i8 && InSVT == MVT::i32) {
@@ -35299,6 +35295,9 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
       return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                     Subtarget);
     }
+    if (auto SSatVal = detectSSatPattern(In, VT))
+      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+                                    Subtarget);
   }
   return SDValue();
 }
@@ -36530,22 +36529,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
     return SDValue();
 
-  // Use PACKSS if the input has sign-bits that extend all the way to the
-  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
-  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
-  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
-  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
-    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
   // Use PACKUS if the input has zero-bits that extend all the way to the
   // packed/truncated value. e.g. masks, zext_in_reg, etc.
   KnownBits Known;
   DAG.computeKnownBits(In, Known);
   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
-  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
-  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
 
+  // Use PACKSS if the input has sign-bits that extend all the way to the
+  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits)) + return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index cd8bc34f059..95cae721472 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -347,22 +347,22 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm4 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm5 -; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 @@ -434,26 +434,26 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-NEXT: vpackssdw %xmm7, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm5, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index dc60fd5c672..829dd42b8bc 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -180,7 +180,7 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-NEXT: 
vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 ; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index afd692866db..6aaa00967a4 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -1415,7 +1415,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm3 @@ -1438,8 +1438,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i8: @@ -1521,7 +1521,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm0 ; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 @@ -1544,8 +1544,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i64_v8i8: @@ -1626,7 +1626,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: packssdw %xmm5, %xmm1 +; SSE41-NEXT: packusdw %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: xorpd %xmm10, %xmm0 ; SSE41-NEXT: movapd %xmm0, %xmm4 @@ -1650,8 +1650,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packssdw %xmm4, %xmm3 -; SSE41-NEXT: packssdw %xmm3, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1678,11 +1678,11 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1698,10 +1698,10 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 ; 
AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2109,11 +2109,11 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] @@ -2676,7 +2676,7 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 ; SSE41-NEXT: movapd %xmm6, %xmm0 ; SSE41-NEXT: xorpd %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm0, %xmm2 @@ -2701,7 +2701,7 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: packssdw %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm1 ; SSE41-NEXT: movapd %xmm15, %xmm0 ; SSE41-NEXT: xorpd %xmm9, %xmm0 @@ -2727,7 +2727,7 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3 -; SSE41-NEXT: packssdw %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm2, %xmm3 ; SSE41-NEXT: movapd %xmm13, %xmm0 ; SSE41-NEXT: xorpd %xmm9, %xmm0 ; SSE41-NEXT: movapd %xmm0, %xmm2 @@ -2751,7 +2751,7 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm8 -; SSE41-NEXT: packssdw %xmm2, %xmm8 +; SSE41-NEXT: packusdw %xmm2, %xmm8 ; SSE41-NEXT: packusdw %xmm8, %xmm3 ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 @@ -2796,17 +2796,17 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpand %xmm15, %xmm6, %xmm4 -; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpand %xmm5, %xmm13, %xmm4 ; AVX1-NEXT: vpand %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm3 ; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm14, %xmm9, %xmm3 ; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper @@ -2886,7 +2886,7 @@ 
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v8i32_v8i8: @@ -2909,7 +2909,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v8i32_v8i8: @@ -2920,7 +2920,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pmaxsd %xmm2, %xmm1 ; SSE41-NEXT: pmaxsd %xmm2, %xmm0 -; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_packus_v8i32_v8i8: @@ -2932,7 +2932,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2943,7 +2943,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 95321fa4271..60e7e15c606 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -1114,7 +1114,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { ; SSE41-NEXT: por %xmm7, %xmm0 ; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packssdw %xmm4, %xmm1 +; SSE41-NEXT: packusdw %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm9, %xmm4 @@ -1137,7 +1137,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: packssdw %xmm4, %xmm5 +; SSE41-NEXT: packusdw %xmm4, %xmm5 ; SSE41-NEXT: packusdw %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1162,9 +1162,9 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1443,11 +1443,11 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vextractf128 $1, %ymm0, 
%xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] @@ -1732,7 +1732,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm14, %xmm0 ; SSE41-NEXT: movapd %xmm9, %xmm13 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13 -; SSE41-NEXT: packssdw %xmm12, %xmm13 +; SSE41-NEXT: packusdw %xmm12, %xmm13 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: movdqa %xmm11, %xmm1 @@ -1757,7 +1757,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: movapd %xmm9, %xmm1 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: packssdw %xmm12, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 ; SSE41-NEXT: packusdw %xmm1, %xmm13 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 @@ -1783,7 +1783,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: movapd %xmm9, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: packssdw %xmm1, %xmm2 +; SSE41-NEXT: packusdw %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm10, %xmm0 ; SSE41-NEXT: movdqa %xmm11, %xmm1 @@ -1806,7 +1806,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 -; SSE41-NEXT: packssdw %xmm1, %xmm9 +; SSE41-NEXT: packusdw %xmm1, %xmm9 ; SSE41-NEXT: packusdw %xmm9, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm13 ; SSE41-NEXT: movdqa %xmm13, %xmm0 @@ -1846,14 +1846,14 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm8, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 36818a6294e..3fde30c7122 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -974,10 +974,10 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: psrld $24, %xmm1 ; SSE41-NEXT: psrld $24, %xmm0 -; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: psrld $24, %xmm3 ; SSE41-NEXT: psrld $24, %xmm2 -; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: movdqu %xmm0, (%rax) ; SSE41-NEXT: retq @@ -987,11 +987,11 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/widen_arith-2.ll b/llvm/test/CodeGen/X86/widen_arith-2.ll index aa2573f9b2c..d6c6aa50997 100644 --- a/llvm/test/CodeGen/X86/widen_arith-2.ll +++ b/llvm/test/CodeGen/X86/widen_arith-2.ll @@ -24,7 +24,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: psubw %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: packsswb %xmm0, %xmm2 +; CHECK-NEXT: packuswb %xmm0, %xmm2 ; CHECK-NEXT: movq %xmm2, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: .LBB0_1: # %forcond |
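
For readers who only skim the diff above, the reordered decision in LowerTRUNCATE and combineVectorSignBitsTruncation boils down to the following condensed paraphrase. It is a sketch, not the verbatim LLVM code: the free-standing helper name tryTruncateWithPACK is invented here, and the body would only compile inside X86ISelLowering.cpp where truncateVectorWithPACK is visible.

// Condensed paraphrase of the reordered checks (assumption: collapsed into a
// single invented helper; the real logic is spread across LowerTRUNCATE and
// combineVectorSignBitsTruncation in X86ISelLowering.cpp).
static SDValue tryTruncateWithPACK(SDValue In, EVT VT, const SDLoc &DL,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  unsigned SrcBits = In.getScalarValueSizeInBits();
  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  // Pre-SSE4.1 only PACKUSWB exists, so only 8 packed zero bits are usable.
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Prefer PACKUS: the upper bits just need to be known zero, which
  // computeKnownBits can prove through bitcasts, masks, zext_in_reg, etc.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  if (Known.countMinLeadingZeros() >= SrcBits - NumPackedZeroBits)
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;

  // Fall back to PACKSS when the sign bit extends down to the packed width
  // (comparison results, sext_in_reg, etc.).
  if (DAG.ComputeNumSignBits(In) > SrcBits - NumPackedSignBits)
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
      return V;

  return SDValue();
}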