| field | value | date |
|---|---|---|
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-12 14:18:47 +0000 |
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-10-12 14:18:47 +0000 |
| commit | 78b5a3c3ef120e51e31a592ec98b2f0558f2f284 | |
| tree | c2bec427e9a5038ceef5680db580c4620c636110 | |
| parent | 9552dd187aadd92aeacda13ad4294be12ebe85ab | |
[X86][SSE] LowerVectorCTPOP - pull out repeated byte sum stage.
Pull out the repeated byte-sum stage for popcounts of vector elements wider than 8 bits.
This allows us to simplify the LUT/bitmath popcount code to always assume vXi8 vectors, and it also improves AVX512BITALG codegen, which only has access to vpopcntb/vpopcntw.
llvm-svn: 344348
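The identity behind the refactor is that the population count of any wider element is just the sum of the popcounts of its constituent bytes, so a single shared vXi8 CTPOP stage can feed one common per-element horizontal byte sum (performed with PSADBW for i64/i32 on x86). A minimal scalar C++ sketch of that identity — an illustration only, not code from the patch:

```cpp
#include <cstdint>
#include <cstdio>

// Popcount of one byte, the granularity the shared vXi8 CTPOP stage works at.
static int popcount8(uint8_t b) {
  int n = 0;
  for (; b; b >>= 1)
    n += b & 1;
  return n;
}

// Popcount of a 64-bit element as the sum over its eight byte lanes --
// the "repeated byte sum stage" this commit pulls out into one place.
static int popcount64_via_bytes(uint64_t v) {
  int sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += popcount8(uint8_t(v >> (8 * i)));
  return sum;
}

int main() {
  uint64_t v = 0xF00F00FF0F0F1234ULL;
  printf("%d\n", popcount64_via_bytes(v)); // matches __builtin_popcountll(v)
}
```

Hoisting the byte sum into LowerVectorCTPOP means both the LUT and the bitmath lowerings below only ever have to produce per-byte counts.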
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 81 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_ctbits.ll | 84 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 144 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-popcnt-128.ll | 112 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-popcnt-256.ll | 38 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-popcnt-512.ll | 18 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 220 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 72 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-tzcnt-512.ll | 36 |
9 files changed, 292 insertions, 513 deletions
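For readers following the LowerVectorCTPOPInRegLUT change in the diff below: the SSSE3 lowering computes per-byte popcounts by using PSHUFB as a 16-entry table lookup on each byte's low and high nibble. A scalar C++ model of that technique — a sketch under the patch's vXi8 assumption, not the DAG-building code itself:

```cpp
#include <cstdint>

// Popcount of each nibble value 0x0..0xF -- the same table the lowering
// materializes in a vector register as the PSHUFB source.
static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                1, 2, 2, 3, 2, 3, 3, 4};

// Per-byte popcount of a 16-byte vector, lane by lane. On x86 the loop body
// maps to a PSRLW+PAND (high nibble), a PAND (low nibble), two PSHUFBs (the
// table lookups) and a PADDB (the final add), processing all lanes at once.
void popcnt_v16i8(const uint8_t in[16], uint8_t out[16]) {
  for (int i = 0; i < 16; ++i) {
    uint8_t lo = in[i] & 0x0F;
    uint8_t hi = in[i] >> 4;
    out[i] = LUT[lo] + LUT[hi];
  }
}
```

With the byte sum hoisted out, this routine no longer needs to know the original element width — which is exactly what the assert added below enforces.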
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 15bd238833d..d2971d0f861 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25023,7 +25023,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  unsigned VecSize = VT.getSizeInBits();
+  int NumElts = VT.getVectorNumElements();
+  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
 
   // Implement a lookup table in register by using an algorithm based on:
   // http://wm.ite.pl/articles/sse-popcount.html
@@ -25035,56 +25036,37 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   // masked out higher ones) for each byte. PSHUFB is used separately with both
   // to index the in-register table. Next, both are added and the result is a
   // i8 vector where each element contains the pop count for input byte.
-  //
-  // To obtain the pop count for elements != i8, we follow up with the same
-  // approach and use additional tricks as described below.
-  //
   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
 
-  int NumByteElts = VecSize / 8;
-  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
-  SDValue In = DAG.getBitcast(ByteVecVT, Op);
   SmallVector<SDValue, 64> LUTVec;
-  for (int i = 0; i < NumByteElts; ++i)
+  for (int i = 0; i < NumElts; ++i)
     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
-  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
-  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
+  SDValue M0F = DAG.getConstant(0x0F, DL, VT);
 
   // High nibbles
-  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
-  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+  SDValue FourV = DAG.getConstant(4, DL, VT);
+  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
 
   // Low nibbles
-  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
 
   // The input vector is used as the shuffle mask that index elements into the
   // LUT. After counting low and high nibbles, add the vector to obtain the
   // final pop count per i8 element.
-  SDValue HighPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
-  SDValue LowPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
-  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
-
-  if (EltVT == MVT::i8)
-    return PopCnt;
-
-  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
+  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
+  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
 }
 
 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  assert(VT.is128BitVector() &&
-         "Only 128-bit vector bitmath lowering supported.");
-
-  int VecSize = VT.getSizeInBits();
-  MVT EltVT = VT.getVectorElementType();
-  int Len = EltVT.getSizeInBits();
+  assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
 
   // This is the vectorized version of the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
@@ -25108,36 +25090,27 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
   // x86, so set the SRL type to have elements at least i16 wide. This is
   // correct because all of our SRLs are followed immediately by a mask anyways
   // that handles any bits that sneak into the high bits of the byte elements.
-  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
-
+  MVT SrlVT = MVT::v8i16;
   SDValue V = Op;
 
   // v = v - ((v >> 1) & 0x55555555...)
   SDValue Srl =
       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
-  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+  SDValue And = GetMask(Srl, APInt(8, 0x55));
   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
 
   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+  SDValue AndLHS = GetMask(V, APInt(8, 0x33));
   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
-  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+  SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
 
   // v = (v + (v >> 4)) & 0x0F0F0F0F...
   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
-  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+  V = GetMask(Add, APInt(8, 0x0F));
 
-  // At this point, V contains the byte-wise population count, and we are
-  // merely doing a horizontal sum if necessary to get the wider element
-  // counts.
-  if (EltVT == MVT::i8)
-    return V;
-
-  return LowerHorizontalByteSum(
-      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
-      DAG);
+  return V;
 }
 
 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
@@ -25163,12 +25136,6 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  if (!Subtarget.hasSSSE3()) {
-    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
-    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
-    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
-  }
-
   // Decompose 256-bit ops into smaller 128-bit ops.
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return Lower256IntUnary(Op, DAG);
 
@@ -25177,6 +25144,18 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is512BitVector() && !Subtarget.hasBWI())
     return Lower512IntUnary(Op, DAG);
 
+  // For element types greater than i8, do vXi8 pop counts and a bytesum.
+  if (VT.getScalarType() != MVT::i8) {
+    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
+    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
+    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
+  }
+
+  // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+  if (!Subtarget.hasSSSE3())
+    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
diff --git a/llvm/test/CodeGen/X86/vec_ctbits.ll b/llvm/test/CodeGen/X86/vec_ctbits.ll
index 781c61b5789..978a40cbb26 100644
--- a/llvm/test/CodeGen/X86/vec_ctbits.ll
+++ b/llvm/test/CodeGen/X86/vec_ctbits.ll
@@ -15,18 +15,18 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm3
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm3, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
+; CHECK-NEXT:    psrlw $2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
+; CHECK-NEXT:    paddb %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -58,18 +58,18 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm1
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm1
+; CHECK-NEXT:    psrlw $2, %xmm1
 ; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    paddq %xmm2, %xmm1
+; CHECK-NEXT:    paddb %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $4, %xmm2
-; CHECK-NEXT:    paddq %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $4, %xmm2
+; CHECK-NEXT:    paddb %xmm1, %xmm2
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm2, %xmm0
@@ -83,18 +83,18 @@ define <2 x i64> @foopop(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: foopop:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-NEXT:    pand %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm0, %xmm1
@@ -119,18 +119,18 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
 ; CHECK-NEXT:    paddq %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm3
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm3, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
+; CHECK-NEXT:    psrlw $2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
+; CHECK-NEXT:    paddb %xmm2, %xmm3
 ; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -164,18 +164,18 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm2
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddq %xmm3, %xmm2
+; CHECK-NEXT:    paddb %xmm3, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    psubq {{.*}}(%rip), %xmm0
@@ -191,18 +191,18 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    pxor %xmm2, %xmm2
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-NEXT:    pand %xmm1, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    psadbw %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
diff --git
a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index dc945c84b19..34ea33d576c 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlq $4, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 @@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlq $4, %xmm2 -; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 @@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlq $4, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 @@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = 
[3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlq $4, %xmm2 -; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 @@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE3-NEXT: pxor %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: 
pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE3-NEXT: pxor %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 @@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ 
-1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index df42ebf2728..16539f1b2d4 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm0, %xmm1 @@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-LABEL: testv2i64: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubq %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddq %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $4, %xmm1 -; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm0, %xmm1 @@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; 
BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq @@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-LABEL: testv4i32: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubd %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $4, %xmm1 -; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: movdqa %xmm1, %xmm2 @@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; 
BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 @@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubw %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index b2cc2f1ebed..570f59673d1 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, 
%ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq @@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 @@ -169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; BITALG-LABEL: testv8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll index df5edc13c3e..eae9e6c79bd 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; ; BITALG-LABEL: testv8i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq @@ -122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; BITALG-LABEL: testv16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index d19c10d68bc..a532794f89d 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -25,18 +25,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -50,18 +50,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddq %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: 
psrlq $4, %xmm0 -; SSE3-NEXT: paddq %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -155,15 +155,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64: @@ -173,14 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; @@ -217,18 +204,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -242,18 +229,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddq %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $4, 
%xmm0 -; SSE3-NEXT: paddq %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -386,15 +373,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64u: @@ -404,14 +385,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; @@ -448,18 +422,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -478,18 +452,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddd %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrld $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddd %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: 
movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -667,19 +641,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i32: @@ -689,14 +657,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -742,18 +703,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -772,18 +733,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddd %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand 
{{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrld $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddd %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -938,19 +899,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32u:
@@ -960,14 +915,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1014,16 +962,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1041,16 +989,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
@@ -1210,16 +1158,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1237,16 +1185,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
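The SSE2/SSE3 deltas in vector-tzcnt-128.ll are the familiar bit-math popcount retargeted from i16/i32 lanes to i8 lanes: the [13107,...] (0x3333) word splats become [51,...] (0x33) byte splats, psubw/paddw become psubb/paddb, and the psrlw shifts remain usable because the byte masks clear any bits carried across byte boundaries. As a reference point only, a minimal scalar sketch of the per-byte stage (popcnt8 is an illustrative name, not from the patch):

#include <cstdint>

// Byte-wise bit-math popcount; each step mirrors one vector stage above:
// subtract the 0x55-masked shift (psrlw $1 / pand / psubb), add the
// 0x33-masked halves (psrlw $2 / pand / paddb), then fold and mask with
// 0x0F (psrlw $4 / paddb / pand).
static uint8_t popcnt8(uint8_t V) {
  V = V - ((V >> 1) & 0x55);
  V = (V & 0x33) + ((V >> 2) & 0x33);
  V = (V + (V >> 4)) & 0x0F;
  return V;
}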
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
index b1173fa4b88..cae0a2d605a 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -124,14 +124,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -142,14 +135,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
@@ -270,14 +256,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -288,14 +267,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
@@ -452,14 +424,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -474,14 +439,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -623,14 +581,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -645,14 +596,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
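In vector-tzcnt-256.ll the whole in-register LUT collapses to one instruction: plain BITALG runs get vpopcntb %ymm0 directly, while BITALG_NOVLX runs widen to vpopcntb %zmm0 because without AVX512VL only the 512-bit form is available. The byte-sum tail is untouched: vpsadbw against a zeroed register adds the eight byte counts of each 64-bit lane, and for i32 elements vpunpckhdq/vpunpckldq first interleave with zero so each i32's counts occupy their own 64-bit lane. A scalar model of that tail, assuming the per-byte counts are already in place (SumByteCounts is an illustrative name, not from the patch):

#include <cstdint>

// Model of one vpsadbw lane: sum the eight byte counts of a 64-bit lane.
// Once vpopcntb has replaced every byte with its popcount, this sum equals
// the popcount of the original i64 element.
static uint64_t SumByteCounts(uint64_t PerByteCounts) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += (PerByteCounts >> (8 * I)) & 0xFF;
  return Sum;
}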
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
index 37c86f7f81a..4a9fd82593a 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -87,14 +87,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -157,14 +150,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -269,14 +255,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -347,14 +326,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
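The vector-tzcnt-512.ll hunks show the same collapse at 512 bits, where vpopcntb already operates on zmm registers. Every hunk in these files also shares the prelude visible at the top (vpand/vpandq, an all-ones register from vpcmpeqd or vpternlogd $255, then an add of -1), consistent with the standard cttz-to-ctpop lowering. A scalar sketch of that identity, using the Clang/GCC builtin for the ctpop stage:

#include <cstdint>

// cttz via ctpop: X & -X isolates the lowest set bit, subtracting 1 turns
// the trailing zeros into ones, and their popcount is the trailing-zero
// count. X == 0 yields 32, matching cttz(0) for a 32-bit element.
static unsigned Cttz32(uint32_t X) {
  return __builtin_popcount((X & -X) - 1);
}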

