author    Simon Pilgrim <llvm-dev@redking.me.uk>  2018-10-12 14:18:47 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>  2018-10-12 14:18:47 +0000
commit    78b5a3c3ef120e51e31a592ec98b2f0558f2f284 (patch)
tree      c2bec427e9a5038ceef5680db580c4620c636110 /llvm/test/CodeGen/X86
parent    9552dd187aadd92aeacda13ad4294be12ebe85ab (diff)
[X86][SSE] LowerVectorCTPOP - pull out repeated byte sum stage.
Pull out the repeated byte-sum stage for popcount of vector elements wider than 8 bits. This allows us to simplify the LUT/bit-math popcount code to always assume vXi8 vectors, and also improves AVX512BITALG codegen, which only has access to vpopcntb/vpopcntw.

llvm-svn: 344348
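For context, a minimal LLVM IR sketch of the kind of function these tests exercise (illustrative only, not copied from the test files; the function name is made up). After the per-byte bit-math stage every byte of the vector holds its own popcount, and the shared byte-sum stage — the psadbw against zero visible in the SSE output below — folds those byte counts into each wide element:

; Per-element population count on <2 x i64>; without POPCNT/BITALG support,
; SSE targets lower this to the vXi8 bit-math popcount followed by a
; psadbw byte-sum stage that accumulates the byte counts per 64-bit lane.
define <2 x i64> @ctpop_v2i64_example(<2 x i64> %in) nounwind {
  %cnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %cnt
}
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)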
Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/vec_ctbits.ll         |  84
-rw-r--r--  llvm/test/CodeGen/X86/vector-lzcnt-128.ll   | 144
-rw-r--r--  llvm/test/CodeGen/X86/vector-popcnt-128.ll  | 112
-rw-r--r--  llvm/test/CodeGen/X86/vector-popcnt-256.ll  |  38
-rw-r--r--  llvm/test/CodeGen/X86/vector-popcnt-512.ll  |  18
-rw-r--r--  llvm/test/CodeGen/X86/vector-tzcnt-128.ll   | 220
-rw-r--r--  llvm/test/CodeGen/X86/vector-tzcnt-256.ll   |  72
-rw-r--r--  llvm/test/CodeGen/X86/vector-tzcnt-512.ll   |  36
8 files changed, 262 insertions, 462 deletions
diff --git a/llvm/test/CodeGen/X86/vec_ctbits.ll b/llvm/test/CodeGen/X86/vec_ctbits.ll
index 781c61b5789..978a40cbb26 100644
--- a/llvm/test/CodeGen/X86/vec_ctbits.ll
+++ b/llvm/test/CodeGen/X86/vec_ctbits.ll
@@ -15,18 +15,18 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm3
+; CHECK-NEXT: psrlw $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddb %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -58,18 +58,18 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm1
+; CHECK-NEXT: psrlw $2, %xmm1
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: paddq %xmm2, %xmm1
+; CHECK-NEXT: paddb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: psrlq $4, %xmm2
-; CHECK-NEXT: paddq %xmm1, %xmm2
+; CHECK-NEXT: psrlw $4, %xmm2
+; CHECK-NEXT: paddb %xmm1, %xmm2
; CHECK-NEXT: pand {{.*}}(%rip), %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: psadbw %xmm2, %xmm0
@@ -83,18 +83,18 @@ define <2 x i64> @foopop(<2 x i64> %a) nounwind {
; CHECK-LABEL: foopop:
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psubq %xmm1, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: psadbw %xmm0, %xmm1
@@ -119,18 +119,18 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm3
+; CHECK-NEXT: psrlw $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddb %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -164,18 +164,18 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddq %xmm3, %xmm2
+; CHECK-NEXT: paddb %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
@@ -191,18 +191,18 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psubq %xmm1, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: pand %xmm1, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psadbw %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index dc945c84b19..34ea33d576c 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
@@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddq %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlq $4, %xmm2
-; SSE3-NEXT: paddq %xmm1, %xmm2
+; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
@@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
@@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddq %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlq $4, %xmm2
-; SSE3-NEXT: paddq %xmm1, %xmm2
+; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
@@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrld $2, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddd %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrld $2, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddd %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddw %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubw %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddw %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddw %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
@@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddw %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubw %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddw %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddw %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index df42ebf2728..16539f1b2d4 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrlq $1, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubq %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm0
+; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddq %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrlq $4, %xmm1
-; SSE3-NEXT: paddq %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
@@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
@@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
@@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-LABEL: testv4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrld $1, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubd %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
-; SSE3-NEXT: psrld $2, %xmm0
+; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrld $4, %xmm1
-; SSE3-NEXT: paddd %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
@@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
@@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
index b2cc2f1ebed..570f59673d1 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll
@@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; BITALG_NOVLX-LABEL: testv4i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
@@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; BITALG_NOVLX-LABEL: testv8i32:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
@@ -169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; BITALG-LABEL: testv8i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
index df5edc13c3e..eae9e6c79bd 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll
@@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
;
; BITALG-LABEL: testv8i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
@@ -122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
;
; BITALG-LABEL: testv16i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index d19c10d68bc..a532794f89d 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -25,18 +25,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -50,18 +50,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddq %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddq %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $4, %xmm0
-; SSE3-NEXT: paddq %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
@@ -155,15 +155,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
@@ -173,14 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
;
@@ -217,18 +204,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -242,18 +229,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddq %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddq %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $4, %xmm0
-; SSE3-NEXT: paddq %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
@@ -386,15 +373,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64u:
@@ -404,14 +385,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
;
@@ -448,18 +422,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -478,18 +452,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddd %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrld $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddd %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -667,19 +641,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
@@ -689,14 +657,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -742,18 +703,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -772,18 +733,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddd %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrld $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddd %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -938,19 +899,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32u:
@@ -960,14 +915,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1014,16 +962,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1041,16 +989,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
@@ -1210,16 +1158,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1237,16 +1185,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
index b1173fa4b88..cae0a2d605a 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -124,14 +124,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -142,14 +135,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
@@ -270,14 +256,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -288,14 +267,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
@@ -452,14 +424,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -474,14 +439,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -623,14 +581,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -645,14 +596,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
index 37c86f7f81a..4a9fd82593a 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -87,14 +87,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -157,14 +150,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -269,14 +255,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -347,14 +326,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
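For context, the BITALG CHECK lines above verify lowering of the generic cttz intrinsic; a minimal sketch of the kind of test function they exercise, based on the @llvm.cttz.v8i64 calls visible in the hunks above (the function name here is hypothetical, not one from the test file), is:

define <8 x i64> @cttz_v8i64_sketch(<8 x i64> %in) nounwind {
  ; lowered popcount of (~in & (in - 1)); with BITALG the byte-sum stage becomes a single vpopcntb
  %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
  ret <8 x i64> %out
}
declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>, i1)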