From 259eaa6e7cd021c75e36da2ea677103a6847eb38 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 19 Mar 2018 17:31:41 +0000
Subject: [X86] Remove sse41 specific code from lowering v16i8 multiply

With the SRAs removed from the SSE2 code in D44267, there no longer
appears to be any advantage to the SSE41 code. The punpcklbw and pmovsx
instructions seem to have the same latency and throughput on most CPUs,
and the SSE41 code requires moving the upper 64 bits into the lower
64 bits before the sign extend can be done. The punpckhbw in the SSE2
code avoids that extra move.

llvm-svn: 327869
---
 llvm/test/CodeGen/X86/combine-mul.ll | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 4a0d3df969d..78278409ebd 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -289,14 +289,15 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
 define <16 x i8> @PR35579(<16 x i8> %x) {
 ; SSE-LABEL: PR35579:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pmovsxbw %xmm0, %xmm1
-; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE-NEXT:    pmullw %xmm2, %xmm0
 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT:    pmovsxbw %xmm0, %xmm0
-; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    packuswb %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
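
For readers following the CHECK lines above, here is a minimal C sketch, using the
SSE2 intrinsics from emmintrin.h, of the widen/pmullw/mask/packuswb pattern the new
lowering boils down to. The function name mul_v16i8_sse2, the variable second operand,
and the small main driver are illustrative additions; the actual output above loads its
constant vector from memory and uses pmovzxbw for the low half, so treat this as a
sketch of the pattern rather than the exact instruction sequence LLVM emits:

#include <emmintrin.h> /* SSE2 intrinsics */
#include <stdio.h>

/* Byte-wise multiply of two v16i8 vectors using only SSE2: widen each half
 * of the operands to 16-bit lanes, multiply with pmullw, mask each product
 * down to its low byte, and narrow back with packuswb. */
static __m128i mul_v16i8_sse2(__m128i x, __m128i y) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i lo_byte = _mm_set1_epi16(0x00FF);

    /* punpcklbw/punpckhbw against zero: zero-extend bytes to words.
     * Sign vs. zero extension does not matter here because only the low
     * 8 bits of each 16-bit product are kept. */
    __m128i x_lo = _mm_unpacklo_epi8(x, zero);
    __m128i x_hi = _mm_unpackhi_epi8(x, zero);
    __m128i y_lo = _mm_unpacklo_epi8(y, zero);
    __m128i y_hi = _mm_unpackhi_epi8(y, zero);

    /* pmullw + pand: 16-bit products, truncated to their low bytes. */
    __m128i lo = _mm_and_si128(_mm_mullo_epi16(x_lo, y_lo), lo_byte);
    __m128i hi = _mm_and_si128(_mm_mullo_epi16(x_hi, y_hi), lo_byte);

    /* packuswb: the masked words are already in 0..255, so the unsigned
     * saturation never triggers and this is a pure narrowing. */
    return _mm_packus_epi16(lo, hi);
}

int main(void) {
    /* Multiply by the same constant vector the PR35579 test multiplies by. */
    __m128i x = _mm_set1_epi8(3);
    __m128i c = _mm_setr_epi8(0, 1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1);
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, mul_v16i8_sse2(x, c));
    for (int i = 0; i < 16; ++i)
        printf("%u ", out[i]);
    printf("\n");
    return 0;
}

Masking both halves with 255 before packuswb matters: packuswb saturates unsigned,
so without the pand the high bits of each 16-bit product could clamp the result to
255 instead of truncating, which is why the generated code keeps the pand of each
half before the final pack.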