llvm/test/CodeGen/X86/slow-pmulld.ll | 752
1 file changed, 505 insertions(+), 247 deletions(-)
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 1dec2c8b227..fd4e7c831c0 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
@@ -59,31 +61,57 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
 }

 define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
-; CHECK32-LABEL: test_mul_v8i32_v8i8:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa %xmm0, %xmm1
-; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm1
-; CHECK32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK32-NEXT: movdqa %xmm1, %xmm2
-; CHECK32-NEXT: pmullw %xmm0, %xmm1
-; CHECK32-NEXT: pmulhw %xmm0, %xmm2
-; CHECK32-NEXT: movdqa %xmm1, %xmm0
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v8i32_v8i8:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa %xmm0, %xmm1
-; CHECK64-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK64-NEXT: movdqa %xmm1, %xmm2
-; CHECK64-NEXT: pmullw %xmm0, %xmm1
-; CHECK64-NEXT: pmulhw %xmm0, %xmm2
-; CHECK64-NEXT: movdqa %xmm1, %xmm0
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v8i32_v8i8:
+; SLM32: # %bb.0:
+; SLM32-NEXT: movdqa %xmm0, %xmm1
+; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm1
+; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm1, %xmm2
+; SLM32-NEXT: pmullw %xmm0, %xmm1
+; SLM32-NEXT: pmulhw %xmm0, %xmm2
+; SLM32-NEXT: movdqa %xmm1, %xmm0
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v8i32_v8i8:
+; SLM64: # %bb.0:
+; SLM64-NEXT: movdqa %xmm0, %xmm1
+; SLM64-NEXT: pand {{.*}}(%rip), %xmm1
+; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm1, %xmm2
+; SLM64-NEXT: pmullw %xmm0, %xmm1
+; SLM64-NEXT: pmulhw %xmm0, %xmm2
+; SLM64-NEXT: movdqa %xmm1, %xmm0
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v8i32_v8i8:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: movdqa %xmm0, %xmm1
+; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1
+; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm1, %xmm2
+; SLOW32-NEXT: pmulhw %xmm0, %xmm2
+; SLOW32-NEXT: pmullw %xmm0, %xmm1
+; SLOW32-NEXT: movdqa %xmm1, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v8i32_v8i8:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: movdqa %xmm0, %xmm1
+; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1
+; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm1, %xmm2
+; SLOW64-NEXT: pmulhw %xmm0, %xmm2
+; SLOW64-NEXT: pmullw %xmm0, %xmm1
+; SLOW64-NEXT: movdqa %xmm1, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v8i32_v8i8:
 ; SSE4-32: # %bb.0:
@@ -128,45 +156,87 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
 }

 define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
-; CHECK32-LABEL: test_mul_v16i32_v16i8:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK32-NEXT: movdqa %xmm1, %xmm4
-; CHECK32-NEXT: movdqa %xmm3, %xmm5
-; CHECK32-NEXT: pmullw %xmm2, %xmm1
-; CHECK32-NEXT: pmullw %xmm2, %xmm3
-; CHECK32-NEXT: pmulhw %xmm2, %xmm4
-; CHECK32-NEXT: pmulhw %xmm2, %xmm5
-; CHECK32-NEXT: movdqa %xmm1, %xmm0
-; CHECK32-NEXT: movdqa %xmm3, %xmm2
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v16i32_v16i8:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK64-NEXT: movdqa %xmm1, %xmm4
-; CHECK64-NEXT: movdqa %xmm3, %xmm5
-; CHECK64-NEXT: pmullw %xmm2, %xmm1
-; CHECK64-NEXT: pmullw %xmm2, %xmm3
-; CHECK64-NEXT: pmulhw %xmm2, %xmm4
-; CHECK64-NEXT: pmulhw %xmm2, %xmm5
-; CHECK64-NEXT: movdqa %xmm1, %xmm0
-; CHECK64-NEXT: movdqa %xmm3, %xmm2
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v16i32_v16i8:
+; SLM32: # %bb.0:
+; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM32-NEXT: movdqa %xmm1, %xmm4
+; SLM32-NEXT: movdqa %xmm3, %xmm5
+; SLM32-NEXT: pmullw %xmm2, %xmm1
+; SLM32-NEXT: pmullw %xmm2, %xmm3
+; SLM32-NEXT: pmulhw %xmm2, %xmm4
+; SLM32-NEXT: pmulhw %xmm2, %xmm5
+; SLM32-NEXT: movdqa %xmm1, %xmm0
+; SLM32-NEXT: movdqa %xmm3, %xmm2
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v16i32_v16i8:
+; SLM64: # %bb.0:
+; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLM64-NEXT: movdqa %xmm1, %xmm4
+; SLM64-NEXT: movdqa %xmm3, %xmm5
+; SLM64-NEXT: pmullw %xmm2, %xmm1
+; SLM64-NEXT: pmullw %xmm2, %xmm3
+; SLM64-NEXT: pmulhw %xmm2, %xmm4
+; SLM64-NEXT: pmulhw %xmm2, %xmm5
+; SLM64-NEXT: movdqa %xmm1, %xmm0
+; SLM64-NEXT: movdqa %xmm3, %xmm2
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v16i32_v16i8:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm1, %xmm3
+; SLOW32-NEXT: pmulhw %xmm2, %xmm3
+; SLOW32-NEXT: pmullw %xmm2, %xmm1
+; SLOW32-NEXT: movdqa %xmm1, %xmm4
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW32-NEXT: movdqa %xmm3, %xmm0
+; SLOW32-NEXT: pmulhw %xmm2, %xmm0
+; SLOW32-NEXT: pmullw %xmm2, %xmm3
+; SLOW32-NEXT: movdqa %xmm3, %xmm2
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SLOW32-NEXT: movdqa %xmm4, %xmm0
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v16i32_v16i8:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm1, %xmm3
+; SLOW64-NEXT: pmulhw %xmm2, %xmm3
+; SLOW64-NEXT: pmullw %xmm2, %xmm1
+; SLOW64-NEXT: movdqa %xmm1, %xmm4
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SLOW64-NEXT: movdqa %xmm3, %xmm0
+; SLOW64-NEXT: pmulhw %xmm2, %xmm0
+; SLOW64-NEXT: pmullw %xmm2, %xmm3
+; SLOW64-NEXT: movdqa %xmm3, %xmm2
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SLOW64-NEXT: movdqa %xmm4, %xmm0
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i8:
 ; SSE4-32: # %bb.0:
@@ -237,25 +307,45 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
 }

 define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
-; CHECK32-LABEL: test_mul_v4i32_v4i16:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK32-NEXT: movdqa %xmm0, %xmm2
-; CHECK32-NEXT: pmullw %xmm1, %xmm0
-; CHECK32-NEXT: pmulhuw %xmm1, %xmm2
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v4i32_v4i16:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK64-NEXT: movdqa %xmm0, %xmm2
-; CHECK64-NEXT: pmullw %xmm1, %xmm0
-; CHECK64-NEXT: pmulhuw %xmm1, %xmm2
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v4i32_v4i16:
+; SLM32: # %bb.0:
+; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLM32-NEXT: movdqa %xmm0, %xmm2
+; SLM32-NEXT: pmullw %xmm1, %xmm0
+; SLM32-NEXT: pmulhuw %xmm1, %xmm2
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v4i32_v4i16:
+; SLM64: # %bb.0:
+; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLM64-NEXT: movdqa %xmm0, %xmm2
+; SLM64-NEXT: pmullw %xmm1, %xmm0
+; SLM64-NEXT: pmulhuw %xmm1, %xmm2
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v4i32_v4i16:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLOW32-NEXT: movdqa %xmm0, %xmm2
+; SLOW32-NEXT: pmulhuw %xmm1, %xmm2
+; SLOW32-NEXT: pmullw %xmm1, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v4i32_v4i16:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; SLOW64-NEXT: movdqa %xmm0, %xmm2
+; SLOW64-NEXT: pmulhuw %xmm1, %xmm2
+; SLOW64-NEXT: pmullw %xmm1, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v4i32_v4i16:
 ; SSE4-32: # %bb.0:
@@ -292,29 +382,53 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
 }

 define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
-; CHECK32-LABEL: test_mul_v8i32_v8i16:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa %xmm0, %xmm1
-; CHECK32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK32-NEXT: movdqa %xmm1, %xmm2
-; CHECK32-NEXT: pmullw %xmm0, %xmm1
-; CHECK32-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK32-NEXT: movdqa %xmm1, %xmm0
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v8i32_v8i16:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa %xmm0, %xmm1
-; CHECK64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK64-NEXT: movdqa %xmm1, %xmm2
-; CHECK64-NEXT: pmullw %xmm0, %xmm1
-; CHECK64-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK64-NEXT: movdqa %xmm1, %xmm0
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v8i32_v8i16:
+; SLM32: # %bb.0:
+; SLM32-NEXT: movdqa %xmm0, %xmm1
+; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm1, %xmm2
+; SLM32-NEXT: pmullw %xmm0, %xmm1
+; SLM32-NEXT: pmulhuw %xmm0, %xmm2
+; SLM32-NEXT: movdqa %xmm1, %xmm0
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v8i32_v8i16:
+; SLM64: # %bb.0:
+; SLM64-NEXT: movdqa %xmm0, %xmm1
+; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm1, %xmm2
+; SLM64-NEXT: pmullw %xmm0, %xmm1
+; SLM64-NEXT: pmulhuw %xmm0, %xmm2
+; SLM64-NEXT: movdqa %xmm1, %xmm0
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v8i32_v8i16:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: movdqa %xmm0, %xmm1
+; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm1, %xmm2
+; SLOW32-NEXT: pmulhuw %xmm0, %xmm2
+; SLOW32-NEXT: pmullw %xmm0, %xmm1
+; SLOW32-NEXT: movdqa %xmm1, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v8i32_v8i16:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: movdqa %xmm0, %xmm1
+; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm1, %xmm2
+; SLOW64-NEXT: pmulhuw %xmm0, %xmm2
+; SLOW64-NEXT: pmullw %xmm0, %xmm1
+; SLOW64-NEXT: movdqa %xmm1, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v8i32_v8i16:
 ; SSE4-32: # %bb.0:
@@ -355,43 +469,81 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
 }

 define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
-; CHECK32-LABEL: test_mul_v16i32_v16i16:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa %xmm1, %xmm3
-; CHECK32-NEXT: movdqa %xmm0, %xmm1
-; CHECK32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK32-NEXT: movdqa %xmm1, %xmm2
-; CHECK32-NEXT: movdqa %xmm3, %xmm4
-; CHECK32-NEXT: pmullw %xmm0, %xmm1
-; CHECK32-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK32-NEXT: pmullw %xmm0, %xmm3
-; CHECK32-NEXT: pmulhuw %xmm0, %xmm4
-; CHECK32-NEXT: movdqa %xmm1, %xmm0
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK32-NEXT: movdqa %xmm3, %xmm2
-; CHECK32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v16i32_v16i16:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa %xmm1, %xmm3
-; CHECK64-NEXT: movdqa %xmm0, %xmm1
-; CHECK64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; CHECK64-NEXT: movdqa %xmm1, %xmm2
-; CHECK64-NEXT: movdqa %xmm3, %xmm4
-; CHECK64-NEXT: pmullw %xmm0, %xmm1
-; CHECK64-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK64-NEXT: pmullw %xmm0, %xmm3
-; CHECK64-NEXT: pmulhuw %xmm0, %xmm4
-; CHECK64-NEXT: movdqa %xmm1, %xmm0
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK64-NEXT: movdqa %xmm3, %xmm2
-; CHECK64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v16i32_v16i16:
+; SLM32: # %bb.0:
+; SLM32-NEXT: movdqa %xmm1, %xmm3
+; SLM32-NEXT: movdqa %xmm0, %xmm1
+; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm1, %xmm2
+; SLM32-NEXT: movdqa %xmm3, %xmm4
+; SLM32-NEXT: pmullw %xmm0, %xmm1
+; SLM32-NEXT: pmulhuw %xmm0, %xmm2
+; SLM32-NEXT: pmullw %xmm0, %xmm3
+; SLM32-NEXT: pmulhuw %xmm0, %xmm4
+; SLM32-NEXT: movdqa %xmm1, %xmm0
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: movdqa %xmm3, %xmm2
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v16i32_v16i16:
+; SLM64: # %bb.0:
+; SLM64-NEXT: movdqa %xmm1, %xmm3
+; SLM64-NEXT: movdqa %xmm0, %xmm1
+; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm1, %xmm2
+; SLM64-NEXT: movdqa %xmm3, %xmm4
+; SLM64-NEXT: pmullw %xmm0, %xmm1
+; SLM64-NEXT: pmulhuw %xmm0, %xmm2
+; SLM64-NEXT: pmullw %xmm0, %xmm3
+; SLM64-NEXT: pmulhuw %xmm0, %xmm4
+; SLM64-NEXT: movdqa %xmm1, %xmm0
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: movdqa %xmm3, %xmm2
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v16i32_v16i16:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: movdqa %xmm1, %xmm3
+; SLOW32-NEXT: movdqa %xmm0, %xmm1
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm1, %xmm4
+; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
+; SLOW32-NEXT: pmullw %xmm2, %xmm1
+; SLOW32-NEXT: movdqa %xmm1, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SLOW32-NEXT: movdqa %xmm3, %xmm4
+; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
+; SLOW32-NEXT: pmullw %xmm2, %xmm3
+; SLOW32-NEXT: movdqa %xmm3, %xmm2
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v16i32_v16i16:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: movdqa %xmm1, %xmm3
+; SLOW64-NEXT: movdqa %xmm0, %xmm1
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm1, %xmm4
+; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
+; SLOW64-NEXT: pmullw %xmm2, %xmm1
+; SLOW64-NEXT: movdqa %xmm1, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SLOW64-NEXT: movdqa %xmm3, %xmm4
+; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
+; SLOW64-NEXT: pmullw %xmm2, %xmm3
+; SLOW64-NEXT: movdqa %xmm3, %xmm2
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i16:
 ; SSE4-32: # %bb.0:
@@ -509,27 +661,49 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
 }

 define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
-; CHECK32-LABEL: test_mul_v8i32_v8i8_minsize:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
-; CHECK32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK32-NEXT: pmulld %xmm2, %xmm0
-; CHECK32-NEXT: pmulld %xmm2, %xmm1
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v8i32_v8i8_minsize:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK64-NEXT: pmulld %xmm2, %xmm0
-; CHECK64-NEXT: pmulld %xmm2, %xmm1
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLM32: # %bb.0:
+; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT: pmulld %xmm2, %xmm0
+; SLM32-NEXT: pmulld %xmm2, %xmm1
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLM64: # %bb.0:
+; SLM64-NEXT: pand {{.*}}(%rip), %xmm0
+; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT: pmulld %xmm2, %xmm0
+; SLM64-NEXT: pmulld %xmm2, %xmm1
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW32-NEXT: pmulld %xmm2, %xmm0
+; SLOW32-NEXT: pmulld %xmm2, %xmm1
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW64-NEXT: pmulld %xmm2, %xmm0
+; SLOW64-NEXT: pmulld %xmm2, %xmm1
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
 ; SSE4-32: # %bb.0:
@@ -574,37 +748,69 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
 }

 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
-; CHECK32-LABEL: test_mul_v16i32_v16i8_minsize:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
-; CHECK32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; CHECK32-NEXT: pmulld %xmm5, %xmm0
-; CHECK32-NEXT: pmulld %xmm5, %xmm1
-; CHECK32-NEXT: pmulld %xmm5, %xmm2
-; CHECK32-NEXT: pmulld %xmm5, %xmm3
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v16i32_v16i8_minsize:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
-; CHECK64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; CHECK64-NEXT: pmulld %xmm5, %xmm0
-; CHECK64-NEXT: pmulld %xmm5, %xmm1
-; CHECK64-NEXT: pmulld %xmm5, %xmm2
-; CHECK64-NEXT: pmulld %xmm5, %xmm3
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLM32: # %bb.0:
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM32-NEXT: pmulld %xmm5, %xmm0
+; SLM32-NEXT: pmulld %xmm5, %xmm1
+; SLM32-NEXT: pmulld %xmm5, %xmm2
+; SLM32-NEXT: pmulld %xmm5, %xmm3
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLM64: # %bb.0:
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM64-NEXT: pmulld %xmm5, %xmm0
+; SLM64-NEXT: pmulld %xmm5, %xmm1
+; SLM64-NEXT: pmulld %xmm5, %xmm2
+; SLM64-NEXT: pmulld %xmm5, %xmm3
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW32-NEXT: pmulld %xmm4, %xmm0
+; SLOW32-NEXT: pmulld %xmm4, %xmm1
+; SLOW32-NEXT: pmulld %xmm4, %xmm2
+; SLOW32-NEXT: pmulld %xmm4, %xmm3
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW64-NEXT: pmulld %xmm4, %xmm0
+; SLOW64-NEXT: pmulld %xmm4, %xmm1
+; SLOW64-NEXT: pmulld %xmm4, %xmm2
+; SLOW64-NEXT: pmulld %xmm4, %xmm3
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
 ; SSE4-32: # %bb.0:
@@ -724,25 +930,45 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
 }

 define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
-; CHECK32-LABEL: test_mul_v8i32_v8i16_minsize:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; CHECK32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK32-NEXT: pmulld %xmm2, %xmm0
-; CHECK32-NEXT: pmulld %xmm2, %xmm1
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v8i32_v8i16_minsize:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; CHECK64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK64-NEXT: pmulld %xmm2, %xmm0
-; CHECK64-NEXT: pmulld %xmm2, %xmm1
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLM32: # %bb.0:
+; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT: pmulld %xmm2, %xmm0
+; SLM32-NEXT: pmulld %xmm2, %xmm1
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLM64: # %bb.0:
+; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT: pmulld %xmm2, %xmm0
+; SLM64-NEXT: pmulld %xmm2, %xmm1
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW32-NEXT: pmulld %xmm2, %xmm0
+; SLOW32-NEXT: pmulld %xmm2, %xmm1
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW64-NEXT: pmulld %xmm2, %xmm0
+; SLOW64-NEXT: pmulld %xmm2, %xmm1
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
 ; SSE4-32: # %bb.0:
@@ -783,37 +1009,69 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
 }

 define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
-; CHECK32-LABEL: test_mul_v16i32_v16i16_minsize:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; CHECK32-NEXT: pmulld %xmm1, %xmm4
-; CHECK32-NEXT: pmulld %xmm1, %xmm0
-; CHECK32-NEXT: pmulld %xmm1, %xmm2
-; CHECK32-NEXT: pmulld %xmm1, %xmm3
-; CHECK32-NEXT: movdqa %xmm4, %xmm1
-; CHECK32-NEXT: retl
-;
-; CHECK64-LABEL: test_mul_v16i32_v16i16_minsize:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; CHECK64-NEXT: pmulld %xmm1, %xmm4
-; CHECK64-NEXT: pmulld %xmm1, %xmm0
-; CHECK64-NEXT: pmulld %xmm1, %xmm2
-; CHECK64-NEXT: pmulld %xmm1, %xmm3
-; CHECK64-NEXT: movdqa %xmm4, %xmm1
-; CHECK64-NEXT: retq
+; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLM32: # %bb.0:
+; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLM32-NEXT: pmulld %xmm1, %xmm4
+; SLM32-NEXT: pmulld %xmm1, %xmm0
+; SLM32-NEXT: pmulld %xmm1, %xmm2
+; SLM32-NEXT: pmulld %xmm1, %xmm3
+; SLM32-NEXT: movdqa %xmm4, %xmm1
+; SLM32-NEXT: retl
+;
+; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLM64: # %bb.0:
+; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLM64-NEXT: pmulld %xmm1, %xmm4
+; SLM64-NEXT: pmulld %xmm1, %xmm0
+; SLM64-NEXT: pmulld %xmm1, %xmm2
+; SLM64-NEXT: pmulld %xmm1, %xmm3
+; SLM64-NEXT: movdqa %xmm4, %xmm1
+; SLM64-NEXT: retq
+;
+; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLOW32: # %bb.0:
+; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLOW32-NEXT: pmulld %xmm1, %xmm0
+; SLOW32-NEXT: pmulld %xmm1, %xmm2
+; SLOW32-NEXT: pmulld %xmm1, %xmm4
+; SLOW32-NEXT: pmulld %xmm1, %xmm3
+; SLOW32-NEXT: movdqa %xmm4, %xmm1
+; SLOW32-NEXT: retl
+;
+; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLOW64: # %bb.0:
+; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLOW64-NEXT: pmulld %xmm1, %xmm0
+; SLOW64-NEXT: pmulld %xmm1, %xmm2
+; SLOW64-NEXT: pmulld %xmm1, %xmm4
+; SLOW64-NEXT: pmulld %xmm1, %xmm3
+; SLOW64-NEXT: movdqa %xmm4, %xmm1
+; SLOW64-NEXT: retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
 ; SSE4-32: # %bb.0:
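The two new RUN lines exercise the slow-pmulld lowering via an explicit target feature (-mattr=+sse4.2,+slow-pmulld) rather than implicitly through -mcpu=silvermont. The IR bodies of the tests fall outside the visible hunks; judging from the define signatures and the 18778 splat in the CHECK lines, each test multiplies a zero-extended narrow vector by a splat constant that fits in 16 bits. A minimal sketch of that shape, assuming the usual zext-plus-mul structure (the function name here is illustrative, not taken from the test file):

define <8 x i32> @mul_splat_sketch(<8 x i16> %A) {
  ; Zero-extend the i16 lanes, then multiply by a splat that fits in 16 bits.
  ; On a slow-pmulld target this can be lowered as pmullw/pmulhuw plus
  ; punpcklwd/punpckhwd interleaves instead of two pmulld ops.
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

Feeding such IR through one of the new configurations, e.g. llc < slow-pmulld.ll -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld, should reproduce the SLOW64 sequences above: the widening multiply is split into pmullw plus pmulhuw (pmulhw in the v8i8/v16i8 cases) followed by word interleaves, while the minsize variants keep the shorter pmulld encoding even on slow-pmulld targets.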

