| author | Zvi Rackover <zvi.rackover@intel.com> | 2018-01-07 20:21:10 +0000 |
|---|---|---|
| committer | Zvi Rackover <zvi.rackover@intel.com> | 2018-01-07 20:21:10 +0000 |
| commit | 93b8bd49550750cb88f7a65f93674a2516ea51cf (patch) | |
| tree | 59189a7941ff37d5e7bc6927b3a657453e73c834 | |
| parent | 998180dad3e007f7ad53836fbc4c4b42f39140d4 (diff) | |
| download | bcm5719-llvm-93b8bd49550750cb88f7a65f93674a2516ea51cf.tar.gz bcm5719-llvm-93b8bd49550750cb88f7a65f93674a2516ea51cf.zip | |
X86 Tests: Add Tests for PMADDWD selection. NFC.
Support for ISel to be added.
llvm-svn: 321970
| -rw-r--r-- | llvm/test/CodeGen/X86/madd.ll | 472 |
1 file changed, 373 insertions, 99 deletions
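The new tests added below all exercise the same IR shape: sign-extend two `i16` vectors to `i32`, multiply, split the products into odd and even lanes, and add the pairs, which matches what the x86 `pmaddwd` instruction computes. A minimal sketch of that shape, mirroring the `pmaddwd_8` test in the diff (the function name here is illustrative):

```llvm
; Pattern the planned PMADDWD selection is meant to recognize:
; sext the i16 lanes, multiply as i32, then add odd/even product pairs.
define <4 x i32> @pmaddwd_shape(<8 x i16> %A, <8 x i16> %B) {
  %a = sext <8 x i16> %A to <8 x i32>
  %b = sext <8 x i16> %B to <8 x i32>
  %m = mul nsw <8 x i32> %a, %b
  ; deinterleave the 32-bit products into the two pairwise operands ...
  %odd  = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; ... and add them; the whole chain corresponds to a single pmaddwd.
  %ret = add <4 x i32> %odd, %even
  ret <4 x i32> %ret
}
```

Until the ISel support lands, the AVX CHECK lines below show this pattern lowered as vpmovsxwd + vpmulld + vphaddd rather than a single vpmaddwd.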
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 44e7b91eef8..2e9237a0ebf 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW

 define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
 ; SSE2-LABEL: _Z10test_shortPsS_i:
@@ -30,53 +30,29 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
 ; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: retq
 ;
-; AVX2-LABEL: _Z10test_shortPsS_i:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: .p2align 4, 0x90
-; AVX2-NEXT: .LBB0_1: # %vector.body
-; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
-; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: addq $8, %rcx
-; AVX2-NEXT: cmpq %rcx, %rax
-; AVX2-NEXT: jne .LBB0_1
-; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: _Z10test_shortPsS_i:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movl %edx, %eax
-; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: .p2align 4, 0x90
-; AVX512-NEXT: .LBB0_1: # %vector.body
-; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
-; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: addq $8, %rcx
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: jne .LBB0_1
-; AVX512-NEXT: # %bb.2: # %middle.block
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: _Z10test_shortPsS_i:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movl %edx, %eax
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB0_1: # %vector.body
+; AVX-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
+; AVX-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: addq $8, %rcx
+; AVX-NEXT: cmpq %rcx, %rax
+; AVX-NEXT: jne .LBB0_1
+; AVX-NEXT: # %bb.2: # %middle.block
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 entry:
 %3 = zext i32 %2 to i64
 br label %vector.body
@@ -141,55 +117,30 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: retq
 ;
-; AVX2-LABEL: test_unsigned_short:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: .p2align 4, 0x90
-; AVX2-NEXT: .LBB1_1: # %vector.body
-; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: addq $8, %rcx
-; AVX2-NEXT: cmpq %rcx, %rax
-; AVX2-NEXT: jne .LBB1_1
-; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_unsigned_short:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movl %edx, %eax
-; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: .p2align 4, 0x90
-; AVX512-NEXT: .LBB1_1: # %vector.body
-; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: addq $8, %rcx
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: jne .LBB1_1
-; AVX512-NEXT: # %bb.2: # %middle.block
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: test_unsigned_short:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: movl %edx, %eax
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: .p2align 4, 0x90
+; AVX-NEXT: .LBB1_1: # %vector.body
+; AVX-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: addq $8, %rcx
+; AVX-NEXT: cmpq %rcx, %rax
+; AVX-NEXT: jne .LBB1_1
+; AVX-NEXT: # %bb.2: # %middle.block
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
 entry:
 %3 = zext i32 %2 to i64
 br label %vector.body
@@ -361,3 +312,326 @@ middle.block:
 %13 = extractelement <16 x i32> %bin.rdx20, i32 0
 ret i32 %13
 }
+
+define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
+; SSE2-LABEL: pmaddwd_8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmulhw %xmm1, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: pmaddwd_8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %a = sext <8 x i16> %A to <8 x i32>
+ %b = sext <8 x i16> %B to <8 x i32>
+ %m = mul nsw <8 x i32> %a, %b
+ %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %ret = add <4 x i32> %odd, %even
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
+; SSE2-LABEL: pmaddwd_16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pmulhw %xmm2, %xmm4
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pmulhw %xmm3, %xmm4
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: pmaddwd_16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
+; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pmaddwd_16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+ %a = sext <16 x i16> %A to <16 x i32>
+ %b = sext <16 x i16> %B to <16 x i32>
+ %m = mul nsw <16 x i32> %a, %b
+ %odd = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %even = shufflevector <16 x i32> %m, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %ret = add <8 x i32> %odd, %even
+ ret <8 x i32> %ret
+}
+
+define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
+; SSE2-LABEL: pmaddwd_32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: pmulhw %xmm4, %xmm8
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pmulhw %xmm5, %xmm4
+; SSE2-NEXT: pmullw %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pmulhw %xmm6, %xmm4
+; SSE2-NEXT: pmullw %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pmulhw %xmm7, %xmm4
+; SSE2-NEXT: pmullw %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[0,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2]
+; SSE2-NEXT: movdqa %xmm0, %xmm11
+; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3]
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3]
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3]
+; SSE2-NEXT: paddd %xmm11, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: pmaddwd_32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm3, %ymm6
+; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxwd %xmm2, %ymm3
+; AVX2-NEXT: vpmulld %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[0,2],ymm4[4,6],ymm1[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm0[0,2],ymm3[4,6],ymm0[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,3],ymm1[1,3],ymm4[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,3],ymm0[1,3],ymm3[5,7],ymm0[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: pmaddwd_32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm2
+; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
+; AVX512F-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: pmaddwd_32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3
+; AVX512BW-NEXT: vpmulld %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512BW-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
+; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3
+; AVX512BW-NEXT: vpaddd %zmm3, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %a = sext <32 x i16> %A to <32 x i32>
+ %b = sext <32 x i16> %B to <32 x i32>
+ %m = mul nsw <32 x i32> %a, %b
+ %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %ret = add <16 x i32> %odd, %even
+ ret <16 x i32> %ret
+}
+
+define <4 x i32> @pmaddwd_const(<8 x i16> %A) {
+; SSE2-LABEL: pmaddwd_const:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32768,0,0,1,7,42,32]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmulhw %xmm1, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: pmaddwd_const:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %a = sext <8 x i16> %A to <8 x i32>
+ %m = mul nsw <8 x i32> %a, <i32 32767, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
+ %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %ret = add <4 x i32> %odd, %even
+ ret <4 x i32> %ret
+}
+
+; Check that there is not selection for unsigned multiplication
+define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
+; SSE2-LABEL: pmaddwd_negative1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmulhuw %xmm1, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: pmaddwd_negative1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %a = zext <8 x i16> %A to <8 x i32>
+ %b = zext <8 x i16> %B to <8 x i32>
+ %m = mul nuw <8 x i32> %a, %b
+ %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %ret = add <4 x i32> %odd, %even
+ ret <4 x i32> %ret
+}
+
+; Check that there is not selection for out-of-bounds constant
+define <4 x i32> @pmaddwd_negative2(<8 x i16> %A) {
+; SSE2-LABEL: pmaddwd_negative2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: pmaddwd_negative2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %a = sext <8 x i16> %A to <8 x i32>
+ %m = mul nsw <8 x i32> %a, <i32 32768, i32 -32768, i32 0, i32 0, i32 1, i32 7, i32 42, i32 32>
+ %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %even = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %ret = add <4 x i32> %odd, %even
+ ret <4 x i32> %ret
+}
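For reference, the assertions in this file are autogenerated (per the NOTE line at the top of the test), and each RUN line can be replayed by hand. A sketch of both, assuming an llvm-project checkout with llc and FileCheck from a local build on PATH; exact script options may differ between LLVM revisions:

```sh
# Replay one of the test's RUN lines manually (flags copied from the file header):
llc < llvm/test/CodeGen/X86/madd.ll -mtriple=x86_64-unknown-unknown -mattr=+avx2 \
  | FileCheck llvm/test/CodeGen/X86/madd.ll --check-prefixes=AVX,AVX2

# Regenerate the autogenerated CHECK lines after an ISel change, using the script
# referenced by the NOTE line:
llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/madd.ll
```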

