Diffstat (limited to 'llvm/test/CodeGen/X86/shrink_vmul.ll')
-rw-r--r-- llvm/test/CodeGen/X86/shrink_vmul.ll | 324
1 file changed, 140 insertions(+), 184 deletions(-)
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 5ceb299befc..5e952472f75 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -42,13 +42,10 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -71,13 +68,10 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X64-AVX-LABEL: mul_2xi8:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -481,11 +475,10 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -505,11 +498,10 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X64-AVX-LABEL: mul_2xi16:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -912,13 +904,10 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -943,13 +932,10 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X64-AVX-LABEL: mul_2xi8_sext:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1006,13 +992,10 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -1038,13 +1021,10 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1095,11 +1075,10 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -1119,11 +1098,10 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X64-AVX-LABEL: mul_2xi16_sext:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1160,15 +1138,14 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: pxor %xmm2, %xmm2
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
 ;
@@ -1179,11 +1156,10 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT: popl %esi
 ; X86-AVX-NEXT: retl
@@ -1194,25 +1170,23 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT: pxor %xmm2, %xmm2
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_sext_zext:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1405,8 +1379,8 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: movd %ecx, %xmm0
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
@@ -1415,10 +1389,9 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1429,18 +1402,17 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movd %ecx, %xmm0
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi8_varconst1:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1482,10 +1454,9 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1505,10 +1476,9 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi8_varconst2:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1539,8 +1509,11 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: movd %ecx, %xmm0
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X86-SSE-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
@@ -1549,10 +1522,9 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1563,18 +1535,20 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movd %ecx, %xmm0
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X64-SSE-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi8_varconst3:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1618,10 +1592,9 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1643,10 +1616,9 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi8_varconst4:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1690,10 +1662,9 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1715,10 +1686,9 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi8_varconst5:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1762,10 +1732,9 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1787,10 +1756,9 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi8_varconst6:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1831,9 +1799,9 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1852,9 +1820,9 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi16_varconst1:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1895,9 +1863,9 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1916,9 +1884,9 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-LABEL: mul_2xi16_varconst2:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -1948,12 +1916,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
@@ -1962,9 +1927,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -1974,21 +1939,18 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT: pxor %xmm1, %xmm1
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst3:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
@@ -2018,12 +1980,9 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
 ;
@@ -2032,9 +1991,9 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0
+; X86-AVX-NEXT: vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT: retl
 ;
@@ -2044,21 +2003,18 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mul_2xi16_varconst4:
 ; X64-AVX: # %bb.0: # %entry
 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT: retq
 entry:
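The IR bodies of these test functions are cut off at the "entry:" context lines above. For orientation, a minimal sketch of the shape such a function takes, reconstructed from the mul_2xi8 signature and its CHECK lines rather than copied from the file (value names and alignments are illustrative):

; Load two i8 elements each from %a and %b, widen to i32, multiply, and
; store the two i32 products (8 bytes, hence the vmovq above) into the
; buffer held by the global @c.
@c = external global i32*

define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
entry:
  %pre = load i32*, i32** @c
  %gep_a = getelementptr inbounds i8, i8* %a, i64 %index
  %vptr_a = bitcast i8* %gep_a to <2 x i8>*
  %wide_a = load <2 x i8>, <2 x i8>* %vptr_a, align 1
  %zext_a = zext <2 x i8> %wide_a to <2 x i32>
  %gep_b = getelementptr inbounds i8, i8* %b, i64 %index
  %vptr_b = bitcast i8* %gep_b to <2 x i8>*
  %wide_b = load <2 x i8>, <2 x i8>* %vptr_b, align 1
  %zext_b = zext <2 x i8> %wide_b to <2 x i32>
  %mul = mul <2 x i32> %zext_b, %zext_a
  %gep_c = getelementptr inbounds i32, i32* %pre, i64 %index
  %vptr_c = bitcast i32* %gep_c to <2 x i32>*
  store <2 x i32> %mul, <2 x i32>* %vptr_c, align 4
  ret void
}

In the new AVX codegen checked above, each i8 or i16 pair is widened straight to two i64 lanes (vpmovzxbq/vpmovzxwq, or vpmovsxbq/vpmovsxwq for the sign-extending variants), multiplied with vpmuludq, and the low 32 bits of each 64-bit product are packed back together with vpshufd before the 8-byte vmovq store.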