diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-03-26 15:44:55 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-03-26 15:44:55 +0000 |
commit | e4dbeb40c6a08abb5486c25ab0b31926f10d6248 (patch) | |
tree | fbba0aa7238ad81f16210166c046e34ab763383d | |
parent | 3eef33a80656ffe30f5815708c400b17e8f3bae8 (diff) | |
download | bcm5719-llvm-e4dbeb40c6a08abb5486c25ab0b31926f10d6248.tar.gz bcm5719-llvm-e4dbeb40c6a08abb5486c25ab0b31926f10d6248.zip |
[X86][AVX] Enabled MULHS/MULHU v16i16 vectors on AVX1 targets
Correct splitting of v16i16 vectors into v8i16 vectors to prevent scalarization
Differential Revision: http://reviews.llvm.org/D18307
llvm-svn: 264512
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 359 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 387 |
3 files changed, 54 insertions, 694 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f5e01bc05ac..7f1500a2c4e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1253,6 +1253,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Custom); + setOperationAction(ISD::MULHS, MVT::v16i16, Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index 78b29b0c9e6..3f81d2e7b8c 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -193,150 +193,15 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_div7_16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movswl %cx, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: cwtl -; AVX1-NEXT: imull $18725, %eax, %eax # imm = 0x4925 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: sarw %cx -; AVX1-NEXT: shrl $31, %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2 +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -674,198 +539,20 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_rem7_16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movswl %cx, %edx -; AVX1-NEXT: imull $18725, %edx, %edx # imm = 0x4925 -; AVX1-NEXT: movl %edx, %esi -; AVX1-NEXT: shrl $16, %esi -; AVX1-NEXT: sarw %si -; AVX1-NEXT: shrl $31, %edx -; AVX1-NEXT: addl %esi, %edx -; AVX1-NEXT: leal (,%rdx,8), %esi -; AVX1-NEXT: subl %edx, %esi -; AVX1-NEXT: subl %esi, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movswl %cx, %edx -; AVX1-NEXT: imull $18725, %edx, %edx # imm = 0x4925 -; AVX1-NEXT: movl %edx, %esi -; AVX1-NEXT: shrl $16, %esi -; AVX1-NEXT: sarw %si -; AVX1-NEXT: shrl $31, %edx -; AVX1-NEXT: addl %esi, %edx -; AVX1-NEXT: leal (,%rdx,8), %esi -; AVX1-NEXT: subl %edx, %esi -; AVX1-NEXT: subl %esi, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: movswl %ax, %ecx -; AVX1-NEXT: imull $18725, %ecx, %ecx # imm = 0x4925 -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: sarw %dx -; AVX1-NEXT: shrl $31, %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: leal (,%rcx,8), %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: subl %edx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4 +; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3 +; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 4f69098d0b9..992be948127 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -184,154 +184,19 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_div7_16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %edx -; AVX1-NEXT: imull $9363, %edx, %edx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: andl $65534, %ecx # imm = 0xFFFE -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movzwl %cx, %edx -; AVX1-NEXT: imull $9363, %edx, %edx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: andl $65534, %ecx # imm = 0xFFFE -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: andl $65534, %eax # imm = 0xFFFE -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_div7_16i16: @@ -661,216 +526,22 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind { ; AVX1-LABEL: test_rem7_16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %edx -; AVX1-NEXT: imull $9363, %edx, %edx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: movl %ecx, %esi -; AVX1-NEXT: subl %edx, %esi -; AVX1-NEXT: andl $65534, %esi # imm = 0xFFFE -; AVX1-NEXT: shrl %esi -; AVX1-NEXT: addl %edx, %esi -; AVX1-NEXT: shrl $2, %esi -; AVX1-NEXT: leal (,%rsi,8), %edx -; AVX1-NEXT: subl %esi, %edx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movzwl %cx, %edx -; AVX1-NEXT: imull $9363, %edx, %edx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: movl %ecx, %esi -; AVX1-NEXT: subl %edx, %esi -; AVX1-NEXT: andl $65534, %esi # imm = 0xFFFE -; AVX1-NEXT: shrl %esi -; AVX1-NEXT: addl %edx, %esi -; AVX1-NEXT: shrl $2, %esi -; AVX1-NEXT: leal (,%rsi,8), %edx -; AVX1-NEXT: subl %esi, %edx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: subl %ecx, %edx -; AVX1-NEXT: andl $65534, %edx # imm = 0xFFFE -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: shrl $2, %edx -; AVX1-NEXT: leal (,%rdx,8), %ecx -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; |