| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-07-10 07:58:33 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-07-10 07:58:33 +0000 |
| commit | d32ca2c0b78def49ed6e9b612f5481775662fc4e (patch) | |
| tree | b98de2c086c2fe9c75bc23178e05c0a5c29cc5ef /llvm/test/CodeGen | |
| parent | 5fd020c0829b1ac201fc1b7494d8f5c5375613ea (diff) | |
| download | bcm5719-llvm-d32ca2c0b78def49ed6e9b612f5481775662fc4e.tar.gz bcm5719-llvm-d32ca2c0b78def49ed6e9b612f5481775662fc4e.zip | |
[X86][SSE] Prefer BLEND(SHL(v,c1),SHL(v,c2)) over MUL(v, c3)
Now that rL336250 has landed, we should prefer 2 immediate shifts + a shuffle blend over performing a multiply. Despite the increase in instruction count, this is quicker (especially for slow v4i32 multiplies) and avoids loads and constant pool usage. It does, however, increase register pressure. Code size goes up a little, but by less than what we save in constant pool data.
This patch also adds v16i16 support to the BLEND(SHIFT(v,c1),SHIFT(v,c2)) combine, and prevents blending on pre-SSE41 shifts if it would introduce extra blend masks/constant pool usage.
Differential Revision: https://reviews.llvm.org/D48936
llvm-svn: 336642
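To make the equivalence concrete, here is a minimal standalone C/SSE4.1 sketch (not part of the patch; the helper names and sample values are made up for illustration) mirroring the `test4` change in vec_shift6.ll: a v4i32 multiply by <1,1,2,2> can instead keep the low two lanes untouched and blend in the high two lanes shifted left by 1, so no constant load is needed.

```c
/* Standalone illustration, not code from the patch: multiply by <1,1,2,2>
 * versus two-lane shift + pblendw, as in vec_shift6.ll's test4. */
#include <smmintrin.h> /* SSE4.1: _mm_mullo_epi32, _mm_blend_epi16 */
#include <stdio.h>

static __m128i mul_version(__m128i v) {
    /* pmulld with a constant-pool vector <1,1,2,2> (low lane first) */
    return _mm_mullo_epi32(v, _mm_set_epi32(2, 2, 1, 1));
}

static __m128i shift_blend_version(__m128i v) {
    __m128i hi = _mm_slli_epi32(v, 1);   /* pslld $1 */
    /* pblendw 0xF0: keep 16-bit lanes 0-3 (low two i32) from v,
     * take lanes 4-7 (high two i32) from the shifted copy */
    return _mm_blend_epi16(v, hi, 0xF0);
}

int main(void) {
    __m128i v = _mm_set_epi32(40, 30, 20, 10); /* lanes: 10,20,30,40 */
    int a[4], b[4];
    _mm_storeu_si128((__m128i *)a, mul_version(v));
    _mm_storeu_si128((__m128i *)b, shift_blend_version(v));
    for (int i = 0; i < 4; ++i)
        printf("lane %d: mul=%d shift+blend=%d\n", i, a[i], b[i]);
    return 0;
}
```

Built with `-msse4.1`, both helpers should produce identical lanes; the shift+blend form is what the new codegen in the diff below emits.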
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/X86/combine-shl.ll | 34 |
| -rw-r--r-- | llvm/test/CodeGen/X86/lower-vec-shift.ll | 25 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_shift6.ll | 17 |
| -rw-r--r-- | llvm/test/CodeGen/X86/widen_arith-4.ll | 12 |
4 files changed, 48 insertions, 40 deletions
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 70386bb9aef..c037b0f0aa4 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -264,22 +264,14 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
 ; SSE2-NEXT: psrad $16, %xmm1
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,1073741824,1073741824]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [536870912,536870912,268435456,268435456]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: pslld $30, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pslld $29, %xmm2
+; SSE2-NEXT: pslld $28, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_shl_ext_shl1:
@@ -288,8 +280,14 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pslld $30, %xmm2
+; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pslld $28, %xmm2
+; SSE41-NEXT: pslld $29, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_shl_ext_shl1:
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll
index 0b573d7a8b8..31059c40648 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll
@@ -266,10 +266,14 @@ define <16 x i16> @test11(<16 x i16> %a) {
 ;
 ; AVX1-LABEL: test11:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test11:
@@ -291,15 +295,20 @@ define <16 x i16> @test12(<16 x i16> %a) {
 ; AVX1-LABEL: test12:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
+; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
+; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test12:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
+; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
 ; AVX2-NEXT: retq
 %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
 ret <16 x i16> %lshr
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 7f2e1e5c2f7..1415d8549ae 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -67,19 +67,18 @@ define <4 x i32> @test3(<4 x i32> %a) {
 define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE2-LABEL: test4:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,2]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test4:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pslld $1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test4:
diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll
index a38b12a759d..d97e6d4463f 100644
--- a/llvm/test/CodeGen/X86/widen_arith-4.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-4.ll
@@ -49,7 +49,6 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
 ; SSE41-NEXT: jmp .LBB0_1
 ; SSE41-NEXT: .p2align 4, 0x90
 ; SSE41-NEXT: .LBB0_2: # %forbody
@@ -58,10 +57,13 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
 ; SSE41-NEXT: shlq $4, %rax
 ; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; SSE41-NEXT: movdqa (%rdx,%rax), %xmm2
-; SSE41-NEXT: psubw %xmm0, %xmm2
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pextrw $4, %xmm2, 8(%rcx,%rax)
+; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
+; SSE41-NEXT: psubw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psllw $2, %xmm2
+; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
 ; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
 ; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT: .LBB0_1: # %forcond
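For the v16i16 case above (test12 in lower-vec-shift.ll), the same idea applies per 128-bit half: the multiply by <2,8,2,2,2,8,8,8> becomes psllw $1, psllw $3 and a pblendw whose immediate selects the shift-by-3 result in exactly the lanes that needed the multiply by 8. The following is a standalone C sketch (not code from LLVM; the helper names and sample input are invented for illustration, and 0xE2 is the mask implied by that lane pattern):

```c
/* Standalone illustration, not code from the patch: one 128-bit half of
 * test12's per-lane shift <1,3,1,1,1,3,3,3>, as multiply vs shift+blend. */
#include <smmintrin.h> /* SSE4.1: _mm_blend_epi16 */
#include <stdio.h>

static __m128i mul_version(__m128i v) {
    /* pmullw with <2,8,2,2,2,8,8,8> (low lane first; _mm_set_epi16 is high-to-low) */
    return _mm_mullo_epi16(v, _mm_set_epi16(8, 8, 8, 2, 2, 2, 8, 2));
}

static __m128i shift_blend_version(__m128i v) {
    __m128i by1 = _mm_slli_epi16(v, 1);      /* psllw $1 */
    __m128i by3 = _mm_slli_epi16(v, 3);      /* psllw $3 */
    /* 0xE2 = 0b11100010: take lanes 1,5,6,7 from by3, the rest from by1 */
    return _mm_blend_epi16(by1, by3, 0xE2);
}

int main(void) {
    __m128i v = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); /* lanes: 1..8 */
    short a[8], b[8];
    _mm_storeu_si128((__m128i *)a, mul_version(v));
    _mm_storeu_si128((__m128i *)b, shift_blend_version(v));
    for (int i = 0; i < 8; ++i)
        printf("lane %d: mul=%d shift+blend=%d\n", i, a[i], b[i]);
    return 0;
}
```

The shift+blend form trades one extra instruction per half for dropping the vmovdqa of the multiplier constant, which is the trade-off described in the commit message.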

