| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-03-24 19:06:35 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-03-24 19:06:35 +0000 |
| commit | 87d4ab8b92e17db517499403eaa2e0b19992fae2 (patch) | |
| tree | 0c973e1dd13a30f2acd11e595f2201c6560f7fd0 /llvm/test/CodeGen/X86/vector-reduce-umax.ll | |
| parent | 6af0363857f5815fb69268198dd55f29c7a3539b (diff) | |
[X86][SSE41] Start shuffle combining from ZERO_EXTEND_VECTOR_INREG (PR40685)
Enable SSE41 ZERO_EXTEND_VECTOR_INREG shuffle combines. For the PMOVZX(PSHUFD(V)) -> UNPCKH(V,0) pattern this reduces the shuffle count (shuffles bottleneck on port 5 on Intel) at the expense of creating a zero (pxor v,v) and an extra register move. That is a good trade-off: both are cheap, and in most cases the combine doesn't increase register pressure.
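To make the trade-off concrete, here is a minimal sketch of the before/after instruction pattern in AT&T-syntax x86 assembly, distilled from the SSE41 diff further down; the register assignments and the $0x4e immediate are chosen for illustration, not taken verbatim from any one test.

```asm
# Before: PMOVZX(PSHUFD(V)) - two shuffle uops, both competing for
# port 5 on recent Intel cores.
pshufd   $0x4e, %xmm0, %xmm1   # xmm1 = xmm0[2,3,0,1] (move high pair low)
pmovzxdq %xmm1, %xmm2          # xmm2 = zext(xmm1[0]), zext(xmm1[1])

# After: UNPCKH(V,0) - one shuffle plus a zero. pxor v,v is a
# recognized zero idiom, and the extra register move is typically
# eliminated at rename on recent Intel cores.
pxor      %xmm2, %xmm2         # xmm2 = 0
punpckhdq %xmm2, %xmm0         # xmm0 = xmm0[2],0,xmm0[3],0
                               #      = zext of the high dword pair
```

Both sequences produce the same result: the high two dwords of the source zero-extended into two qword lanes.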
This also exposed a missed opportunity to combine to ZERO_EXTEND_VECTOR_INREG with folded loads, even when we're in the float domain.
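As a sketch of what that load fold buys (assuming a hypothetical source pointer in %rdi): PMOVZXDQ accepts a 64-bit memory operand, so the zero-extend can consume its input straight from memory rather than requiring a separate load first.

```asm
# ZERO_EXTEND_VECTOR_INREG with the source folded from memory:
# one instruction loads 8 bytes and zero-extends two dwords to qwords.
pmovzxdq (%rdi), %xmm0         # xmm0 = zext(mem[0]), zext(mem[1])

# Without the fold, the same result costs a separate load:
movq     (%rdi), %xmm0         # load 64 bits into the low half
pmovzxdq %xmm0, %xmm0          # then zero-extend in-register
```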
llvm-svn: 356864
Diffstat (limited to 'llvm/test/CodeGen/X86/vector-reduce-umax.ll')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-umax.ll | 26 |
1 file changed, 14 insertions(+), 12 deletions(-)
```diff
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index c56ca549fd7..23442287b4f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -751,22 +751,24 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ;
 ; SSE41-LABEL: test_v2i32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
 ; SSE41-NEXT: movdqa %xmm1, %xmm3
 ; SSE41-NEXT: pxor %xmm0, %xmm3
 ; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: test_v2i32:
```

