author     Simon Pilgrim <llvm-dev@redking.me.uk>    2019-03-24 19:06:35 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2019-03-24 19:06:35 +0000
commit     87d4ab8b92e17db517499403eaa2e0b19992fae2 (patch)
tree       0c973e1dd13a30f2acd11e595f2201c6560f7fd0 /llvm/test/CodeGen/X86/vector-reduce-umax.ll
parent     6af0363857f5815fb69268198dd55f29c7a3539b (diff)
[X86][SSE41] Start shuffle combining from ZERO_EXTEND_VECTOR_INREG (PR40685)
Enable SSE41 ZERO_EXTEND_VECTOR_INREG shuffle combines - for the PMOVZX(PSHUFD(V)) -> UNPCKH(V,0) pattern we reduce the shuffles (a port5 bottleneck on Intel) at the expense of creating a zero (pxor v,v) and an extra register move - a good trade-off, as these are pretty cheap and in most cases don't increase register pressure.

This also exposed a missed opportunity to combine to ZERO_EXTEND_VECTOR_INREG with folded loads - even when we're in the float domain.

llvm-svn: 356864
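
For illustration, the pattern the commit message describes can be written with SSE4.1 intrinsics: zero-extending the upper two i32 lanes via PSHUFD+PMOVZXDQ gives the same result as a single PUNPCKHDQ against a zeroed register. A minimal C++ sketch (compile with -msse4.1; the input values and the check in main are illustrative, not from the commit):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  __m128i v = _mm_setr_epi32(10, 20, 30, 40);

  // Before: PMOVZX(PSHUFD(V)) - two shuffle-port uops on Intel.
  __m128i hi = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); // pshufd v[2,3,0,1]
  __m128i a  = _mm_cvtepu32_epi64(hi);                        // pmovzxdq

  // After: UNPCKH(V, 0) - one shuffle plus a cheap pxor and register move.
  __m128i b = _mm_unpackhi_epi32(v, _mm_setzero_si128());     // punpckhdq

  alignas(16) uint64_t ra[2], rb[2];
  _mm_store_si128((__m128i *)ra, a);
  _mm_store_si128((__m128i *)rb, b);
  assert(ra[0] == rb[0] && ra[1] == rb[1]); // both yield [30, 40] zero-extended
  return 0;
}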
Diffstat (limited to 'llvm/test/CodeGen/X86/vector-reduce-umax.ll')
-rw-r--r--  llvm/test/CodeGen/X86/vector-reduce-umax.ll  26
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index c56ca549fd7..23442287b4f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -751,22 +751,24 @@ define i32 @test_v2i32(<2 x i32> %a0) {
;
; SSE41-LABEL: test_v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v2i32:
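
For context, the SSE41 body above implements an unsigned 64-bit max without PCMPGTQ: both operands are XORed with the [2147483648,2147483648] bias constant so that signed per-dword compares (pcmpgtd/pcmpeqd, merged via pshufd/pand/por) produce the unsigned ordering, and blendvpd then selects the larger element. A minimal scalar C++ sketch of just the sign-bias trick (the function name is illustrative):

#include <cassert>
#include <cstdint>

// XOR with 0x80000000 maps unsigned order onto signed order,
// mirroring the pxor-with-bias step in the assembly above.
bool ugt_via_signed(uint32_t a, uint32_t b) {
  int32_t sa = (int32_t)(a ^ 0x80000000u);
  int32_t sb = (int32_t)(b ^ 0x80000000u);
  return sa > sb;
}

int main() {
  assert(ugt_via_signed(0xFFFFFFFFu, 1u));   // large unsigned compares greater
  assert(!ugt_via_signed(0u, 0x80000000u));  // zero is the unsigned minimum
  return 0;
}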