diff options
| author | Sanjay Patel <spatel@rotateright.com> | 2019-07-16 21:30:41 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-07-16 21:30:41 +0000 |
| commit | d746a210e16925d8c26bd8359598d95213712218 (patch) | |
| tree | b993fe7950707c1a884926b88f123aad6b0bb565 /llvm/test | |
| parent | f4c2d57f767d870b4787c86b543ded8076fe108b (diff) | |
| download | bcm5719-llvm-d746a210e16925d8c26bd8359598d95213712218.tar.gz bcm5719-llvm-d746a210e16925d8c26bd8359598d95213712218.zip | |
[x86] use more phadd for reductions
This is part of what is requested by PR42023:
https://bugs.llvm.org/show_bug.cgi?id=42023
There's an extension needed for FP add, but exactly how we would specify
that using flags is not clear to me, so I left that as a TODO.
We're still missing patterns for partial reductions when the input vector
is 256-bit or 512-bit, but I think that's a failure of vector narrowing.
If we can reduce the widths, then this matching should work on those tests.
Differential Revision: https://reviews.llvm.org/D64760
llvm-svn: 366268
Diffstat (limited to 'llvm/test')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/phaddsub-extract.ll | 44 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-add-widen.ll | 22 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-add.ll | 22 |
3 files changed, 32 insertions, 56 deletions
diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll index e81952d331c..2a7039e932c 100644 --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -1903,10 +1903,8 @@ define i16 @hadd16_8(<8 x i16> %x223) { ; ; SSE3-FAST-LABEL: hadd16_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE3-FAST-NEXT: paddw %xmm0, %xmm1 -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-FAST-NEXT: paddw %xmm1, %xmm0 +; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1926,10 +1924,8 @@ define i16 @hadd16_8(<8 x i16> %x223) { ; ; AVX-FAST-LABEL: hadd16_8: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1956,10 +1952,9 @@ define i32 @hadd32_4(<4 x i32> %x225) { ; ; SSE3-FAST-LABEL: hadd32_4: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 -; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 -; SSE3-FAST-NEXT: movd %xmm1, %eax +; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: hadd32_4: @@ -1973,8 +1968,7 @@ define i32 @hadd32_4(<4 x i32> %x225) { ; ; AVX-FAST-LABEL: hadd32_4: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, 
%xmm0 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: retq @@ -2097,10 +2091,8 @@ define i32 @hadd32_16(<16 x i32> %x225) { define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { ; SSE3-LABEL: hadd16_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE3-NEXT: paddw %xmm0, %xmm1 -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-NEXT: paddw %xmm1, %xmm0 +; SSE3-NEXT: phaddw %xmm0, %xmm0 +; SSE3-NEXT: phaddw %xmm0, %xmm0 ; SSE3-NEXT: phaddw %xmm0, %xmm0 ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -2108,10 +2100,8 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { ; ; AVX-LABEL: hadd16_8_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -2129,16 +2119,14 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_4_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE3-NEXT: paddd %xmm0, %xmm1 -; SSE3-NEXT: phaddd %xmm1, %xmm1 -; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: phaddd %xmm0, %xmm0 +; SSE3-NEXT: phaddd %xmm0, %xmm0 +; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: retq ; ; AVX-LABEL: hadd32_4_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll index 
b886a745edc..6dc5a2b54b5 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll @@ -254,8 +254,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -307,9 +306,8 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-FAST-LABEL: test_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -635,10 +633,8 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX1-FAST-LABEL: test_v8i16: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -704,11 +700,9 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-FAST-LABEL: test_v16i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index 02fb375a318..630299a1824 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -241,8 +241,7 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -294,9 +293,8 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-FAST-LABEL: test_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -605,10 +603,8 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; ; AVX1-FAST-LABEL: test_v8i16: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -674,11 +670,9 @@ define i16 
@test_v16i16(<16 x i16> %a0) { ; AVX1-FAST-LABEL: test_v16i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax |

