| author | Michael Kuperstein <michael.m.kuperstein@intel.com> | 2014-12-04 13:49:51 +0000 |
|---|---|---|
| committer | Michael Kuperstein <michael.m.kuperstein@intel.com> | 2014-12-04 13:49:51 +0000 |
| commit | 0492bd2b9e4557b4547e5ed5e8fbad8bab38da4c (patch) | |
| tree | 316cf06c62ebdc8e4519f24ff7f2db275cd66d3b /llvm/test | |
| parent | c9bcffd20111eff93cf1fe3fda905ea5aeddaf92 (diff) | |
| download | bcm5719-llvm-0492bd2b9e4557b4547e5ed5e8fbad8bab38da4c.tar.gz bcm5719-llvm-0492bd2b9e4557b4547e5ed5e8fbad8bab38da4c.zip | |
[X86] Improve a dag-combine that handles a vector extract -> zext sequence.
The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads.
According to measurements by Martin Krastev (see PR 21269), on x86-64 a sequence of extracts, movs, and shifts gives better performance. For 32-bit x86, however, the previous store-and-reload sequence still seems better.
Differential Revision: http://reviews.llvm.org/D6501
llvm-svn: 223360
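For context, here is a minimal, hand-written sketch (illustrative only; not taken from the patch, and the function name is made up) of the IR shape this combine fires on: per-lane extracts from a <4 x i32>, each widened to i64:

```llvm
; Illustrative only: a lane-wise extract -> zext chain of the kind the
; combine rewrites. The old combine lowered this by bouncing the vector
; through the stack (one store, four scalar loads); after this patch,
; x86-64 instead extracts each 64-bit half of the vector once and
; splits it into lanes with mov/shift instructions.
define <4 x i64> @extract_zext(<4 x i32>* %p) nounwind {
  %a  = load <4 x i32>* %p
  %e0 = extractelement <4 x i32> %a, i32 0
  %e1 = extractelement <4 x i32> %a, i32 1
  %e2 = extractelement <4 x i32> %a, i32 2
  %e3 = extractelement <4 x i32> %a, i32 3
  %z0 = zext i32 %e0 to i64
  %z1 = zext i32 %e1 to i64
  %z2 = zext i32 %e2 to i64
  %z3 = zext i32 %e3 to i64
  %v0 = insertelement <4 x i64> undef, i64 %z0, i32 0
  %v1 = insertelement <4 x i64> %v0, i64 %z1, i32 1
  %v2 = insertelement <4 x i64> %v1, i64 %z2, i32 2
  %v3 = insertelement <4 x i64> %v2, i64 %z3, i32 3
  ret <4 x i64> %v3
}
```

(The sketch uses the pre-3.7 typed-pointer IR syntax, matching the test file below.)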
Diffstat (limited to 'llvm/test')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/gather-addresses.ll | 83 |
1 file changed, 59 insertions(+), 24 deletions(-)
```diff
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 5f48b1e32b1..6d397b21148 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -1,35 +1,38 @@
 ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
 ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
+; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
 ; rdar://7398554
 
 ; When doing vector gather-scatter index calculation with 32-bit indices,
-; bounce the vector off of cache rather than shuffling each individual
+; use an efficient mov/shift sequence rather than shuffling each individual
 ; element out of the index vector.
 
-; CHECK: foo:
-; LIN: movaps (%rsi), %xmm0
-; LIN: andps (%rdx), %xmm0
-; LIN: movaps %xmm0, -24(%rsp)
-; LIN: movslq -24(%rsp), %[[REG1:r.+]]
-; LIN: movslq -20(%rsp), %[[REG2:r.+]]
-; LIN: movslq -16(%rsp), %[[REG3:r.+]]
-; LIN: movslq -12(%rsp), %[[REG4:r.+]]
-; LIN: movsd (%rdi,%[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%[[REG4]],8), %xmm1
+; CHECK-LABEL: foo:
+; LIN: movdqa (%rsi), %xmm0
+; LIN: pand (%rdx), %xmm0
+; LIN: pextrq $1, %xmm0, %r[[REG4:.+]]
+; LIN: movd %xmm0, %r[[REG2:.+]]
+; LIN: movslq %e[[REG2]], %r[[REG1:.+]]
+; LIN: sarq $32, %r[[REG2]]
+; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
+; LIN: sarq $32, %r[[REG4]]
+; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
+; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
+; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
 
-; WIN: movaps (%rdx), %xmm0
-; WIN: andps (%r8), %xmm0
-; WIN: movaps %xmm0, (%rsp)
-; WIN: movslq (%rsp), %[[REG1:r.+]]
-; WIN: movslq 4(%rsp), %[[REG2:r.+]]
-; WIN: movslq 8(%rsp), %[[REG3:r.+]]
-; WIN: movslq 12(%rsp), %[[REG4:r.+]]
-; WIN: movsd (%rcx,%[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%[[REG4]],8), %xmm1
+; WIN: movdqa (%rdx), %xmm0
+; WIN: pand (%r8), %xmm0
+; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]
+; WIN: movd %xmm0, %r[[REG2:.+]]
+; WIN: movslq %e[[REG2]], %r[[REG1:.+]]
+; WIN: sarq $32, %r[[REG2]]
+; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
+; WIN: sarq $32, %r[[REG4]]
+; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
+; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
+; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
 
 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %a = load <4 x i32>* %i
@@ -53,3 +56,35 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %v3 = insertelement <4 x double> %v2, double %r3, i32 3
   ret <4 x double> %v3
 }
+
+; Check that the sequence previously used above, which bounces the vector off
+; the cache, works for x86-32. Note that in this case it will not be used for
+; index calculation, since indexes are 32-bit, not 64.
+; CHECK-LABEL: old:
+; LIN32: movaps %xmm0, (%esp)
+; LIN32-DAG: {{(mov|and)}}l (%esp),
+; LIN32-DAG: {{(mov|and)}}l 4(%esp),
+; LIN32-DAG: {{(mov|and)}}l 8(%esp),
+; LIN32-DAG: {{(mov|and)}}l 12(%esp),
+define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
+  %a = load <4 x i32>* %i
+  %b = load <4 x i32>* %h
+  %j = and <4 x i32> %a, %b
+  %d0 = extractelement <4 x i32> %j, i32 0
+  %d1 = extractelement <4 x i32> %j, i32 1
+  %d2 = extractelement <4 x i32> %j, i32 2
+  %d3 = extractelement <4 x i32> %j, i32 3
+  %q0 = zext i32 %d0 to i64
+  %q1 = zext i32 %d1 to i64
+  %q2 = zext i32 %d2 to i64
+  %q3 = zext i32 %d3 to i64
+  %r0 = and i64 %q0, %f
+  %r1 = and i64 %q1, %f
+  %r2 = and i64 %q2, %f
+  %r3 = and i64 %q3, %f
+  %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
+  %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
+  %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
+  %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
+  ret <4 x i64> %v3
+}
```
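To make the new x86-64 sequence concrete, the sketch below (hand-written for this page, not part of the patch; the function name and the registers in the comments are illustrative) expresses the same lane splitting directly in IR: the <4 x i32> is viewed as two i64 lanes, each lane is extracted once (movd/pextrq), and the two 32-bit halves of each lane are then recovered with a sign-extending move and an arithmetic shift, matching the movslq/sarq CHECK lines above:

```llvm
; Hand-written sketch: recover four sign-extended i32 lanes from two
; i64 extracts, mirroring the movd/pextrq + movslq/sarq sequence the
; LIN/WIN CHECK lines expect (little-endian lane order assumed).
define void @split_lanes(<4 x i32> %v, i64* %out) nounwind {
  %w  = bitcast <4 x i32> %v to <2 x i64>
  %lo = extractelement <2 x i64> %w, i32 0   ; movd   %xmm0, %rax
  %hi = extractelement <2 x i64> %w, i32 1   ; pextrq $1, %xmm0, %rcx
  %t0 = trunc i64 %lo to i32
  %i0 = sext i32 %t0 to i64                  ; movslq %eax, ...  (lane 0)
  %i1 = ashr i64 %lo, 32                     ; sarq   $32, %rax  (lane 1)
  %t2 = trunc i64 %hi to i32
  %i2 = sext i32 %t2 to i64                  ; movslq %ecx, ...  (lane 2)
  %i3 = ashr i64 %hi, 32                     ; sarq   $32, %rcx  (lane 3)
  %p0 = getelementptr i64* %out, i64 0
  %p1 = getelementptr i64* %out, i64 1
  %p2 = getelementptr i64* %out, i64 2
  %p3 = getelementptr i64* %out, i64 3
  store i64 %i0, i64* %p0
  store i64 %i1, i64* %p1
  store i64 %i2, i64* %p2
  store i64 %i3, i64* %p3
  ret void
}
```

The point of the rewrite is that each element crosses the SSE-to-GPR boundary exactly once, instead of taking a round trip through the stack as in the old store-plus-scalar-loads lowering.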

