diff options
author | Igor Breger <igor.breger@intel.com> | 2017-02-20 14:16:29 +0000 |
---|---|---|
committer | Igor Breger <igor.breger@intel.com> | 2017-02-20 14:16:29 +0000 |
commit | fda32d266a076af4512c5f10148933a109c4864d (patch) | |
tree | 89b15e1ffdfcaa08f94c2bed6781b9d471b95e67 /llvm/test/CodeGen/X86/extractelement-index.ll | |
parent | d9b319e3e3b9aa77741e70480c4ce41094ff3a85 (diff) | |
download | bcm5719-llvm-fda32d266a076af4512c5f10148933a109c4864d.tar.gz bcm5719-llvm-fda32d266a076af4512c5f10148933a109c4864d.zip |
[X86] Fix EXTRACT_VECTOR_ELT with variable index from v32i16 and v64i8 vector.
It's more profitable to go through memory (1 cycle throughput)
than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput) to implement EXTRACT_VECTOR_ELT with a variable index.
The IACA tool was used to get a performance estimate (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
For example for var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8 test from vector-shuffle-variable-128.ll I get 26 cycles vs 79 cycles.
Removed the VINSERT node, since we don't need it any more.
Differential Revision: https://reviews.llvm.org/D29690
llvm-svn: 295660
Diffstat (limited to 'llvm/test/CodeGen/X86/extractelement-index.ll')
-rw-r--r-- | llvm/test/CodeGen/X86/extractelement-index.ll | 34 |
1 files changed, 13 insertions, 21 deletions
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll index 8c12e7148aa..157e42b60a3 100644 --- a/llvm/test/CodeGen/X86/extractelement-index.ll +++ b/llvm/test/CodeGen/X86/extractelement-index.ll @@ -538,27 +538,19 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX1-LABEL: extractelement_v8i32_var: -; AVX1: # BB#0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movl (%rsp,%rdi,4), %eax -; AVX1-NEXT: movq %rbp, %rsp -; AVX1-NEXT: popq %rbp -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: extractelement_v8i32_var: -; AVX2: # BB#0: -; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: extractelement_v8i32_var: +; AVX: # BB#0: +; AVX-NEXT: pushq %rbp +; AVX-NEXT: movq %rsp, %rbp +; AVX-NEXT: andq $-32, %rsp +; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: andl $7, %edi +; AVX-NEXT: vmovaps %ymm0, (%rsp) +; AVX-NEXT: movl (%rsp,%rdi,4), %eax +; AVX-NEXT: movq %rbp, %rsp +; AVX-NEXT: popq %rbp +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %b = extractelement <8 x i32> %a, i256 %i ret i32 %b } |