summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Simon Pilgrim <llvm-dev@redking.me.uk> 2017-03-11 20:42:31 +0000
committer: Simon Pilgrim <llvm-dev@redking.me.uk> 2017-03-11 20:42:31 +0000
commit: 18debfa5b44aae8faa38c40ff0ad32e5fabb7773 (patch)
tree: a8014fcb561281d9c35bbb480b15b173c0f5a104
parent: 9ff5732c921ffae261784551529e82b29440cf95 (diff)
downloadbcm5719-llvm-18debfa5b44aae8faa38c40ff0ad32e5fabb7773.tar.gz
bcm5719-llvm-18debfa5b44aae8faa38c40ff0ad32e5fabb7773.zip
[X86][SSE] Improve extraction of elements from v16i8 (pre-SSE41)
Without SSE41 (pextrb) we currently extract byte elements from a vector by spilling to the stack and reloading the byte. This patch is an initial attempt at using MOVD/PEXTRW to extract the relevant DWORD/WORD from the vector and then shift+truncate to collect the correct byte. Extracting multiple bytes this way would result in code bloat, but, as explained in the patch, we could probably afford to be more aggressive with the supported extractions before falling back on spilling again - possibly by counting the number of extracts and which DWORD/WORD they originate from. Differential Revision: https://reviews.llvm.org/D29841 (llvm-svn: 297568)
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 28
-rw-r--r-- llvm/test/CodeGen/X86/extract-store.ll 48
-rw-r--r-- llvm/test/CodeGen/X86/extractelement-index.ll 24
3 files changed, 54 insertions, 46 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 67bf864ddc8..95bba1a5773 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13935,7 +13935,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: handle v16i8.
+ // TODO: We only extract a single element from v16i8, we can probably afford
+ // to be more aggressive here before using the default approach of spilling to
+ // stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll
index fda56f94000..1751f03731d 100644
--- a/llvm/test/CodeGen/X86/extract-store.ll
+++ b/llvm/test/CodeGen/X86/extract-store.ll
@@ -9,22 +9,14 @@
define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_0:
; SSE2-X32: # BB#0:
-; SSE2-X32-NEXT: pushl %ebp
-; SSE2-X32-NEXT: movl %esp, %ebp
-; SSE2-X32-NEXT: andl $-16, %esp
-; SSE2-X32-NEXT: subl $32, %esp
-; SSE2-X32-NEXT: movl 8(%ebp), %eax
-; SSE2-X32-NEXT: movaps %xmm0, (%esp)
-; SSE2-X32-NEXT: movb (%esp), %cl
+; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT: movd %xmm0, %ecx
; SSE2-X32-NEXT: movb %cl, (%eax)
-; SSE2-X32-NEXT: movl %ebp, %esp
-; SSE2-X32-NEXT: popl %ebp
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_0:
; SSE2-X64: # BB#0:
-; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-X64-NEXT: movd %xmm0, %eax
; SSE2-X64-NEXT: movb %al, (%rdi)
; SSE2-X64-NEXT: retq
;
@@ -57,22 +49,16 @@ define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_3:
; SSE2-X32: # BB#0:
-; SSE2-X32-NEXT: pushl %ebp
-; SSE2-X32-NEXT: movl %esp, %ebp
-; SSE2-X32-NEXT: andl $-16, %esp
-; SSE2-X32-NEXT: subl $32, %esp
-; SSE2-X32-NEXT: movl 8(%ebp), %eax
-; SSE2-X32-NEXT: movaps %xmm0, (%esp)
-; SSE2-X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT: movd %xmm0, %ecx
+; SSE2-X32-NEXT: shrl $24, %ecx
; SSE2-X32-NEXT: movb %cl, (%eax)
-; SSE2-X32-NEXT: movl %ebp, %esp
-; SSE2-X32-NEXT: popl %ebp
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_3:
; SSE2-X64: # BB#0:
-; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-X64-NEXT: movd %xmm0, %eax
+; SSE2-X64-NEXT: shrl $24, %eax
; SSE2-X64-NEXT: movb %al, (%rdi)
; SSE2-X64-NEXT: retq
;
@@ -105,23 +91,15 @@ define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_15:
; SSE2-X32: # BB#0:
-; SSE2-X32-NEXT: pushl %ebp
-; SSE2-X32-NEXT: movl %esp, %ebp
-; SSE2-X32-NEXT: andl $-16, %esp
-; SSE2-X32-NEXT: subl $32, %esp
-; SSE2-X32-NEXT: movl 8(%ebp), %eax
-; SSE2-X32-NEXT: movaps %xmm0, (%esp)
-; SSE2-X32-NEXT: movb {{[0-9]+}}(%esp), %cl
-; SSE2-X32-NEXT: movb %cl, (%eax)
-; SSE2-X32-NEXT: movl %ebp, %esp
-; SSE2-X32-NEXT: popl %ebp
+; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT: pextrw $7, %xmm0, %ecx
+; SSE2-X32-NEXT: movb %ch, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_15:
; SSE2-X64: # BB#0:
-; SSE2-X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-X64-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-X64-NEXT: movb %al, (%rdi)
+; SSE2-X64-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-X64-NEXT: movb %ah, (%rdi) # NOREX
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i8_15:
diff --git a/llvm/test/CodeGen/X86/extractelement-index.ll b/llvm/test/CodeGen/X86/extractelement-index.ll
index 157e42b60a3..e36e33ffe66 100644
--- a/llvm/test/CodeGen/X86/extractelement-index.ll
+++ b/llvm/test/CodeGen/X86/extractelement-index.ll
@@ -11,8 +11,9 @@
define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_1:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: shrl $8, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_1:
@@ -33,8 +34,9 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_11:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: pextrw $5, %xmm0, %eax
+; SSE2-NEXT: shrl $8, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_11:
@@ -55,8 +57,8 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_14:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_14:
@@ -77,8 +79,9 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v32i8_1:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: shrl $8, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v32i8_1:
@@ -100,8 +103,9 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v32i8_17:
; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: shrl $8, %eax
+; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v32i8_17:
OpenPOWER on IntegriCloud