diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-07-06 20:46:41 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-07-06 20:46:41 +0000 |
commit | d85cae3d52a36cbed6443abebe1d6ed77fc03cad (patch) | |
tree | ce125cc5b3ca4e6dfb0a94ae817ccaeed496b77a /llvm/lib/Target/X86/Utils | |
parent | 381dffd655b5ed8a71dab57bd7ba51224cfedbe6 (diff) | |
download | bcm5719-llvm-d85cae3d52a36cbed6443abebe1d6ed77fc03cad.tar.gz bcm5719-llvm-d85cae3d52a36cbed6443abebe1d6ed77fc03cad.zip |
[X86][SSE4A] Shuffle lowering using SSE4A EXTRQ/INSERTQ instructions
This patch adds support for v8i16 and v16i8 shuffle lowering using the immediate versions of the SSE4A EXTRQ and INSERTQ instructions. Although rather limited (they can only act on the lower 64-bits of the source vectors, leave the upper 64-bits of the result vector undefined and don't have VEX encoded variants), the instructions are still useful for the zero extension of any lane (EXTRQ) or inserting a lane into another vector (INSERTQ). Testing demonstrated that it wasn't typically worth it to use these instructions for v2i64 or v4i32 vector shuffles although they are capable of it.
As well as adding specific pattern matching for the shuffles, the patch uses EXTRQ for zero extension cases where SSE41 isn't available and its more efficient than the SSE2 'unpack' default approach. It also adds shuffle decode support for the EXTRQ / INSERTQ cases when the instructions are handling full byte-sized extractions / insertions.
From this foundation, future patches will be able to make use of the instructions for situations that use their ability to extract/insert at the bit level.
Differential Revision: http://reviews.llvm.org/D10146
llvm-svn: 241508
Diffstat (limited to 'llvm/lib/Target/X86/Utils')
-rw-r--r-- | llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp | 74 | ||||
-rw-r--r-- | llvm/lib/Target/X86/Utils/X86ShuffleDecode.h | 8 |
2 files changed, 82 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index ef3318ba758..9777c0d85e9 100644 --- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -431,4 +431,78 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64-bits. The upper 64-bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64-bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + } // llvm namespace diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 14b69434806..3d10d18e860 100644 --- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,14 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif |