From c701596e8613f5083ada201c3c658623c9b4f234 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 29 Dec 2017 14:41:50 +0000 Subject: [X86][SSE] Match PSHUFLW/PSHUFHW + PSHUFD vXi16 shuffle patterns (PR34686) As noted in PR34686, we are relying on a PSHUFD+PSHUFLW+PSHUFHW shuffle chain for most general vXi16 unary shuffles. This patch checks for simpler PSHUFLW+PSHUFD and PSHUFHW+PSHUFD cases beforehand, building on some existing code that just handled splat shuffles. By doing so we also prevent premature use of PSHUFB shuffles which can be slower and require the creation/loading of constant shuffle masks. We now have the 'fast-variable-shuffle' option for hardware that prefers combining 2 or more shuffles to VPSHUFB etc. Differential Revision: https://reviews.llvm.org/D38318 llvm-svn: 321553 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 72 ++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 11 deletions(-) (limited to 'llvm/lib/Target') diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9edd799779c..8bc25c9f080 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11261,6 +11261,20 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); + // Attempt to directly match PSHUFLW or PSHUFHW. + if (isUndefOrInRange(LoMask, 0, 4) && + isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { + return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); + } + if (isUndefOrInRange(HiMask, 4, 8) && + isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { + for (int i = 0; i != 4; ++i) + HiMask[i] = (HiMask[i] < 0 ? 
HiMask[i] : (HiMask[i] - 4)); + return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); + } + SmallVector<int, 4> LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); @@ -11280,13 +11294,11 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); - // If we are splatting two values from one half - one to each half, then - // we can shuffle that half so each is splatted to a dword, then splat those - // to their respective halves. - auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, - int DOffset) { - int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; - int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; + // If we are shuffling values from one half - check how many different DWORD + // pairs we need to create. If only 1 or 2 then we can perform this as a + // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. + auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask, + ArrayRef<int> PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); @@ -11295,10 +11307,48 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( return DAG.getBitcast(VT, V); }; - if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) - return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); - if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) - return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); + if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { + int PSHUFDMask[4] = { -1, -1, -1, -1 }; + SmallVector<std::pair<int, int>, 4> DWordPairs; + int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); + + // Collect the different DWORD pairs. 
+ for (int DWord = 0; DWord != 4; ++DWord) { + int M0 = Mask[2 * DWord + 0]; + int M1 = Mask[2 * DWord + 1]; + M0 = (M0 >= 0 ? M0 % 4 : M0); + M1 = (M1 >= 0 ? M1 % 4 : M1); + if (M0 < 0 && M1 < 0) + continue; + + bool Match = false; + for (int j = 0, e = DWordPairs.size(); j < e; ++j) { + auto &DWordPair = DWordPairs[j]; + if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && + (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { + DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); + DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); + PSHUFDMask[DWord] = DOffset + j; + Match = true; + break; + } + } + if (!Match) { + PSHUFDMask[DWord] = DOffset + DWordPairs.size(); + DWordPairs.push_back(std::make_pair(M0, M1)); + } + } + + if (DWordPairs.size() <= 2) { + DWordPairs.resize(2, std::make_pair(-1, -1)); + int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, + DWordPairs[1].first, DWordPairs[1].second}; + if ((NumHToL + NumHToH) == 0) + return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); + if ((NumLToL + NumLToH) == 0) + return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); + } + } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up -- cgit v1.2.3