From ccf5f24b7b907680bdcfb19104cae78a92580682 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Fri, 20 Mar 2015 21:47:56 +0000
Subject: [X86, AVX] instcombine common cases of vperm2* intrinsics into shuffles

vperm2* intrinsics are just shuffles.
In a few special cases, they're not even shuffles.

Optimizing intrinsics in InstCombine is better than
handling this in the front-end for at least two reasons:

1. Optimizing custom-written SSE intrinsic code at -O0 makes vector coders
   really angry (and so I have regrets about some patches from last week).

2. Doing mask conversion logic in header files is hard to write and
   subsequently read.

There are a couple of TODOs in this patch to complete this optimization.

Differential Revision: http://reviews.llvm.org/D8486

llvm-svn: 232852
---
 .../Transforms/InstCombine/InstCombineCalls.cpp | 59 ++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp')

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 00d92c873bd..b59c9f5d910 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -197,6 +197,57 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
   return nullptr;
 }
 
+/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
+/// source vectors, unless a zero bit is set. If a zero bit is set,
+/// then ignore that half of the mask and clear that half of the vector.
+static Value *SimplifyX86vperm2(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+    VectorType *VecTy = cast<VectorType>(II.getType());
+    uint8_t Imm = CInt->getZExtValue();
+
+    // The immediate permute control byte looks like this:
+    //    [1:0] - select 128 bits from sources for low half of destination
+    //    [2]   - ignore
+    //    [3]   - zero low half of destination
+    //    [5:4] - select 128 bits from sources for high half of destination
+    //    [6]   - ignore
+    //    [7]   - zero high half of destination
+
+    if ((Imm & 0x88) == 0x88) {
+      // If both zero mask bits are set, this was just a weird way to
+      // generate a zero vector.
+      return ConstantAggregateZero::get(VecTy);
+    }
+
+    // TODO: If a single zero bit is set, replace one of the source operands
+    // with a zero vector and use the same mask generation logic as below.
+
+    if ((Imm & 0x88) == 0x00) {
+      // If neither zero mask bit is set, this is a simple shuffle.
+      unsigned NumElts = VecTy->getNumElements();
+      unsigned HalfSize = NumElts / 2;
+      unsigned HalfBegin; // First mask index of the selected source half.
+      SmallVector<int, 8> ShuffleMask(NumElts);
+
+      // Permute low half of result.
+      HalfBegin = (Imm & 0x3) * HalfSize; // 0-1: halves of Op0, 2-3: of Op1.
+      for (unsigned i = 0; i != HalfSize; ++i)
+        ShuffleMask[i] = HalfBegin + i;
+
+      // Permute high half of result.
+      HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
+      for (unsigned i = HalfSize; i != NumElts; ++i)
+        ShuffleMask[i] = HalfBegin + i - HalfSize;
+
+      Value *Op0 = II.getArgOperand(0);
+      Value *Op1 = II.getArgOperand(1);
+      return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
+    }
+  }
+  return nullptr;
+}
+
 /// visitCallInst - CallInst simplification. This mostly only handles folding
 /// of intrinsic instructions. For normal calls, it allows visitCallSite to do
 /// the heavy lifting.
@@ -904,6 +955,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(CI, Shuffle); } + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + // TODO: Add the AVX2 version of this instruction. + if (Value *V = SimplifyX86vperm2(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating -- cgit v1.2.3