path: root/llvm/lib/Transforms
author     Sanjay Patel <spatel@rotateright.com>    2015-03-20 21:47:56 +0000
committer  Sanjay Patel <spatel@rotateright.com>    2015-03-20 21:47:56 +0000
commit     ccf5f24b7b907680bdcfb19104cae78a92580682 (patch)
tree       7881b1b31a6120b5097b06b57279e3f97e03288b /llvm/lib/Transforms
parent     3170e5620e463b20688bc629bde08c5ca554ea11 (diff)
[X86, AVX] instcombine common cases of vperm2* intrinsics into shuffles
vperm2* intrinsics are just shuffles. In a few special cases, they're not even shuffles.

Optimizing intrinsics in InstCombine is better than handling this in the front-end for at least two reasons:

1. Optimizing custom-written SSE intrinsic code at -O0 makes vector coders really angry (and so I have regrets about some patches from last week).

2. Doing mask conversion logic in header files is hard to write and subsequently read.

There are a couple of TODOs in this patch to complete this optimization.

Differential Revision: http://reviews.llvm.org/D8486

llvm-svn: 232852
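For illustration only (not part of the commit): a minimal C++ sketch of the equivalence this fold exploits, using the _mm256_permute2f128_ps intrinsic that front-ends lower to llvm.x86.avx.vperm2f128.ps.256. The chosen control byte 0x20 and the printed values are assumptions for this example, and it needs an AVX-capable host (e.g. compiled with -mavx).

// Sketch only: with a constant control byte and no zero bits set,
// vperm2f128 selects one 128-bit half from each source, i.e. it is an
// ordinary element shuffle.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
  __m256 b = _mm256_setr_ps(8, 9, 10, 11, 12, 13, 14, 15);

  // Control byte 0x20: bits [1:0] = 0 pick a's low half for the low half
  // of the result, bits [5:4] = 2 pick b's low half for the high half.
  __m256 v = _mm256_permute2f128_ps(a, b, 0x20);

  // Equivalent shufflevector mask over the concatenation of a and b:
  // <0, 1, 2, 3, 8, 9, 10, 11>
  float out[8];
  _mm256_storeu_ps(out, v);
  for (float f : out)
    std::printf("%g ", f);   // prints: 0 1 2 3 8 9 10 11
  std::printf("\n");
  return 0;
}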
Diffstat (limited to 'llvm/lib/Transforms')
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp  59
1 file changed, 59 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 00d92c873bd..b59c9f5d910 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -197,6 +197,57 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
   return nullptr;
 }
+/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
+/// source vectors, unless a zero bit is set. If a zero bit is set,
+/// then ignore that half of the mask and clear that half of the vector.
+static Value *SimplifyX86vperm2(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+    VectorType *VecTy = cast<VectorType>(II.getType());
+    uint8_t Imm = CInt->getZExtValue();
+
+    // The immediate permute control byte looks like this:
+    //    [1:0] - select 128 bits from sources for low half of destination
+    //    [2]   - ignore
+    //    [3]   - zero low half of destination
+    //    [5:4] - select 128 bits from sources for high half of destination
+    //    [6]   - ignore
+    //    [7]   - zero high half of destination
+
+    if ((Imm & 0x88) == 0x88) {
+      // If both zero mask bits are set, this was just a weird way to
+      // generate a zero vector.
+      return ConstantAggregateZero::get(VecTy);
+    }
+
+    // TODO: If a single zero bit is set, replace one of the source operands
+    // with a zero vector and use the same mask generation logic as below.
+
+    if ((Imm & 0x88) == 0x00) {
+      // If neither zero mask bit is set, this is a simple shuffle.
+      unsigned NumElts = VecTy->getNumElements();
+      unsigned HalfSize = NumElts / 2;
+      unsigned HalfBegin;
+      SmallVector<int, 8> ShuffleMask(NumElts);
+
+      // Permute low half of result.
+      HalfBegin = (Imm & 0x3) * HalfSize;
+      for (unsigned i = 0; i != HalfSize; ++i)
+        ShuffleMask[i] = HalfBegin + i;
+
+      // Permute high half of result.
+      HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
+      for (unsigned i = HalfSize; i != NumElts; ++i)
+        ShuffleMask[i] = HalfBegin + i - HalfSize;
+
+      Value *Op0 = II.getArgOperand(0);
+      Value *Op1 = II.getArgOperand(1);
+      return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
+    }
+  }
+  return nullptr;
+}
+
 /// visitCallInst - CallInst simplification. This mostly only handles folding
 /// of intrinsic instructions. For normal calls, it allows visitCallSite to do
 /// the heavy lifting.
@@ -904,6 +955,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     return ReplaceInstUsesWith(CI, Shuffle);
   }
+  case Intrinsic::x86_avx_vperm2f128_pd_256:
+  case Intrinsic::x86_avx_vperm2f128_ps_256:
+  case Intrinsic::x86_avx_vperm2f128_si_256:
+    // TODO: Add the AVX2 version of this instruction.
+    if (Value *V = SimplifyX86vperm2(*II, *Builder))
+      return ReplaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::ppc_altivec_vperm:
     // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
     // Note that ppc_altivec_vperm has a big-endian bias, so when creating
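As a reading aid, here is a small standalone sketch (not part of the patch) of the control-byte decoding that SimplifyX86vperm2 performs. The helper name vperm2Mask and the sample immediate 0x31 are made up for illustration, and any immediate with a zero bit set is simply rejected here, covering both the zero-vector fold and the single-zero-bit case the TODO above leaves open.

// Standalone sketch of the vperm2*128 mask generation, for an 8-element
// (ps_256) vector; compiles as ordinary host C++.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the shufflevector indices for a vperm2*128 immediate, or an empty
// vector when either zero bit (0x08 or 0x80) is set.
static std::vector<int> vperm2Mask(uint8_t Imm, unsigned NumElts) {
  if (Imm & 0x88)
    return {};
  unsigned HalfSize = NumElts / 2;
  std::vector<int> Mask(NumElts);
  unsigned LoBegin = (Imm & 0x3) * HalfSize;        // source half for low result half
  unsigned HiBegin = ((Imm >> 4) & 0x3) * HalfSize; // source half for high result half
  for (unsigned i = 0; i != HalfSize; ++i) {
    Mask[i] = LoBegin + i;
    Mask[HalfSize + i] = HiBegin + i;
  }
  return Mask;
}

int main() {
  // Imm = 0x31: low result half <- operand 0's high half,
  //             high result half <- operand 1's high half.
  for (int Idx : vperm2Mask(0x31, 8))
    std::printf("%d ", Idx); // prints: 4 5 6 7 12 13 14 15
  std::printf("\n");
  return 0;
}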