diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-13 15:10:43 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-13 15:10:43 +0000 |
commit | 48d83407602b15f80310bba0bdf5731ed194e092 (patch) | |
tree | 39b37e1cf162524943af11f168ef0f7e189a87a6 | |
parent | bd69903692dd3e919f7b3f4b1b4cf70ca2c1f376 (diff) | |
download | bcm5719-llvm-48d83407602b15f80310bba0bdf5731ed194e092.tar.gz bcm5719-llvm-48d83407602b15f80310bba0bdf5731ed194e092.zip |
[X86][AVX] Add support for target shuffle combining to VPERMILPS variable shuffle mask
Added AVX512F VPERMILPS shuffle decoding support
llvm-svn: 275270
4 files changed, 59 insertions, 4 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c779414cd0f..990671b9159 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3829,6 +3829,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; case X86ISD::PSHUFB: + case X86ISD::VPERMILPV: return true; } } @@ -25211,13 +25212,42 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, if (Depth < 2) return false; + if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) + return false; + + bool MaskContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + + // If we have a single input shuffle with different shuffle patterns in the + // 128-bit lanes use the variable mask to VPERMILPS. + // TODO: Combine other mask types at higher depths. + if (HasVariableMask && !MaskContainsZeros && + ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || + (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { + SmallVector<SDValue, 16> VPermIdx; + for (int M : Mask) { + SDValue Idx = + M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); + VPermIdx.push_back(Idx); + } + MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); + SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replaces 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. 
if ((Depth >= 3 || HasVariableMask) && - !is128BitLaneCrossingShuffleMask(MaskVT, Mask) && ((VT.is128BitVector() && Subtarget.hasSSSE3()) || (VT.is256BitVector() && Subtarget.hasAVX2()) || (VT.is512BitVector() && Subtarget.hasBWI()))) { diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 8c700e3f5cd..1adc92cfda6 100644 --- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -104,9 +104,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + if (ElSize != 32 && ElSize != 64) + return; - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512) return; // Only support vector types. 
@@ -126,7 +128,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, return; unsigned NumElements = MaskTySize / ElSize; - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + assert((NumElements == 2 || NumElements == 4 || NumElements == 8 || + NumElements == 16) && "Unexpected number of vector elements."); ShuffleMask.reserve(NumElements); unsigned NumElementsPerLane = 128 / ElSize; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index c1e60a73dbf..351c91e7acf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -91,6 +91,16 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) { ret <8 x float> %2 } +define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) { +; ALL-LABEL: combine_vpermilvar_8f32_10326u4u: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u] +; ALL-NEXT: retq + %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>) + %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>) + ret <8 x float> %2 +} + define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32: ; ALL: # BB#0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index ddb83c60430..6d348bd39d6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -3,6 +3,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, 
i16) + declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) @@ -437,6 +439,16 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x ret <8 x double> %1 } +define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) { +; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1) + %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1) + ret <16 x float> %res1 +} + define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_as_pslldq: ; CHECK: # BB#0: |