author     Simon Pilgrim <llvm-dev@redking.me.uk>  2016-07-13 15:10:43 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>  2016-07-13 15:10:43 +0000
commit     48d83407602b15f80310bba0bdf5731ed194e092 (patch)
tree       39b37e1cf162524943af11f168ef0f7e189a87a6
parent     bd69903692dd3e919f7b3f4b1b4cf70ca2c1f376 (diff)
[X86][AVX] Add support for target shuffle combining to VPERMILPS variable shuffle mask

Added AVX512F VPERMILPS shuffle decoding support.

llvm-svn: 275270
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                    | 32
-rw-r--r--  llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp       |  9
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll      | 10
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll | 12
4 files changed, 59 insertions(+), 4 deletions(-)
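For context, VPERMILPS with a variable mask permutes each 128-bit lane of the source independently, using only the low two bits of each 32-bit index element; that is why the combine below bails out on lane-crossing masks. A minimal standalone sketch of that semantics using the corresponding AVX intrinsic (illustrative only, not part of the patch; assumes an AVX-capable compiler):

    // Demo of variable-mask VPERMILPS (vpermilvar.ps.256): each result element
    // i takes src[lane_base(i) + (idx[i] & 3)], so elements never cross a
    // 128-bit lane boundary.
    #include <cstdio>
    #include <immintrin.h>

    int main() {
      __m256 Src = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
      __m256i Idx = _mm256_setr_epi32(3, 2, 1, 0, 0, 1, 2, 3);
      __m256 Res = _mm256_permutevar_ps(Src, Idx);

      float Out[8];
      _mm256_storeu_ps(Out, Res);
      for (float F : Out)
        std::printf("%g ", F); // prints: 3 2 1 0 4 5 6 7
      std::printf("\n");
      return 0;
    }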
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c779414cd0f..990671b9159 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3829,6 +3829,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
case X86ISD::PSHUFB:
+ case X86ISD::VPERMILPV:
return true;
}
}
@@ -25211,13 +25212,42 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
if (Depth < 2)
return false;
+ if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
+ return false;
+
+ bool MaskContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+ // If we have a single input shuffle with different shuffle patterns in the
+ // 128-bit lanes, lower it to VPERMILPS with a variable shuffle mask.
+ // TODO: Combine other mask types at higher depths.
+ if (HasVariableMask && !MaskContainsZeros &&
+ ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+ SmallVector<SDValue, 16> VPermIdx;
+ for (int M : Mask) {
+ SDValue Idx =
+ M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+ VPermIdx.push_back(Idx);
+ }
+ MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+ SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ DCI.AddToWorklist(VPermMask.getNode());
+ Res = DAG.getBitcast(MaskVT, Input);
+ DCI.AddToWorklist(Res.getNode());
+ Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if ((Depth >= 3 || HasVariableMask) &&
- !is128BitLaneCrossingShuffleMask(MaskVT, Mask) &&
((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && Subtarget.hasAVX2()) ||
(VT.is512BitVector() && Subtarget.hasBWI()))) {
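Because VPERMILPS can only select within each 128-bit lane, the new combine builds its index vector by reducing every in-range mask element modulo the four floats per lane (the "M % 4" above) and leaving undef elements undefined. A simplified sketch of just that index arithmetic, outside the SelectionDAG machinery (illustrative, not LLVM code):

    // Derive a VPERMILPS index vector from a combined v8f32 shuffle mask:
    // in-range elements are reduced modulo the 4 floats per 128-bit lane,
    // negative (undef/sentinel) elements stay -1.
    #include <cstdio>
    #include <vector>

    std::vector<int> buildVPermilIdx(const std::vector<int> &Mask) {
      std::vector<int> Idx;
      for (int M : Mask)
        Idx.push_back(M < 0 ? -1 : M % 4);
      return Idx;
    }

    int main() {
      // Combined mask from the new AVX test below: [1,0,3,2,6,u,4,u].
      std::vector<int> Mask = {1, 0, 3, 2, 6, -1, 4, -1};
      for (int I : buildVPermilIdx(Mask))
        std::printf("%d ", I); // prints: 1 0 3 2 2 -1 0 -1
      std::printf("\n");
      return 0;
    }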
diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 8c700e3f5cd..1adc92cfda6 100644
--- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -104,9 +104,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
- unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ if (ElSize != 32 && ElSize != 64)
+ return;
- if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512)
return;
// Only support vector types.
@@ -126,7 +128,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
return;
unsigned NumElements = MaskTySize / ElSize;
- assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8 ||
+ NumElements == 16) &&
"Unexpected number of vector elements.");
ShuffleMask.reserve(NumElements);
unsigned NumElementsPerLane = 128 / ElSize;
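With the checks above relaxed, the decoder now accepts 512-bit constant masks for both 32-bit (VPERMILPS) and 64-bit (VPERMILPD) elements. Each raw mask element selects only within its own 128-bit lane, so the decoded index is the lane base plus a selector: the low two bits for 32-bit elements, or bit 1 for 64-bit elements. A simplified standalone sketch of that arithmetic (the real DecodeVPERMILPMask reads the values out of a constant-pool Constant):

    // Simplified VPERMILP constant-mask decoding: decoded index =
    // 128-bit lane base + per-lane selector taken from the raw element.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    std::vector<int> decodeVPermilMask(const std::vector<uint64_t> &Raw,
                                       unsigned ElSize /* 32 or 64 */) {
      unsigned NumEltsPerLane = 128 / ElSize;
      std::vector<int> ShuffleMask;
      for (unsigned I = 0, E = Raw.size(); I != E; ++I) {
        unsigned LaneBase = I & ~(NumEltsPerLane - 1);
        unsigned Sel =
            unsigned(ElSize == 64 ? ((Raw[I] >> 1) & 0x1) : (Raw[I] & 0x3));
        ShuffleMask.push_back(LaneBase + Sel);
      }
      return ShuffleMask;
    }

    int main() {
      // A 16 x i32 (512-bit) ps mask, the case newly allowed above.
      std::vector<uint64_t> Raw = {2, 3, 0, 1, 3, 1, 0, 2,
                                   3, 0, 2, 1, 1, 2, 0, 3};
      for (int M : decodeVPermilMask(Raw, 32))
        std::printf("%d ", M); // 2 3 0 1 7 5 4 6 11 8 10 9 13 14 12 15
      std::printf("\n");
      return 0;
    }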
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index c1e60a73dbf..351c91e7acf 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -91,6 +91,16 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
ret <8 x float> %2
}
+define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_10326u4u:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
+ ret <8 x float> %2
+}
+
define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32:
; ALL: # BB#0:
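The combine_vpermilvar_8f32_10326u4u test added above chains two variable vpermilvar shuffles, which the combiner now folds into the single [1,0,3,2,6,u,4,u] permute in the CHECK line. That expected pattern can be cross-checked by composing the two per-lane index vectors by hand; a small sketch (indices rebased to whole-vector positions, -1 for undef):

    // Hand-check of the expected mask: applying the second vpermilvar mask on
    // top of the first yields [1,0,3,2,6,u,4,u], matching the CHECK line.
    #include <cstdio>

    int main() {
      // Per-lane indices rebased to whole-vector element positions.
      int First[8]  = {3, 2, 1, 0, 4, 5, 6, -1}; // <3,2,1,0, 0,1,2,undef>
      int Second[8] = {2, 3, 0, 1, 6, 7, 4, -1}; // <2,3,0,1, 2,3,0,undef>
      for (int I = 0; I != 8; ++I) {
        int M = Second[I] < 0 ? -1 : First[Second[I]];
        std::printf("%d ", M); // prints: 1 0 3 2 6 -1 4 -1
      }
      std::printf("\n");
      return 0;
    }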
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index ddb83c60430..6d348bd39d6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -3,6 +3,8 @@
declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
@@ -437,6 +439,16 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
ret <8 x double> %1
}
+define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
+; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
+ ret <16 x float> %res1
+}
+
define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK: # BB#0: