diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-13 15:10:43 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-13 15:10:43 +0000 |
commit | 48d83407602b15f80310bba0bdf5731ed194e092 (patch) | |
tree | 39b37e1cf162524943af11f168ef0f7e189a87a6 | |
parent | bd69903692dd3e919f7b3f4b1b4cf70ca2c1f376 (diff) | |
download | bcm5719-llvm-48d83407602b15f80310bba0bdf5731ed194e092.tar.gz bcm5719-llvm-48d83407602b15f80310bba0bdf5731ed194e092.zip |
[X86][AVX] Add support for target shuffle combining to VPERMILPS variable shuffle mask
Added AVX512F VPERMILPS shuffle decoding support
llvm-svn: 275270
4 files changed, 59 insertions, 4 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c779414cd0f..990671b9159 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3829,6 +3829,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; case X86ISD::PSHUFB: + case X86ISD::VPERMILPV: return true; } } @@ -25211,13 +25212,42 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root, if (Depth < 2) return false; + if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) + return false; + + bool MaskContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + + // If we have a single input shuffle with different shuffle patterns in the + // 128-bit lanes use the variable mask to VPERMILPS. + // TODO: Combine other mask types at higher depths. + if (HasVariableMask && !MaskContainsZeros && + ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || + (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { + SmallVector<SDValue, 16> VPermIdx; + for (int M : Mask) { + SDValue Idx = + M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); + VPermIdx.push_back(Idx); + } + MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); + SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + DCI.AddToWorklist(VPermMask.getNode()); + Res = DAG.getBitcast(MaskVT, Input); + DCI.AddToWorklist(Res.getNode()); + Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replaces 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. 
if ((Depth >= 3 || HasVariableMask) && - !is128BitLaneCrossingShuffleMask(MaskVT, Mask) && ((VT.is128BitVector() && Subtarget.hasSSSE3()) || (VT.is256BitVector() && Subtarget.hasAVX2()) || (VT.is512BitVector() && Subtarget.hasBWI()))) { diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 8c700e3f5cd..1adc92cfda6 100644 --- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -104,9 +104,11 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> - unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + if (ElSize != 32 && ElSize != 64) + return; - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + if (MaskTySize != 128 && MaskTySize != 256 && MaskTySize != 512) return; // Only support vector types. 
@@ -126,7 +128,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, return; unsigned NumElements = MaskTySize / ElSize; - assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + assert((NumElements == 2 || NumElements == 4 || NumElements == 8 || + NumElements == 16) && "Unexpected number of vector elements."); ShuffleMask.reserve(NumElements); unsigned NumElementsPerLane = 128 / ElSize; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index c1e60a73dbf..351c91e7acf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -91,6 +91,16 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) { ret <8 x float> %2 } +define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) { +; ALL-LABEL: combine_vpermilvar_8f32_10326u4u: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u] +; ALL-NEXT: retq + %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>) + %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>) + ret <8 x float> %2 +} + define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32: ; ALL: # BB#0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index ddb83c60430..6d348bd39d6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -3,6 +3,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, 
i16) + declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) @@ -437,6 +439,16 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x ret <8 x double> %1 } +define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) { +; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1) + %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1) + ret <16 x float> %res1 +} + define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_as_pslldq: ; CHECK: # BB#0: |