Diffstat (limited to 'llvm')
 llvm/lib/Target/X86/X86ISelLowering.cpp | 34
 llvm/test/CodeGen/X86/avx512-vselect.ll | 61
 2 files changed, 90 insertions, 5 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9cac59b16d7..607ba9bc689 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1381,7 +1381,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-    setOperationAction(ISD::VSELECT, VT, Legal);
+    setOperationAction(ISD::VSELECT, VT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
@@ -1445,8 +1445,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
     setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1479,7 +1477,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::ABS, VT, Legal);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SHL, VT, Custom);
@@ -13817,6 +13815,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
+  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+  // with patterns on the mask registers on AVX-512.
+  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+    return Op;
+
   // Try to lower this to a blend-style vector shuffle. This can handle all
   // constant condition cases.
   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13826,10 +13829,31 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   if (!Subtarget.hasSSE41())
     return SDValue();
 
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+  // into an i1 condition so that we can use the mask-based 512-bit blend
+  // instructions.
+  if (VT.getSizeInBits() == 512) {
+    SDValue Cond = Op.getOperand(0);
+    // The vNi1 condition case should be handled above as it can be trivially
+    // lowered.
+    assert(Cond.getValueType().getScalarSizeInBits() ==
+               VT.getScalarSizeInBits() &&
+           "Should have a size-matched integer condition!");
+    // Build a mask by testing the condition against itself (tests for zero).
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+    // Now return a new VSELECT using the mask.
+    return DAG.getNode(ISD::VSELECT, dl, VT, Mask, Op.getOperand(1),
+                       Op.getOperand(2));
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, but if we need to expand, return
   // a null value.
-  switch (Op.getSimpleValueType().SimpleTy) {
+  switch (VT.SimpleTy) {
   default:
     // Most of the vector types have blends past SSE4.1.
     return Op;
diff --git a/llvm/test/CodeGen/X86/avx512-vselect.ll b/llvm/test/CodeGen/X86/avx512-vselect.ll
new file mode 100644
index 00000000000..1940864824f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-vselect.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpsllq $63, %zmm0, %zmm0
+; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; CHECK-NEXT:    vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %m.trunc = trunc <8 x i64> %m to <8 x i1>
+  %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
+  ret <8 x i64> %ret
+}
+
+; This is a very contrived test case to trick the legalizer into splitting the
+; v16i1 masks in the select during type legalization, and in so doing extend them
+; into two v8i64 types. This lets us ensure that the lowering code can handle
+; both formulations of vselect. All of this trickery is because we can't
+; directly form an SDAG input to the lowering.
+define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
+; CHECK-SKX-LABEL: test2:
+; CHECK-SKX:       # BB#0: # %entry
+; CHECK-SKX-NEXT:    vxorps %zmm6, %zmm6, %zmm6
+; CHECK-SKX-NEXT:    vcmpltps %zmm0, %zmm6, %k0
+; CHECK-SKX-NEXT:    vcmpltps %zmm6, %zmm1, %k1
+; CHECK-SKX-NEXT:    korw %k1, %k0, %k0
+; CHECK-SKX-NEXT:    kshiftrw $8, %k0, %k1
+; CHECK-SKX-NEXT:    vpmovm2q %k1, %zmm1
+; CHECK-SKX-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-SKX-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; CHECK-SKX-NEXT:    vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-SKX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-SKX-NEXT:    vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-SKX-NEXT:    retq
+;
+; CHECK-KNL-LABEL: test2:
+; CHECK-KNL:       # BB#0: # %entry
+; CHECK-KNL-NEXT:    vpxord %zmm6, %zmm6, %zmm6
+; CHECK-KNL-NEXT:    vcmpltps %zmm0, %zmm6, %k0
+; CHECK-KNL-NEXT:    vcmpltps %zmm6, %zmm1, %k1
+; CHECK-KNL-NEXT:    korw %k1, %k0, %k1
+; CHECK-KNL-NEXT:    kshiftrw $8, %k1, %k2
+; CHECK-KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; CHECK-KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT:    vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-KNL-NEXT:    vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-KNL-NEXT:    retq
+entry:
+  %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
+  %lt.m = fcmp olt <16 x float> %y, zeroinitializer
+  %m.or = or <16 x i1> %gt.m, %lt.m
+  %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %ret
+}
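
A note on the TESTM trick used by the new 512-bit path: X86 uses zero-or-all-ones
vector booleans, so a non-i1 VSELECT condition can be collapsed into a k-register
mask by testing each lane for non-zero, and X86ISD::TESTM of the condition against
itself computes exactly that ((x & x) != 0 is x != 0 per lane). The IR below is a
minimal sketch, not part of the patch, of the mask-register form this lowering
targets; the function name is invented for the example.

; Sketch only: IR whose lowering should exercise the same vptestm + vpblendm
; pattern the patch produces. Lanes of %cond are assumed to be all-zeros or
; all-ones, matching X86's vector-boolean convention.
target triple = "x86_64-unknown-unknown"

define <8 x i64> @sketch_wide_cond_blend(<8 x i64> %cond, <8 x i64> %a, <8 x i64> %b) {
entry:
  ; "Is this lane non-zero?" -- on AVX-512 this should select vptestmq.
  %mask = icmp ne <8 x i64> %cond, zeroinitializer
  ; With the mask in a k-register, the select matches vpblendmq directly.
  %blend = select <8 x i1> %mask, <8 x i64> %a, <8 x i64> %b
  ret <8 x i64> %blend
}

Running this through llc with -mcpu=skx (as in the RUN lines above) should yield
the same vptestmq/vpblendmq pair that test1 checks for.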