-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp      |  16
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp      |  11
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h        |   3
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |   6
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td           | 132
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h      |   8
6 files changed, 87 insertions, 89 deletions
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5ac153244df..a08030bb885 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3381,13 +3381,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
- case X86ISD::BLENDV: {
- // BLENDV selects like a regular VSELECT.
- SDValue VSelect = CurDAG->getNode(
- ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+ case ISD::VSELECT: {
+ // Replace VSELECT with non-mask conditions with BLENDV.
+ if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv = CurDAG->getNode(
+ X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
- ReplaceNode(Node, VSelect.getNode());
- SelectCode(VSelect.getNode());
+ ReplaceNode(Node, Blendv.getNode());
+ SelectCode(Blendv.getNode());
// We already called ReplaceUses.
return;
}
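
Note on the new ISD::VSELECT case: selects whose condition is an i1 mask vector stay on the generic path, while wide-mask selects are rewritten to X86ISD::BLENDV, a node whose semantics use only the sign bit of each condition element. A minimal scalar model of one BLENDV lane (the function name blendv_lane is illustrative, not from the patch):

```cpp
#include <cstdint>

// One lane of X86ISD::BLENDV: only the sign bit (MSB) of the mask element
// is tested, unlike a generic VSELECT whose i1 condition is a full boolean.
static inline uint32_t blendv_lane(uint32_t mask, uint32_t t, uint32_t f) {
  return (mask & 0x80000000u) ? t : f; // sign bit set -> take the "true" side
}
```
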
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ece273302e6..504d42ce6a5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21840,6 +21840,17 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case BLENDV: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+
+ EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+ Src3 = DAG.getBitcast(MaskVT, Src3);
+
+ // Reverse the operands to match VSELECT order.
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+ }
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
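
The new BLENDV intrinsic case reverses the operands because the C-level intrinsics take (false-side, true-side, mask) while the node uses VSELECT order (MASK, TRUE, FALSE). A small sketch against the public SSE4.1 intrinsic, assuming a build with -msse4.1:

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 a    = _mm_set_ps(1.f, 2.f, 3.f, 4.f);   // intrinsic src1, the "false" side
  __m128 b    = _mm_set_ps(5.f, 6.f, 7.f, 8.f);   // intrinsic src2, the "true" side
  __m128 mask = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); // lane sign bits 1,0,1,0 (high to low)
  // _mm_blendv_ps(a, b, mask): per lane, mask sign set ? b : a,
  // i.e. BLENDV(mask, b, a) once rewritten into (MASK, TRUE, FALSE) order.
  __m128 r = _mm_blendv_ps(a, b, mask);
  float out[4];
  _mm_storeu_ps(out, r);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 4 7 2 5
  return 0;
}
```
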
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 910acd80e8b..d2be32a8622 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -204,7 +204,8 @@ namespace llvm {
/// Dynamic (non-constant condition) vector blend where only the sign bits
/// of the condition elements are used. This is used to enforce that the
/// condition mask is not valid for generic VSELECT optimizations. This
- /// can also be used to implement the intrinsics.
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
BLENDV,
/// Combined add and sub on an FP vector.
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11a27ba9058..bbf2b92bf37 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -448,6 +448,12 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<2, 3>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameSizeAs<0, 1>]>>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
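
The SDTypeProfile constrains the mask operand to an integer vector with the same element count and total width as the result, which is why the lowering above bitcasts the FP intrinsics' mask (Src3) before building the node. A one-lane sketch of that reinterpretation, assuming C++20 std::bit_cast:

```cpp
#include <bit>
#include <cstdint>

// One-lane model of the DAG.getBitcast(MaskVT, Src3) step: a float mask
// lane is reinterpreted as i32 bits, preserving the sign bit BLENDV tests.
static inline int32_t mask_lane_as_int(float m) {
  return std::bit_cast<int32_t>(m);
}
```
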
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e2bcd18ce66..58aac7951b5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6582,16 +6582,16 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
VR128:$src2, sub_xmm), 0xf)>;
}
-/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
- RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_frag, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag mem_frag, SDNode OpNode,
+ X86FoldableSchedWrite sched> {
def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+ [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched]>;
@@ -6600,8 +6600,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (IntId RC:$src1, (mem_frag addr:$src2),
- RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+ (OpNode RC:$src3, (mem_frag addr:$src2),
+ RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6612,68 +6612,47 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
- load, int_x86_sse41_blendvpd,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- loadv4f64, int_x86_avx_blendv_pd_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+ v2f64, loadv2f64, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+ v4f64, loadv4f64, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
- load, int_x86_sse41_blendvps,
- SchedWriteFVarBlend.XMM>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- loadv8f32, int_x86_avx_blendv_ps_256,
- SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+ v4f32, loadv4f32, X86Blendv,
+ SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+ v8f32, loadv8f32, X86Blendv,
+ SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
-defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
- load, int_x86_sse41_pblendvb,
- SchedWriteVarBlend.XMM>;
+defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+ v16i8, loadv16i8, X86Blendv,
+ SchedWriteVarBlend.XMM>;
}
let Predicates = [HasAVX2] in {
-defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- load, int_x86_avx2_pblendvb,
- SchedWriteVarBlend.YMM>, VEX_L;
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+ v32i8, loadv32i8, X86Blendv,
+ SchedWriteVarBlend.YMM>, VEX_L;
}
let Predicates = [HasAVX] in {
- def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
- def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
- (v8i32 VR256:$src2))),
- (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
- (v8f32 VR256:$src2))),
+ def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
(VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
- (v4i64 VR256:$src2))),
- (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
- def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
- (v4f64 VR256:$src2))),
+ def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
(VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
-let Predicates = [HasAVX2] in {
- def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
- (v32i8 VR256:$src2))),
- (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-}
-
// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
@@ -6747,16 +6726,17 @@ let Predicates = [UseSSE41, OptForSpeed] in {
}
-/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
- multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- X86MemOperand x86memop, Intrinsic IntId,
- X86FoldableSchedWrite sched> {
+ multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ SDNode OpNode, X86FoldableSchedWrite sched> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ [(set VR128:$dst,
+ (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
Sched<[sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
@@ -6764,20 +6744,19 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr,
"\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
- (IntId VR128:$src1,
- (mem_frag addr:$src2), XMM0))]>,
+ (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
- int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
- int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
- int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+ X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+ X86Blendv, SchedWriteVarBlend.XMM>;
// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6794,20 +6773,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
let Predicates = [UseSSE41] in {
- def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
- (v16i8 VR128:$src2))),
- (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
- (v4i32 VR128:$src2))),
+ def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
(BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
- (v4f32 VR128:$src2))),
- (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
- (v2i64 VR128:$src2))),
- (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
- def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
- (v2f64 VR128:$src2))),
+ def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
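
The remaining SSE patterns match the literal register XMM0 because the legacy (non-VEX) BLENDVPS/BLENDVPD/PBLENDVB encodings take the mask implicitly in xmm0; only the VEX forms name all four operands explicitly. A usage sketch (the function name select_ps is illustrative only):

```cpp
#include <immintrin.h>

// Built with something like `clang -O2 -msse4.1 -mno-avx`, the mask operand
// must end up register-allocated to xmm0, since legacy blendvps encodes it
// implicitly; with AVX enabled, vblendvps encodes the mask as a normal operand.
__m128 select_ps(__m128 f, __m128 t, __m128 mask) {
  return _mm_blendv_ps(f, t, mask);
}
```
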
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 151e1b9136c..37badd85580 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
INTR_TYPE_3OP_IMM8,
- CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
@@ -340,6 +340,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
@@ -369,6 +371,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -1156,8 +1159,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+ X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+ X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),