Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 100
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td      |  83
2 files changed, 123 insertions, 60 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa9fd6bc03a..f90a2478306 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10482,45 +10482,24 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     V2 = getZeroVector(VT, Subtarget, DAG, DL);
 
   switch (VT.SimpleTy) {
-  case MVT::v2f64:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v8f32:
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, DL, MVT::i8));
   case MVT::v4i64:
   case MVT::v8i32:
     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
     LLVM_FALLTHROUGH;
+  case MVT::v4f64:
+  case MVT::v8f32:
+    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+    LLVM_FALLTHROUGH;
+  case MVT::v2f64:
   case MVT::v2i64:
+  case MVT::v4f32:
   case MVT::v4i32:
-    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
-    // that instruction.
-    if (Subtarget.hasAVX2()) {
-      // Scale the blend by the number of 32-bit dwords per element.
-      int Scale = VT.getScalarSizeInBits() / 32;
-      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
-      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getBitcast(BlendVT, V1);
-      V2 = DAG.getBitcast(BlendVT, V2);
-      return DAG.getBitcast(
-          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                          DAG.getConstant(BlendMask, DL, MVT::i8)));
-    }
-    LLVM_FALLTHROUGH;
-  case MVT::v8i16: {
-    // For integer shuffles we need to expand the mask and cast the inputs to
-    // v8i16s prior to blending.
-    int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
-    V1 = DAG.getBitcast(MVT::v8i16, V1);
-    V2 = DAG.getBitcast(MVT::v8i16, V2);
-    return DAG.getBitcast(VT,
-                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
-  }
+  case MVT::v8i16:
+    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, DL, MVT::i8));
   case MVT::v16i16: {
-    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
     SmallVector<int, 8> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10548,10 +10527,11 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     }
     LLVM_FALLTHROUGH;
   }
-  case MVT::v16i8:
-  case MVT::v32i8: {
-    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
-           "256-bit byte-blends require AVX2 support!");
+  case MVT::v32i8:
+    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+    LLVM_FALLTHROUGH;
+  case MVT::v16i8: {
+    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
 
     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
@@ -31055,34 +31035,11 @@ static bool matchBinaryPermuteShuffle(
         return true;
       }
     } else {
-      // Determine a type compatible with X86ISD::BLENDI.
-      ShuffleVT = MaskVT;
-      if (Subtarget.hasAVX2()) {
-        if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v8i32;
-        else if (ShuffleVT == MVT::v2i64)
-          ShuffleVT = MVT::v4i32;
-      } else {
-        if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
-          ShuffleVT = MVT::v8i16;
-        else if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v4f64;
-        else if (ShuffleVT == MVT::v8i32)
-          ShuffleVT = MVT::v8f32;
-      }
-
-      if (!ShuffleVT.isFloatingPoint()) {
-        int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
-        BlendMask =
-            scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
-        ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
-        ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
-      }
-
       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
       PermuteImm = (unsigned)BlendMask;
       Shuffle = X86ISD::BLENDI;
+      ShuffleVT = MaskVT;
       return true;
     }
   }
@@ -32239,6 +32196,29 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
+  case X86ISD::BLENDI: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+
+    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+    // TODO: Handle MVT::v16i16 repeated blend mask.
+    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+          SrcVT.getScalarSizeInBits() >= 32) {
+        unsigned Mask = N.getConstantOperandVal(2);
+        unsigned Size = VT.getVectorNumElements();
+        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+        unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+        return DAG.getBitcast(
+            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+                            N1.getOperand(0),
+                            DAG.getConstant(ScaleMask, DL, MVT::i8)));
+      }
+    }
+    return SDValue();
+  }
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c37f1227438..e61666781b0 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6507,6 +6507,40 @@ let Predicates = [HasAVX2] in {
                                  VEX_4V, VEX_L, VEX_WIG;
 }
 
+// Emulate vXi32/vXi64 blends with vXf32/vXf64.
+// ExecutionDomainFixPass will cleanup domains later on.
+let Predicates = [HasAVX] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
+          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
+          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+          (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
+          (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
+          (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
+          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
+          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+          (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
+          (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
+          (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+}
+
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                VR128, memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6517,6 +6551,22 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                VR128, memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
+let Predicates = [UseSSE41] in {
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
+          (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
+          (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
+          (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
+
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
+          (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
+          (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
+          (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+}
+
 // For insertion into the zero index (low half) of a 256-bit vector, it is
 // more efficient to generate a blend with immediate instead of an insert*128.
 let Predicates = [HasAVX] in {
@@ -6528,6 +6578,13 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
           (VBLENDPSYrri VR256:$src1,
                         (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                        VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
+          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
+def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }
 
 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
@@ -7781,6 +7838,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
           (VPBLENDDYrri VR256:$src1,
                         (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                        VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
+          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
+          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
+          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
+          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }
 
 let Predicates = [HasAVX1Only] in {
@@ -7800,6 +7870,19 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
           (VBLENDPSYrri VR256:$src1,
                         (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                        VR128:$src2, sub_xmm), 0xf)>;
+
+def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
+          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }
 
 //===----------------------------------------------------------------------===//
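Note: both the rewritten lowerShuffleAsBlend switch and the new X86ISD::BLENDI combine rely on scaleVectorShuffleBlendMask to rewrite the blend immediate when the same blend is re-expressed over a different element width (for example, a v2i64 blend of bitcasts lowered as a v4f32 VBLENDPS). Below is a minimal standalone sketch of that scaling, assuming the helper simply replicates each selector bit Scale times; the function name scaleBlendMaskSketch and the test values are made up here for illustration and are not the in-tree code.

#include <cassert>
#include <cstdint>

// Hypothetical re-implementation for illustration: widen a blend immediate
// when a blend over Size wide elements is re-expressed over Size*Scale
// narrower elements. Not the LLVM helper itself.
static uint64_t scaleBlendMaskSketch(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  // Each bit i of the original mask selects one wide element; replicate it
  // across the Scale narrower elements that cover the same bytes.
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ULL << i))
      ScaledMask |= ((1ULL << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // A v2i64 blend taking element 1 from the second source (imm 0b10) becomes
  // a v4i32/v4f32 blend taking elements 2 and 3 (imm 0b1100).
  assert(scaleBlendMaskSketch(0b10, 2, 2) == 0b1100);
  // A v4i64 imm 0b0101 widens to the v8i32/v8f32 imm 0b00110011.
  assert(scaleBlendMaskSketch(0b0101, 4, 2) == 0b00110011);
  return 0;
}

The vXi32/vXi64 patterns can map onto VBLENDPS/VBLENDPD because the immediate selects whole elements at either width; as the comment in the diff notes, the execution-domain fixup pass is expected to clean up the int/float domain afterwards.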

