Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 121
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td      |  83
2 files changed, 60 insertions, 144 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 15eab9e261c..6ad5503b8a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1871,7 +1871,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
-  setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -10482,24 +10481,45 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     V2 = getZeroVector(VT, Subtarget, DAG, DL);

   switch (VT.SimpleTy) {
+  case MVT::v2f64:
+  case MVT::v4f32:
+  case MVT::v4f64:
+  case MVT::v8f32:
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, DL, MVT::i8));
   case MVT::v4i64:
   case MVT::v8i32:
     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
     LLVM_FALLTHROUGH;
-  case MVT::v4f64:
-  case MVT::v8f32:
-    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
-    LLVM_FALLTHROUGH;
-  case MVT::v2f64:
   case MVT::v2i64:
-  case MVT::v4f32:
   case MVT::v4i32:
-  case MVT::v8i16:
-    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, DL, MVT::i8));
+    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+    // that instruction.
+    if (Subtarget.hasAVX2()) {
+      // Scale the blend by the number of 32-bit dwords per element.
+      int Scale = VT.getScalarSizeInBits() / 32;
+      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+      V1 = DAG.getBitcast(BlendVT, V1);
+      V2 = DAG.getBitcast(BlendVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                          DAG.getConstant(BlendMask, DL, MVT::i8)));
+    }
+    LLVM_FALLTHROUGH;
+  case MVT::v8i16: {
+    // For integer shuffles we need to expand the mask and cast the inputs to
+    // v8i16s prior to blending.
+    int Scale = 8 / VT.getVectorNumElements();
+    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = DAG.getBitcast(MVT::v8i16, V2);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
+  }
   case MVT::v16i16: {
-    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
+    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
     SmallVector<int, 8> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10527,11 +10547,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     }
     LLVM_FALLTHROUGH;
   }
-  case MVT::v32i8:
-    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
-    LLVM_FALLTHROUGH;
-  case MVT::v16i8: {
-    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
+  case MVT::v16i8:
+  case MVT::v32i8: {
+    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
+           "256-bit byte-blends require AVX2 support!");

     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
@@ -31023,11 +31042,34 @@ static bool matchBinaryPermuteShuffle(
         return true;
       }
     } else {
+      // Determine a type compatible with X86ISD::BLENDI.
+      ShuffleVT = MaskVT;
+      if (Subtarget.hasAVX2()) {
+        if (ShuffleVT == MVT::v4i64)
+          ShuffleVT = MVT::v8i32;
+        else if (ShuffleVT == MVT::v2i64)
+          ShuffleVT = MVT::v4i32;
+      } else {
+        if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+          ShuffleVT = MVT::v8i16;
+        else if (ShuffleVT == MVT::v4i64)
+          ShuffleVT = MVT::v4f64;
+        else if (ShuffleVT == MVT::v8i32)
+          ShuffleVT = MVT::v8f32;
+      }
+
+      if (!ShuffleVT.isFloatingPoint()) {
+        int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+        BlendMask =
+            scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+        ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+        ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+      }
+
       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
       PermuteImm = (unsigned)BlendMask;
       Shuffle = X86ISD::BLENDI;
-      ShuffleVT = MaskVT;
       return true;
     }
   }
@@ -32184,29 +32226,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,

     return SDValue();
   }
-  case X86ISD::BLENDI: {
-    SDValue N0 = N.getOperand(0);
-    SDValue N1 = N.getOperand(1);
-
-    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
-    // TODO: Handle MVT::v16i16 repeated blend mask.
-    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
-        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
-      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
-      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
-          SrcVT.getScalarSizeInBits() >= 32) {
-        unsigned Mask = N.getConstantOperandVal(2);
-        unsigned Size = VT.getVectorNumElements();
-        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
-        unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
-        return DAG.getBitcast(
-            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
-                            N1.getOperand(0),
-                            DAG.getConstant(ScaleMask, DL, MVT::i8)));
-      }
-    }
-    return SDValue();
-  }
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
@@ -42127,25 +42146,6 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }

-static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG) {
-  // Disabling for widening legalization for now. We can enable if we find a
-  // case that needs it. Otherwise it can be deleted when we switch to
-  // widening legalization.
-  if (ExperimentalVectorWideningLegalization)
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-  SDValue In = N->getOperand(0);
-
-  // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (In.getOpcode() == N->getOpcode() &&
-      TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
-    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
-
-  return SDValue();
-}
-
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -42207,7 +42207,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
-  case ISD::ANY_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG);
   case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
   case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
   case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e61666781b0..c37f1227438 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6507,40 +6507,6 @@ let Predicates = [HasAVX2] in {
                                   VEX_4V, VEX_L, VEX_WIG;
 }

-// Emulate vXi32/vXi64 blends with vXf32/vXf64.
-// ExecutionDomainFixPass will cleanup domains later on.
-let Predicates = [HasAVX] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
-          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
-          (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
-          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
-          (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-}
-
-let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
-          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
-          (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
-          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
-          (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, VR128,
                                memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6551,22 +6517,6 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, VR128,
                                memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;

-let Predicates = [UseSSE41] in {
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
-          (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
-          (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
 // For insertion into the zero index (low half) of a 256-bit vector, it is
 // more efficient to generate a blend with immediate instead of an insert*128.
 let Predicates = [HasAVX] in {
@@ -6578,13 +6528,6 @@
 def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
           (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                                      VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
-          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
-def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
-          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }

 /// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
@@ -7838,19 +7781,6 @@
 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
           (VPBLENDDYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                                      VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
-          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
-          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
-          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
-          (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }

 let Predicates = [HasAVX1Only] in {
@@ -7870,19 +7800,6 @@
 def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
           (VBLENDPSYrri VR256:$src1, (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                                      VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
-          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
-          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
-          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
-          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
-                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
 }

 //===----------------------------------------------------------------------===//

