| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-02-19 18:05:42 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-02-19 18:05:42 +0000 |
| commit | 0b3b9424ca8ae3d5ca3b12470673a13617427651 (patch) | |
| tree | f602d28b370a855559a22c4e664faef47a6bdc31 /llvm/lib/Target | |
| parent | 3d13ed978ab88c58bd2f11f84b1025fd21b322a2 (diff) | |
[X86][SSE] Generalize X86ISD::BLENDI support to more value types
D42042 introduced the ability for the ExecutionDomainFixPass to more easily change between BLENDPD/BLENDPS/PBLENDW as the domains required.
With this ability, we can avoid most of the bitcasts/scaling in the DAG that occurred with X86ISD::BLENDI lowering/combining, blend the vXi32/vXi64 vectors directly, and use isel patterns to lower to the equivalent float vector instructions.
This helps shuffle combining and SimplifyDemandedVectorElts be more aggressive, as we lose track of fewer UNDEF elements than when we go up/down through bitcasts.
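For reference, the per-lane selection performed by an immediate blend such as BLENDPD/BLENDPS/PBLENDW (and modelled by X86ISD::BLENDI) can be sketched in scalar form as below. This is an illustrative model only, not LLVM code, and the `blendi` helper name is made up: bit i of the immediate selects element i from the second source, otherwise from the first.

```cpp
// Scalar model of an immediate blend (illustrative sketch, not LLVM code).
// Bit I of Mask set -> take element I from V2, otherwise from V1.
#include <array>
#include <cstdint>
#include <cstdio>

template <typename T, std::size_t N>
std::array<T, N> blendi(const std::array<T, N> &V1, const std::array<T, N> &V2,
                        uint8_t Mask) {
  std::array<T, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = ((Mask >> I) & 1) ? V2[I] : V1[I];
  return R;
}

int main() {
  std::array<int32_t, 4> A{0, 1, 2, 3}, B{10, 11, 12, 13};
  auto R = blendi(A, B, 0b0101); // expect {10, 1, 12, 3}
  for (int32_t X : R)
    std::printf("%d ", X);
  std::printf("\n");
}
```

Because this selection is identical for the integer and float blend instructions, the execution-domain pass is free to swap between them after isel.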
I've introduced a basic blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) fold; there are more generalizations to do there (e.g. widening/scaling and handling the tricky v16i16 repeated mask case).
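A minimal sketch of the blend-mask scaling such a fold relies on, assuming it mirrors the behaviour of LLVM's scaleVectorShuffleBlendMask helper (the standalone `scaleBlendMask` function and the values below are illustrative only): when a blend of wide elements is re-expressed on narrower elements, each mask bit is repeated once per narrow element it now covers.

```cpp
// Illustrative sketch of blend-mask scaling (assumed to mirror
// scaleVectorShuffleBlendMask): each wide-element mask bit is expanded to
// 'Scale' consecutive bits for the narrower element type.
#include <cstdint>
#include <cstdio>

static uint64_t scaleBlendMask(uint64_t Mask, int NumElts, int Scale) {
  uint64_t ScaledMask = 0;
  for (int I = 0; I != NumElts; ++I)
    if (Mask & (1ull << I))
      ScaledMask |= ((1ull << Scale) - 1) << (I * Scale);
  return ScaledMask;
}

int main() {
  // A v4i64 blend mask 0b0110 becomes the v8i32 mask 0b00111100 (0x3c).
  std::printf("0x%llx\n", (unsigned long long)scaleBlendMask(0b0110, 4, 2));
}
```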
The vector-reduce-smin/smax regressions will be fixed in a future improvement to SimplifyDemandedBits to peek through bitcasts and support X86ISD::BLENDV.
Reapplied after the reversion at rL353699 - the AVX2 isel fix was applied at rL354358, with additional tests at rL354360/rL354361.
Differential Revision: https://reviews.llvm.org/D57888
llvm-svn: 354363
Diffstat (limited to 'llvm/lib/Target')
| File | Lines changed |
|---|---|
| llvm/lib/Target/X86/X86ISelLowering.cpp | 100 |
| llvm/lib/Target/X86/X86InstrSSE.td | 23 |
2 files changed, 63 insertions, 60 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9944e23d553..cf2c83fa1cf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10483,45 +10483,24 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     V2 = getZeroVector(VT, Subtarget, DAG, DL);
 
   switch (VT.SimpleTy) {
-  case MVT::v2f64:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v8f32:
-    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
-                       DAG.getConstant(BlendMask, DL, MVT::i8));
   case MVT::v4i64:
   case MVT::v8i32:
     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
     LLVM_FALLTHROUGH;
+  case MVT::v4f64:
+  case MVT::v8f32:
+    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
+    LLVM_FALLTHROUGH;
+  case MVT::v2f64:
   case MVT::v2i64:
+  case MVT::v4f32:
   case MVT::v4i32:
-    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
-    // that instruction.
-    if (Subtarget.hasAVX2()) {
-      // Scale the blend by the number of 32-bit dwords per element.
-      int Scale = VT.getScalarSizeInBits() / 32;
-      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
-      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getBitcast(BlendVT, V1);
-      V2 = DAG.getBitcast(BlendVT, V2);
-      return DAG.getBitcast(
-          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                          DAG.getConstant(BlendMask, DL, MVT::i8)));
-    }
-    LLVM_FALLTHROUGH;
-  case MVT::v8i16: {
-    // For integer shuffles we need to expand the mask and cast the inputs to
-    // v8i16s prior to blending.
-    int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
-    V1 = DAG.getBitcast(MVT::v8i16, V1);
-    V2 = DAG.getBitcast(MVT::v8i16, V2);
-    return DAG.getBitcast(VT,
-                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
-  }
+  case MVT::v8i16:
+    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, DL, MVT::i8));
   case MVT::v16i16: {
-    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
     SmallVector<int, 8> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10549,10 +10528,11 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     }
     LLVM_FALLTHROUGH;
   }
-  case MVT::v16i8:
-  case MVT::v32i8: {
-    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
-           "256-bit byte-blends require AVX2 support!");
+  case MVT::v32i8:
+    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
+    LLVM_FALLTHROUGH;
+  case MVT::v16i8: {
+    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
 
     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
@@ -31016,34 +30996,11 @@ static bool matchBinaryPermuteShuffle(
         return true;
       }
     } else {
-      // Determine a type compatible with X86ISD::BLENDI.
-      ShuffleVT = MaskVT;
-      if (Subtarget.hasAVX2()) {
-        if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v8i32;
-        else if (ShuffleVT == MVT::v2i64)
-          ShuffleVT = MVT::v4i32;
-      } else {
-        if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
-          ShuffleVT = MVT::v8i16;
-        else if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v4f64;
-        else if (ShuffleVT == MVT::v8i32)
-          ShuffleVT = MVT::v8f32;
-      }
-
-      if (!ShuffleVT.isFloatingPoint()) {
-        int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
-        BlendMask =
-            scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
-        ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
-        ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
-      }
-
       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
       PermuteImm = (unsigned)BlendMask;
       Shuffle = X86ISD::BLENDI;
+      ShuffleVT = MaskVT;
       return true;
     }
   }
@@ -32208,6 +32165,29 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
+  case X86ISD::BLENDI: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+
+    // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
+    // TODO: Handle MVT::v16i16 repeated blend mask.
+    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
+      MVT SrcVT = N0.getOperand(0).getSimpleValueType();
+      if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+          SrcVT.getScalarSizeInBits() >= 32) {
+        unsigned Mask = N.getConstantOperandVal(2);
+        unsigned Size = VT.getVectorNumElements();
+        unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+        unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+        return DAG.getBitcast(
+            VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
+                            N1.getOperand(0),
+                            DAG.getConstant(ScaleMask, DL, MVT::i8)));
+      }
+    }
+    return SDValue();
+  }
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c37f1227438..d661560461a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6507,6 +6507,22 @@ let Predicates = [HasAVX2] in {
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
+// Emulate vXi32/vXi64 blends with vXf32/vXf64.
+// ExecutionDomainFixPass will cleanup domains later on.
+let Predicates = [HasAVX] in {
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), (iPTR imm:$src3)),
+          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), (iPTR imm:$src3)),
+          (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), (iPTR imm:$src3)),
+          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), (iPTR imm:$src3)),
+          (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+}
+
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                                VR128, memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6517,6 +6533,13 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                VR128, memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
+let Predicates = [UseSSE41] in {
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), (iPTR imm:$src3)),
+          (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), (iPTR imm:$src3)),
+          (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+}
+
 // For insertion into the zero index (low half) of a 256-bit vector, it is
 // more efficient to generate a blend with immediate instead of an insert*128.
 let Predicates = [HasAVX] in {
```

