author      Reid Kleckner <rnk@google.com>  2019-02-23 01:19:42 +0000
committer   Reid Kleckner <rnk@google.com>  2019-02-23 01:19:42 +0000
commit      e3876637cff9fd254884942bc874232b3fd33178 (patch)
tree        47766f0d67c60886f6cc7a6425ae266174da7cbf /llvm/lib/Target
parent      f250cf8b4106038ef656b7c40bd2ce1689c8d198 (diff)
Revert r354363 & co "[X86][SSE] Generalize X86ISD::BLENDI support to more value types"
r354363 caused https://crbug.com/934963#c1, which has a plain C reduced test case.

I also had to revert some dependent changes:
- r354648
- r354647
- r354640
- r354511

llvm-svn: 354713
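
The lowering code below repeatedly rewrites a blend immediate whenever the operands are bitcast to a different element width (via scaleVectorShuffleBlendMask). As a minimal standalone sketch of that mask arithmetic, assuming the semantics implied by the call sites (the name scaleBlendMask and the loop itself are illustrative, not the in-tree helper):

    #include <cstdint>

    // Illustrative only: expand a blend immediate so that each original element
    // drives `Scale` consecutive elements of the new blend type
    // (e.g. a v4i64 mask applied as a v8i32 VPBLENDD mask uses Scale = 2).
    static uint64_t scaleBlendMask(uint64_t Mask, int NumElts, int Scale) {
      uint64_t Scaled = 0;
      for (int i = 0; i != NumElts; ++i)
        if (Mask & (1ull << i))
          Scaled |= ((1ull << Scale) - 1) << (i * Scale);
      return Scaled;
    }

    // Example: scaleBlendMask(0b0101, 4, 2) == 0b00110011.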
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 121
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td      |  83
2 files changed, 60 insertions(+), 144 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 15eab9e261c..6ad5503b8a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1871,7 +1871,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -10482,24 +10481,45 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
- case MVT::v4f64:
- case MVT::v8f32:
- assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
- LLVM_FALLTHROUGH;
- case MVT::v2f64:
case MVT::v2i64:
- case MVT::v4f32:
case MVT::v4i32:
- case MVT::v8i16:
- assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget.hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
+ LLVM_FALLTHROUGH;
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
+ }
case MVT::v16i16: {
- assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
+ assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
@@ -10527,11 +10547,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
}
LLVM_FALLTHROUGH;
}
- case MVT::v32i8:
- assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
- LLVM_FALLTHROUGH;
- case MVT::v16i8: {
- assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
+ case MVT::v16i8:
+ case MVT::v32i8: {
+ assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
@@ -31023,11 +31042,34 @@ static bool matchBinaryPermuteShuffle(
return true;
}
} else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
+
+ if (!ShuffleVT.isFloatingPoint()) {
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = MaskVT;
return true;
}
}
@@ -32184,29 +32226,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
- case X86ISD::BLENDI: {
- SDValue N0 = N.getOperand(0);
- SDValue N1 = N.getOperand(1);
-
- // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
- // TODO: Handle MVT::v16i16 repeated blend mask.
- if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
- N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
- MVT SrcVT = N0.getOperand(0).getSimpleValueType();
- if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
- SrcVT.getScalarSizeInBits() >= 32) {
- unsigned Mask = N.getConstantOperandVal(2);
- unsigned Size = VT.getVectorNumElements();
- unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
- unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
- return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
- N1.getOperand(0),
- DAG.getConstant(ScaleMask, DL, MVT::i8)));
- }
- }
- return SDValue();
- }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -42127,25 +42146,6 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG) {
- // Disabling for widening legalization for now. We can enable if we find a
- // case that needs it. Otherwise it can be deleted when we switch to
- // widening legalization.
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- EVT VT = N->getValueType(0);
- SDValue In = N->getOperand(0);
-
- // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (In.getOpcode() == N->getOpcode() &&
- TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
-
- return SDValue();
-}
-
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -42207,7 +42207,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
- case ISD::ANY_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
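
Applying the sketch above to the SSE4.1-only path restored in lowerShuffleAsBlend: a v2i64 blend that takes element 1 from V2 (BlendMask 0b10) is emitted as a v8i16 PBLENDW, with Scale = 8 / 2 = 4 and scaleBlendMask(0b10, 2, 4) == 0xF0. Likewise, on the AVX2 VPBLENDD path a v4i64 mask of 0b0101 becomes the v8i32 mask 0x33 (Scale = 64 / 32 = 2). These worked values follow from the sketch's assumed semantics, not from running the reverted code.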
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e61666781b0..c37f1227438 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6507,40 +6507,6 @@ let Predicates = [HasAVX2] in {
VEX_4V, VEX_L, VEX_WIG;
}
-// Emulate vXi32/vXi64 blends with vXf32/vXf64.
-// ExecutionDomainFixPass will cleanup domains later on.
-let Predicates = [HasAVX] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-}
-
-let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
- (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
- (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memop, f128mem, 1, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>;
@@ -6551,22 +6517,6 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memop, i128mem, 1, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>;
-let Predicates = [UseSSE41] in {
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
- (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
-
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
- (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
-}
-
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -6578,13 +6528,6 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
- (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
-def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
@@ -7838,19 +7781,6 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VPBLENDDYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
- (VPBLENDDYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
let Predicates = [HasAVX1Only] in {
@@ -7870,19 +7800,6 @@ def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
-
-def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
-def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
- (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
//===----------------------------------------------------------------------===//
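
The removed patterns above that match a load in the first X86Blendi operand swap the operands and route the immediate through BlendCommuteImm2/4/8. Presumably those transforms invert the immediate within the element count, since swapping the blend sources flips which operand each lane selects; a minimal sketch under that assumption (commuteBlendImm is an illustrative name, not the TableGen transform itself):

    // Assumed behaviour of the BlendCommuteImmN transforms: commuting the two
    // blend inputs selects the "other" source in every lane, so the immediate
    // is inverted within the low NumElts bits.
    static unsigned commuteBlendImm(unsigned Imm, unsigned NumElts) {
      return Imm ^ ((1u << NumElts) - 1u);
    }

    // Example: (X86Blendi (loadv2i64 addr:$src2), VR128:$src1, 0x1) maps to
    // (BLENDPDrmi VR128:$src1, addr:$src2, commuteBlendImm(0x1, 2) == 0x2).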