summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorChandler Carruth <chandlerc@gmail.com>2014-09-15 12:40:54 +0000
committerChandler Carruth <chandlerc@gmail.com>2014-09-15 12:40:54 +0000
commit707a2e098dd6b30ebf7d6246360c794296eb0ad9 (patch)
tree2c65e45ef58f991b02a68b32bb4e999d712e5161 /llvm/lib/Target
parent1009df42c0f8089e3f7c13d37a9b1d861710729e (diff)
downloadbcm5719-llvm-707a2e098dd6b30ebf7d6246360c794296eb0ad9.tar.gz
bcm5719-llvm-707a2e098dd6b30ebf7d6246360c794296eb0ad9.zip
[x86] Begin emitting PBLENDW instructions for integer blend operations
when SSE4.1 is available. This removes a ton of domain crossing from blend code paths that were ending up in the floating point code path. This is just the tip of the iceberg though. The real switch is for integer blend lowering to more actively rely on this instruction being available so we don't hit shufps at all any longer. =] That will come in a follow-up patch. Another place where we need better support is for using PBLENDVB when doing so avoids the need to have two complementary PSHUFB masks. llvm-svn: 217767
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp38
1 files changed, 36 insertions, 2 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index da3ec8b35eb..b5744e093d2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7258,8 +7258,27 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
if (Mask[i] >= 0 && Mask[i] != i)
return SDValue(); // Shuffled V1 input!
}
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ if (VT == MVT::v4f32 || VT == MVT::v2f64)
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!");
+
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) &&
+ "Not a supported integer vector type!");
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
@@ -7343,6 +7362,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
@@ -7631,6 +7655,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Blend;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would encur if we
@@ -8289,6 +8318,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Blend;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
OpenPOWER on IntegriCloud