summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorSanjay Patel <spatel@rotateright.com>2015-03-19 22:29:40 +0000
committerSanjay Patel <spatel@rotateright.com>2015-03-19 22:29:40 +0000
commitd5c2d287f98c39968934d4a241a6c1292052e140 (patch)
treece319e077fa28b247f03ed4bf6ef5c570201afe0 /llvm/lib
parentab58a568ee6fdf7aefc20e1309f3a15d1152ee42 (diff)
downloadbcm5719-llvm-d5c2d287f98c39968934d4a241a6c1292052e140.tar.gz
bcm5719-llvm-d5c2d287f98c39968934d4a241a6c1292052e140.zip
[X86, AVX] use blends instead of insert128 with index 0
Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp45
1 files changed, 44 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50c50259334..c270ada8ae9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -160,8 +160,51 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG,SDLoc dl) {
+ SelectionDAG &DAG, SDLoc dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+ // For insertion into the zero index (low half) of a 256-bit vector, it is
+ // more efficient to generate a blend with immediate instead of an insert*128.
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to
+ // extend the subvector to the size of the result vector. Make sure that
+ // we are not recursing on that node by checking for undef here.
+ if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+ Result.getOpcode() != ISD::UNDEF) {
+ EVT ResultVT = Result.getValueType();
+ SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+ SDValue Undef = DAG.getUNDEF(ResultVT);
+ SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+ Vec, ZeroIndex);
+
+ // The blend instruction, and therefore its mask, depend on the data type.
+ MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+ if (ScalarType.isFloatingPoint()) {
+ // Choose either vblendps (float) or vblendpd (double).
+ unsigned ScalarSize = ScalarType.getSizeInBits();
+ assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+ unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+ SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+ }
+
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+ // AVX2 is needed for 256-bit integer blend support.
+ // Integers must be cast to 32-bit because there is only vpblendd;
+ // vpblendw can't be used for this because it has a handicapped mask.
+
+ // If we don't have AVX2, then cast to float. Using a wrong domain blend
+ // is still more efficient than using the wrong domain vinsertf128 that
+ // will be created by InsertSubVector().
+ MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+ SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+ Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+ Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+ return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+ }
+
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
OpenPOWER on IntegriCloud