summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
authorSanjay Patel <spatel@rotateright.com>2015-03-20 21:19:52 +0000
committerSanjay Patel <spatel@rotateright.com>2015-03-20 21:19:52 +0000
commitc88f724fedeff64bd333668bdcda9d8d0a50f537 (patch)
treeddac9121533abe574c78f50a62dab68193798988 /llvm/lib/Target/X86/X86ISelLowering.cpp
parent03ad616143062560de3aa1bfe41cae60d25eb548 (diff)
downloadbcm5719-llvm-c88f724fedeff64bd333668bdcda9d8d0a50f537.tar.gz
bcm5719-llvm-c88f724fedeff64bd333668bdcda9d8d0a50f537.zip
[X86] Prefer blendps over insertps codegen for one special case
With this patch, for this one exact case, we'll generate: blendps %xmm0, %xmm1, $1 instead of: insertps %xmm0, %xmm1, $0 If there's a memory operand available for load folding and we're optimizing for size, we'll still generate the insertps. The detailed performance data motivation for this may be found in D7866; in summary, blendps has 2-3x throughput vs. insertps on widely used chips. Differential Revision: http://reviews.llvm.org/D8332 llvm-svn: 232850
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp31
1 files changed, 22 insertions, 9 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ff71da07fc..6906c6bdd69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
OpenPOWER on IntegriCloud