path: root/llvm/lib
author     Filipe Cabecinhas <me@filcab.net>   2014-05-19 19:45:57 +0000
committer  Filipe Cabecinhas <me@filcab.net>   2014-05-19 19:45:57 +0000
commit     dc9210276637bf67091ff8885f0c2c0e08fadbcb (patch)
tree       84c9e171289e944f35f147ef6666ebeb2f76e963 /llvm/lib
parent     9f7d14756df13b907e29b61160b1cb6981013bb3 (diff)
Added more insertps optimizations
Summary:
When inserting an element that's coming from a vector load or a broadcast
of a vector (or scalar) load, combine the load into the insertps
instruction. Added PerformINSERTPSCombine for the case where we need to
fix the load (load of a vector + insertps with a non-zero CountS).
Added patterns for the broadcasts.

Also added tests for SSE4.1, AVX, and AVX2.

Reviewers: delena, nadav, craig.topper

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D3581

llvm-svn: 209156
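An illustrative sketch of the pattern this enables (hypothetical example, not taken from the commit's tests; the function name and the 0x90 immediate are made up for illustration). Before this change the vector load feeding the insert had to be materialized in a register; with the combine, the backend can narrow it to the selected f32 element and fold it into insertps as a memory operand:

    #include <immintrin.h>

    // Hypothetical SSE4.1 example: insert element 2 of a loaded vector into
    // lane 1 of dst.  Immediate 0x90 encodes countS = 2, countD = 1,
    // zmask = 0.  The combine narrows the load of *p to the single f32 that
    // countS selects and folds it into the insertps memory form.
    __m128 insert_from_memory(__m128 dst, const float *p) {
      __m128 v = _mm_loadu_ps(p);          // vector load feeding the insert
      return _mm_insert_ps(dst, v, 0x90);  // can fold to insertps xmm, m32
    }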
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  60
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td       23
2 files changed, 72 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e1db618baf1..cfdaf0f8e3a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7412,6 +7412,23 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+ SelectionDAG &DAG) {
+ SDLoc dl(Load);
+ MVT VT = Load->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue Addr = Load->getOperand(1);
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+ return NewLoad;
+}
+
// It is only safe to call this function if isINSERTPSMask is true for
// this shufflevector mask.
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
@@ -7423,7 +7440,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
// If we're transferring an i32 from memory to a specific element in a
// register, we output a generic DAG that will match the PINSRD
// instruction.
- // TODO: Optimize for AVX cases too (VINSERTPS)
MVT VT = SVOp->getSimpleValueType(0);
MVT EVT = VT.getVectorElementType();
SDValue V1 = SVOp->getOperand(0);
@@ -7456,17 +7472,10 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
// Trivial case, when From comes from a load and is only used by the
// shuffle. Make it use insertps from the vector that we need from that
// load.
- SDValue Addr = From.getOperand(1);
- SDValue NewAddr =
- DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
- DAG.getConstant(DestIndex * EVT.getStoreSize(),
- Addr.getSimpleValueType()));
-
- LoadSDNode *Load = cast<LoadSDNode>(From);
SDValue NewLoad =
- DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
- DAG.getMachineFunction().getMachineMemOperand(
- Load->getMemOperand(), 0, EVT.getStoreSize()));
+ NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+ if (!NewLoad.getNode())
+ return SDValue();
if (EVT == MVT::f32) {
// Create this as a scalar to vector to match the instruction pattern.
@@ -20281,6 +20290,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc dl(N);
+ MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+ SDValue Ld = N->getOperand(1);
+ if (MayFoldLoad(Ld)) {
+ // Extract the countS bits from the immediate so we can get the proper
+ // address when narrowing the vector load to a specific element.
+  // When the second source op is a memory address, insertps doesn't use
+ // countS and just gets an f32 from that address.
+ unsigned DestIndex =
+ cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+ Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+ } else
+ return SDValue();
+
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+}
+
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
@@ -20584,6 +20620,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS:
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
}
return SDValue();
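A rough standalone sketch of the address arithmetic behind PerformINSERTPSCombine and NarrowVectorLoadToElement (the helper below is hypothetical, just to spell out the math): countS lives in bits 7:6 of the insertps immediate, and each f32 element of a v4f32 is 4 bytes, so the narrowed load reads from the original address plus countS * 4.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper: byte offset of the element that countS selects,
    // i.e. the amount added to the base address when the v4f32 load is
    // narrowed to a single f32 load.
    static std::size_t insertpsSourceByteOffset(std::uint8_t imm) {
      unsigned countS = (imm >> 6) & 0x3;  // bits 7:6 of the immediate
      return countS * sizeof(float);       // f32 elements are 4 bytes apart
    }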
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ac28d07cc68..1eb04851b72 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6550,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
+let Predicates = [UseSSE41] in {
+ // If we're inserting an element from a load or a null pshuf of a load,
+ // fold the load into the insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+ (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+ imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+ (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+ (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+ // If we're inserting an element from a vbroadcast of a load, fold the
+ // load into the X86insertps instruction.
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+ (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+ (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
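For the new UseAVX patterns above, an illustrative intrinsics example of the broadcast case (hypothetical, not from the commit's tests): when a vbroadcast of a scalar load feeds insertps, every lane holds the same value, so the load can be folded into a single vinsertps memory operand regardless of countS.

    #include <immintrin.h>

    // Hypothetical AVX example: all lanes of b are equal, so the insert can
    // read the f32 straight from memory.  Immediate 0x10 encodes countS = 0,
    // countD = 1, zmask = 0.
    __m128 insert_broadcast(__m128 dst, const float *p) {
      __m128 b = _mm_broadcast_ss(p);      // AVX broadcast of a scalar load
      return _mm_insert_ps(dst, b, 0x10);  // can fold to vinsertps xmm, m32
    }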