| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-06-17 18:20:04 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-06-17 18:20:04 +0000 |
| commit | 835999e48aa05ade2adf86cbe76d78743d90aa66 | |
| tree | c6100a712a98b6329f13bfe8fa07edbfed5db871 /llvm/lib | |
| parent | 5d942d5a95c48526c66ac7843f9b385bdb716b30 | |
[X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)
If an XMM non-temporal store has less than natural alignment, scalarize the vector: with SSE4A we can stay in the XMM domain and use MOVNTSD (f64); otherwise we must move the elements to GPRs and use MOVNTI (i32/i64).
llvm-svn: 363592
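
As a rough user-level illustration of the non-SSE4A fallback described above (this sketch is not part of the patch; the function name and the choice of intrinsics are assumptions): on x86-64 with SSE2, each 64-bit lane can be moved to a GPR and stored with MOVNTI, which is what the scalarized lowering amounts to.

```cpp
#include <emmintrin.h> // SSE2 intrinsics

// Hypothetical sketch (not from the patch): a 16-byte non-temporal store
// through a pointer with only 8-byte alignment. MOVNTPS/MOVNTDQ would
// require 16-byte alignment, so each 64-bit lane is extracted to a GPR
// and stored with MOVNTI instead (x86-64 only).
void nt_store_underaligned(long long *p, __m128i v) {
  _mm_stream_si64(p + 0, _mm_cvtsi128_si64(v));                        // MOVNTI, low lane
  _mm_stream_si64(p + 1, _mm_cvtsi128_si64(_mm_unpackhi_epi64(v, v))); // MOVNTI, high lane
}
```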
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 45 |
1 file changed, 45 insertions, 0 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9b8e5fa2c0..42fcb5e92e9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21110,6 +21110,42 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
 }
 
+/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+                                    SelectionDAG &DAG) {
+  SDValue StoredVal = Store->getValue();
+  assert(StoreVT.is128BitVector() &&
+         StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+  StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+  // Splitting volatile memory ops is not allowed unless the operation was not
+  // legal to begin with. We are assuming the input op is legal (this transform
+  // is only used for targets with AVX).
+  if (Store->isVolatile())
+    return SDValue();
+
+  MVT StoreSVT = StoreVT.getScalarType();
+  unsigned NumElems = StoreVT.getVectorNumElements();
+  unsigned ScalarSize = StoreSVT.getStoreSize();
+  unsigned Alignment = Store->getAlignment();
+
+  SDLoc DL(Store);
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Offset = i * ScalarSize;
+    SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+    SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+                              DAG.getIntPtrConstant(i, DL));
+    SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+                              Store->getPointerInfo().getWithOffset(Offset),
+                              MinAlign(Alignment, Offset),
+                              Store->getMemOperand()->getFlags());
+    Stores.push_back(Ch);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -39640,6 +39676,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
         return SDValue();
       return splitVectorStore(St, DAG);
     }
+
+    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+    // to use MOVNTI.
+    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+      MVT NTVT = Subtarget.hasSSE4A()
+                     ? MVT::v2f64
+                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+      return scalarizeVectorStore(St, NTVT, DAG);
+    }
   }
 
   // Optimize trunc store (of multiple scalars) to shuffle and store.
```
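
Similarly, a hedged sketch of the SSE4A path selected above (NTVT == MVT::v2f64), assuming a toolchain that ships ammintrin.h and is invoked with -msse4a; the function name is again hypothetical. MOVNTSD stores the low f64 lane non-temporally straight from an XMM register, so the data never has to leave the vector domain.

```cpp
#include <ammintrin.h> // SSE4A intrinsics (AMD extension, compile with -msse4a)

// Hypothetical sketch (not from the patch): scalarize the under-aligned
// 16-byte nt-store into two f64 MOVNTSD stores, avoiding the GPR round-trip.
void nt_store_underaligned_sse4a(double *p, __m128d v) {
  _mm_stream_sd(p + 0, v);                     // MOVNTSD, low lane
  _mm_stream_sd(p + 1, _mm_unpackhi_pd(v, v)); // MOVNTSD, high lane
}
```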

