author     Wei Mi <wmi@google.com>  2016-09-02 17:17:04 +0000
committer  Wei Mi <wmi@google.com>  2016-09-02 17:17:04 +0000
commit     c54d1298f52c186cf91b92a7da8b40360478dee0
tree       a7b740df2b6a4d016918ccddcbc276ec71324a26
parent     521f19f2498e867696b803eedfa4e87b53efea09
Split the store of a wide value merged from an int-fp pair into multiple stores.
For the store of a wide value merged from a pair of values, especially an int-fp pair, it is sometimes more efficient to split it into separate narrow stores, which can remove the bitwise instructions or sink them to colder places. For now the feature is only enabled on the x86 target, and only the store of an int-fp pair is split. The scope may be extended to other cases in the future, given performance evidence.

Differential Revision: https://reviews.llvm.org/D22840

llvm-svn: 280505
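For illustration, a minimal C++ sketch (the goo/hoo names follow the example in the comment added to DAGCombiner.cpp below) of the kind of source that tends to produce the merged i64 store this change splits:

#include <utility>

void goo(const std::pair<int, float> &);

void hoo(int tmp, float ftmp) {
  // After SROA of std::make_pair and inlining, the int and float members are
  // typically combined with bitcast/zext/shl/or into one i64 and stored into
  // the temporary pair object; with this patch the DAG combiner can emit an
  // i32 store and a float store instead.
  goo(std::make_pair(tmp, ftmp));
}

With the split stores, the float half can be written directly from an XMM register (e.g. with movss), avoiding the float-to-int move, as the new test cases below check.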
-rw-r--r--  llvm/include/llvm/Target/TargetLowering.h        6
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  103
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h           20
-rw-r--r--  llvm/test/CodeGen/X86/split-store.ll            62
4 files changed, 191 insertions, 0 deletions
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 09ceb247cdf..052b704ef29 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -335,6 +335,12 @@ public:
return false;
}
+ /// \brief Return true if it is cheaper to split the store of a merged integer
+ /// value computed from a pair of smaller values into multiple stores.
+ virtual bool isMultiStoresCheaperThanBitsMerge(SDValue Lo, SDValue Hi) const {
+ return false;
+ }
+
/// \brief Return if the target supports combining a
/// chain like:
/// \code
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c277152bc8c..ab47e1ae316 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -374,6 +374,7 @@ namespace {
SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
+ SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
@@ -12200,9 +12201,111 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
return NewSt;
}
+ if (SDValue NewSt = splitMergedValStore(ST))
+ return NewSt;
+
return ReduceLoadOpStoreWidth(N);
}
+/// For the store instruction sequence below, the F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+/// (store (or (zext (bitcast F to i32) to i64),
+/// (shl (zext I to i64), 32)), addr) -->
+/// (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, for example:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8}, i32 store --> two i16 stores.
+/// For pair of {i8, i8}, i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// This store pattern is commonly seen in code like the simple snippet below
+/// when std::make_pair(...) is SROA-transformed before being inlined into hoo.
+/// void goo(const std::pair<int, float> &);
+/// hoo() {
+/// ...
+/// goo(std::make_pair(tmp, ftmp));
+/// ...
+/// }
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+ if (OptLevel == CodeGenOpt::None)
+ return SDValue();
+
+ SDValue Val = ST->getValue();
+ SDLoc DL(ST);
+
+ // Match OR operand.
+ if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+ return SDValue();
+
+ // Match the SHL operand and get the Lo and Hi parts of Val.
+ SDValue Op1 = Val.getOperand(0);
+ SDValue Op2 = Val.getOperand(1);
+ SDValue Lo, Hi;
+ if (Op1.getOpcode() != ISD::SHL) {
+ std::swap(Op1, Op2);
+ if (Op1.getOpcode() != ISD::SHL)
+ return SDValue();
+ }
+ Lo = Op2;
+ Hi = Op1.getOperand(0);
+ if (!Op1.hasOneUse())
+ return SDValue();
+
+ // Match shift amount to HalfValBitSize.
+ unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2;
+ ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+ if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+ return SDValue();
+
+ // Lo and Hi must be zero-extended from integers with size less than or
+ // equal to HalfValBitSize.
+ if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+ !Lo.getOperand(0).getValueType().isScalarInteger() ||
+ Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize ||
+ Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+ !Hi.getOperand(0).getValueType().isScalarInteger() ||
+ Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize)
+ return SDValue();
+
+ if (!TLI.isMultiStoresCheaperThanBitsMerge(Lo.getOperand(0),
+ Hi.getOperand(0)))
+ return SDValue();
+
+ // Start to split store.
+ unsigned Alignment = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+ EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ // Lower value store.
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ ST->getAlignment(), MMOFlags, AAInfo);
+ Ptr =
+ DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType()));
+ // Higher value store.
+ SDValue St1 =
+ DAG.getStore(Chain, DL, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+ Alignment / 2, MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
+}
+
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d2df8291356..fd312d94736 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -764,6 +764,26 @@ namespace llvm {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
+ bool isMultiStoresCheaperThanBitsMerge(SDValue Lo,
+ SDValue Hi) const override {
+ // If the pair to store is a mixture of float and int values, we will
+ // save two bitwise instructions and one float-to-int instruction and
+ // add one extra store instruction. There is potentially a more
+ // significant benefit because it avoids the float->int domain switch
+ // for the input value, so it is likely a win.
+ if (Lo.getOpcode() == ISD::BITCAST || Hi.getOpcode() == ISD::BITCAST) {
+ SDValue Opd = (Lo.getOpcode() == ISD::BITCAST) ? Lo.getOperand(0)
+ : Hi.getOperand(0);
+ if (Opd.getValueType().isFloatingPoint())
+ return true;
+ }
+ // If the pair only contains int values, we will save two bitwise
+ // instructions and add one extra store instruction (costing one more
+ // store buffer entry). Since the benefit is less clear, we leave such
+ // pairs out until we have a test case proving it is a win.
+ return false;
+ }
+
bool hasAndNotCompare(SDValue Y) const override;
/// Return the value type to use for ISD::SETCC.
diff --git a/llvm/test/CodeGen/X86/split-store.ll b/llvm/test/CodeGen/X86/split-store.ll
new file mode 100644
index 00000000000..707690797b2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/split-store.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: int32_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movl %edi, (%rsi)
+define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i32 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: float_int32_pair
+; CHECK: movl %edi, 4(%rsi)
+; CHECK: movss %xmm0, (%rsi)
+define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp1 to i32
+ %t1 = zext i32 %tmp2 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i32 %t0 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: int16_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzwl %di, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i16 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: int8_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzbl %dil, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i8 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}