-rw-r--r--  llvm/include/llvm/Target/TargetLowering.h     |   6 +
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 103 ++++++++++++++
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h         |  20 +++
-rw-r--r--  llvm/test/CodeGen/X86/split-store.ll          |  62 ++++++++
4 files changed, 191 insertions(+), 0 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 09ceb247cdf..052b704ef29 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -335,6 +335,12 @@ public:
return false;
}
+ /// \brief Return true if it is cheaper to split a store of a merged
+ /// integer value built from a pair of smaller values into multiple stores.
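+ /// For example, a target may find two i32 stores cheaper than
+ /// materializing (or (zext Lo), (shl (zext Hi), 32)) and doing a single
+ /// i64 store, since the shift/or go away and a floating-point input
+ /// avoids a domain switch.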
+ virtual bool isMultiStoresCheaperThanBitsMerge(SDValue Lo, SDValue Hi) const {
+ return false;
+ }
+
/// \brief Return if the target supports combining a
/// chain like:
/// \code
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c277152bc8c..ab47e1ae316 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -374,6 +374,7 @@ namespace {
SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
+ SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
@@ -12200,9 +12201,111 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
return NewSt;
}
+ if (SDValue NewSt = splitMergedValStore(ST))
+ return NewSt;
+
return ReduceLoadOpStoreWidth(N);
}
+/// For the store instruction sequence below, the F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+/// (store (or (zext (bitcast F to i32) to i64),
+/// (shl (zext I to i64), 32)), addr) -->
+/// (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, e.g.:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8}, i32 store --> two i16 stores.
+/// For pair of {i8, i8}, i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns commonly arise from the simple code snippet below
+/// when only std::make_pair(...) is SROA-transformed before being inlined
+/// into hoo:
+/// void goo(const std::pair<int, float> &);
+/// void hoo() {
+/// ...
+/// goo(std::make_pair(tmp, ftmp));
+/// ...
+/// }
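+///
+/// For illustration, after SROA and inlining, the goo call above may
+/// produce IR along the lines of the sketch below (the exact types depend
+/// on the pair members; compare the accompanying test file):
+///   %t0 = bitcast float %ftmp to i32
+///   %t1 = zext i32 %t0 to i64
+///   %t2 = shl nuw i64 %t1, 32
+///   %t3 = zext i32 %tmp to i64
+///   %t4 = or i64 %t2, %t3
+///   store i64 %t4, i64* %ref.tmp, align 8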
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+ if (OptLevel == CodeGenOpt::None)
+ return SDValue();
+
+ SDValue Val = ST->getValue();
+ SDLoc DL(ST);
+
+ // Match OR operand.
+ if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+ return SDValue();
+
+ // Match SHL operand and get Lower and Higher parts of Val.
+ SDValue Op1 = Val.getOperand(0);
+ SDValue Op2 = Val.getOperand(1);
+ SDValue Lo, Hi;
+ if (Op1.getOpcode() != ISD::SHL) {
+ std::swap(Op1, Op2);
+ if (Op1.getOpcode() != ISD::SHL)
+ return SDValue();
+ }
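+ // At this point Op1 is known to be the SHL: its input supplies the high
+ // half of the merged value, and the other OR operand the low half.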
+ Lo = Op2;
+ Hi = Op1.getOperand(0);
+ if (!Op1.hasOneUse())
+ return SDValue();
+
+ // Match shift amount to HalfValBitSize.
+ unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2;
+ ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+ if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+ return SDValue();
+
+ // Lo and Hi must each be zero-extended from an integer type whose size
+ // is no larger than HalfValBitSize.
+ if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+ !Lo.getOperand(0).getValueType().isScalarInteger() ||
+ Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize ||
+ Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+ !Hi.getOperand(0).getValueType().isScalarInteger() ||
+ Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize)
+ return SDValue();
+
+ if (!TLI.isMultiStoresCheaperThanBitsMerge(Lo.getOperand(0),
+ Hi.getOperand(0)))
+ return SDValue();
+
+ // Start to split store.
+ unsigned Alignment = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+ EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ // Lower value store.
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
+ Ptr =
+ DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType()));
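+ // E.g. for an i64 store with alignment 8, the low i32 half keeps
+ // alignment 8 and the high half at offset 4 gets alignment 4, hence
+ // the Alignment / 2 below.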
+ // Higher value store.
+ SDValue St1 =
+ DAG.getStore(Chain, DL, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+ Alignment / 2, MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
+}
+
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d2df8291356..fd312d94736 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -764,6 +764,26 @@ namespace llvm {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
+ bool isMultiStoresCheaperThanBitsMerge(SDValue Lo,
+ SDValue Hi) const override {
+ // If the pair to store is a mixture of float and int values, we save
+ // two bitwise instructions and one float-to-int instruction at the
+ // cost of one extra store. There is potentially a more significant
+ // benefit because it avoids the float->int domain switch for the
+ // input value, so it is more likely a win.
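+ // E.g. for the {i32, float} pairs in the tests, the merged form needs
+ // a float->int move plus a shift and an or feeding one i64 store,
+ // while the split form is just a movl and a movss to adjacent slots.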
+ if (Lo.getOpcode() == ISD::BITCAST || Hi.getOpcode() == ISD::BITCAST) {
+ SDValue Opd = (Lo.getOpcode() == ISD::BITCAST) ? Lo.getOperand(0)
+ : Hi.getOperand(0);
+ if (Opd.getValueType().isFloatingPoint())
+ return true;
+ }
+ // If the pair only contains int values, we save two bitwise
+ // instructions at the cost of one extra store (and one more store
+ // buffer entry). Since the benefit is less clear, we leave such
+ // pairs out until we have a test case proving it is a win.
+ return false;
+ }
+
bool hasAndNotCompare(SDValue Y) const override;
/// Return the value type to use for ISD::SETCC.
diff --git a/llvm/test/CodeGen/X86/split-store.ll b/llvm/test/CodeGen/X86/split-store.ll
new file mode 100644
index 00000000000..707690797b2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/split-store.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: int32_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movl %edi, (%rsi)
+define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i32 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: float_int32_pair
+; CHECK: movl %edi, 4(%rsi)
+; CHECK: movss %xmm0, (%rsi)
+define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp1 to i32
+ %t1 = zext i32 %tmp2 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i32 %t0 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: int16_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzwl %di, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i16 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}
+
+; CHECK-LABEL: int8_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzbl %dil, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+ %t0 = bitcast float %tmp2 to i32
+ %t1 = zext i32 %t0 to i64
+ %t2 = shl nuw i64 %t1, 32
+ %t3 = zext i8 %tmp1 to i64
+ %t4 = or i64 %t2, %t3
+ store i64 %t4, i64* %ref.tmp, align 8
+ ret void
+}