author | Craig Topper <craig.topper@intel.com> | 2019-04-27 03:38:15 +0000
---|---|---
committer | Craig Topper <craig.topper@intel.com> | 2019-04-27 03:38:15 +0000
commit | 063b471ff7fc5b3d7987f2a09279271344f4e6f7 | (patch)
tree | 4fce604d2f52edc3a188503baf23177be317917b | /llvm/lib
parent | 31cfb311c5cbd94963d21a76be8b7c3bec1419cc | (diff)
[X86] Use MOVQ for i64 atomic_stores when SSE2 is enabled
Summary: If we have SSE2, we can use a MOVQ to store 64 bits and avoid falling back to a cmpxchg8b loop. If it is a seq_cst store, we need to insert an mfence after the store. (An illustrative source-level example follows the commit metadata below.)
Reviewers: spatel, RKSimon, reames, jfb, efriedma
Reviewed By: RKSimon
Subscribers: hiraditya, dexonsmith, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60546
llvm-svn: 359368
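As an illustration only (not part of this patch): a minimal C++ sketch of the kind of 64-bit atomic store this change affects, assuming a 32-bit x86 target with SSE2 (e.g. compiled with -m32 -msse2); the function names are hypothetical. With this change, the relaxed store below can be lowered to a MOVQ store instead of a cmpxchg8b loop, and the seq_cst store additionally gets an mfence after the MOVQ.

// Hypothetical example, not from the patch: i64 atomic stores on 32-bit x86.
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> g{0};

// Relaxed store: with SSE2 this no longer needs a cmpxchg8b loop.
void store_relaxed(uint64_t v) { g.store(v, std::memory_order_relaxed); }

// Sequentially consistent store: the store is followed by an mfence.
void store_seq_cst(uint64_t v) { g.store(v, std::memory_order_seq_cst); }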
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 75
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3
-rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 5
-rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 2
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 6
5 files changed, 72 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 23eec373190..826ff3f9895 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25625,8 +25625,18 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
   return false;
 }
 
+// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
+// TODO: In 32-bit mode, use FISTP when X87 is available?
 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
-  return needsCmpXchgNb(SI->getValueOperand()->getType());
+  Type *MemType = SI->getValueOperand()->getType();
+
+  bool NoImplicitFloatOps =
+      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
+  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+    return false;
+
+  return needsCmpXchgNb(MemType);
 }
 
 // Note: this turns large loads into lock cmpxchg8b/16b.
@@ -26262,28 +26272,54 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                      DAG.getUNDEF(VT), LockOp.getValue(1));
 }
 
-static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
-  SDNode *Node = Op.getNode();
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
   SDLoc dl(Node);
-  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+  EVT VT = Node->getMemoryVT();
+
+  bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+  bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
+
+  // If this store is not sequentially consistent and the type is legal
+  // we can just keep it.
+  if (!IsSeqCst && IsTypeLegal)
+    return Op;
+
+  if (VT == MVT::i64 && !IsTypeLegal) {
+    // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
+    // FIXME: Use movlps with SSE1.
+    // FIXME: Use fist with X87.
+    bool NoImplicitFloatOps =
+        DAG.getMachineFunction().getFunction().hasFnAttribute(
+            Attribute::NoImplicitFloat);
+    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+        Subtarget.hasSSE2()) {
+      SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                     Node->getOperand(2));
+      SDVTList Tys = DAG.getVTList(MVT::Other);
+      SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
+      SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
+                                              Ops, MVT::i64,
+                                              Node->getMemOperand());
+
+      // If this is a sequentially consistent store, also emit an mfence.
+      if (IsSeqCst)
+        Chain = DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain);
+
+      return Chain;
+    }
+  }
 
   // Convert seq_cst store -> xchg
   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
-  // FIXME: On 32-bit, store -> fist or movq would be more efficient
-  //        (The only way to get a 16-byte store is cmpxchg16b)
   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
-  if (cast<AtomicSDNode>(Node)->getOrdering() ==
-          AtomicOrdering::SequentiallyConsistent ||
-      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
-    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
-                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
-                                 Node->getOperand(0),
-                                 Node->getOperand(1), Node->getOperand(2),
-                                 cast<AtomicSDNode>(Node)->getMemOperand());
-    return Swap.getValue(1);
-  }
-  // Other atomic stores have a simple pattern.
-  return Op;
+  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                               Node->getMemoryVT(),
+                               Node->getOperand(0),
+                               Node->getOperand(1), Node->getOperand(2),
+                               Node->getMemOperand());
+  return Swap.getValue(1);
 }
 
 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
@@ -26704,7 +26740,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ATOMIC_LOAD_OR:
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:  return lowerAtomicArith(Op, DAG, Subtarget);
-  case ISD::ATOMIC_STORE:     return LowerATOMIC_STORE(Op, DAG);
+  case ISD::ATOMIC_STORE:     return LowerATOMIC_STORE(Op, DAG, Subtarget);
   case ISD::BITREVERSE:       return LowerBITREVERSE(Op, Subtarget, DAG);
   case ISD::BUILD_VECTOR:     return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:   return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
@@ -27812,6 +27848,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::LAND:               return "X86ISD::LAND";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VEXTRACT_STORE:     return "X86ISD::VEXTRACT_STORE";
   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index ac787f807aa..02f34173339 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -590,6 +590,9 @@ namespace llvm {
       // Load, scalar_to_vector, and zero extend.
       VZEXT_LOAD,
 
+      // extract_vector_elt, store.
+      VEXTRACT_STORE,
+
       // Store FP control world into i16 memory.
       FNSTCW16m,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 420a70ad5a2..78256de22a0 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3893,6 +3893,11 @@ def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
 def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                 (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
 
+let Predicates = [HasAVX512] in {
+  def : Pat<(X86vextractstore (v2i64 VR128X:$src), addr:$dst),
+            (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
+}
+
 // Move Scalar Single to Double Int
 //
 let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 59f35e6e4bf..4d4d5faccdd 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -101,6 +101,8 @@ def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractstore : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def SDTVtrunc    : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                         SDTCisInt<0>, SDTCisInt<1>,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index b7d37920922..814e130dcd7 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4406,12 +4406,18 @@ let Predicates = [UseAVX] in {
                    (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
+
+  def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
 }
 
 let Predicates = [UseSSE2] in {
   def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
             (MOVQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+
+  def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+            (MOVPQI2QImr addr:$dst, VR128:$src)>;
 }
 
 //===---------------------------------------------------------------------===//